1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. All rights reserved. 5 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 6 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * * Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * * Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * * Neither the name of Intel Corporation nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 26 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 #include "spdk/stdinc.h" 36 37 #include "spdk/bdev.h" 38 39 #include "spdk/config.h" 40 #include "spdk/env.h" 41 #include "spdk/thread.h" 42 #include "spdk/likely.h" 43 #include "spdk/queue.h" 44 #include "spdk/nvme_spec.h" 45 #include "spdk/scsi_spec.h" 46 #include "spdk/notify.h" 47 #include "spdk/util.h" 48 #include "spdk/trace.h" 49 #include "spdk/dma.h" 50 51 #include "spdk/bdev_module.h" 52 #include "spdk/log.h" 53 #include "spdk/string.h" 54 55 #include "bdev_internal.h" 56 #include "spdk_internal/trace_defs.h" 57 58 #ifdef SPDK_CONFIG_VTUNE 59 #include "ittnotify.h" 60 #include "ittnotify_types.h" 61 int __itt_init_ittlib(const char *, __itt_group_id); 62 #endif 63 64 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 65 #define SPDK_BDEV_IO_CACHE_SIZE 256 66 #define SPDK_BDEV_AUTO_EXAMINE true 67 #define BUF_SMALL_POOL_SIZE 8191 68 #define BUF_LARGE_POOL_SIZE 1023 69 #define NOMEM_THRESHOLD_COUNT 8 70 71 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 72 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 73 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 74 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 75 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 76 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 77 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 78 79 #define SPDK_BDEV_POOL_ALIGNMENT 512 80 81 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 82 * when splitting into children requests at a time. 
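 * As an illustrative example (numbers are hypothetical, not from this file):
 * if a bdev advertises a 1 GiB per-child unmap limit, an 8 GiB unmap is split
 * into 8 children, and at most SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS
 * of those children are kept outstanding at once; the next child is submitted
 * as an earlier one completes.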
83 */ 84 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 85 86 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 87 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 88 }; 89 90 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 91 92 RB_HEAD(bdev_name_tree, spdk_bdev_name); 93 94 static int 95 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 96 { 97 return strcmp(name1->name, name2->name); 98 } 99 100 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 101 102 struct spdk_bdev_mgr { 103 struct spdk_mempool *bdev_io_pool; 104 105 struct spdk_mempool *buf_small_pool; 106 struct spdk_mempool *buf_large_pool; 107 108 void *zero_buffer; 109 110 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 111 112 struct spdk_bdev_list bdevs; 113 struct bdev_name_tree bdev_names; 114 115 bool init_complete; 116 bool module_init_complete; 117 118 pthread_mutex_t mutex; 119 120 #ifdef SPDK_CONFIG_VTUNE 121 __itt_domain *domain; 122 #endif 123 }; 124 125 static struct spdk_bdev_mgr g_bdev_mgr = { 126 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 127 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 128 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 129 .init_complete = false, 130 .module_init_complete = false, 131 .mutex = PTHREAD_MUTEX_INITIALIZER, 132 }; 133 134 typedef void (*lock_range_cb)(void *ctx, int status); 135 136 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 137 138 struct lba_range { 139 uint64_t offset; 140 uint64_t length; 141 void *locked_ctx; 142 struct spdk_bdev_channel *owner_ch; 143 TAILQ_ENTRY(lba_range) tailq; 144 }; 145 146 static struct spdk_bdev_opts g_bdev_opts = { 147 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 148 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 149 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 150 .small_buf_pool_size = BUF_SMALL_POOL_SIZE, 151 .large_buf_pool_size = BUF_LARGE_POOL_SIZE, 152 }; 153 154 static spdk_bdev_init_cb g_init_cb_fn = NULL; 155 static void *g_init_cb_arg = NULL; 156 157 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 158 static void *g_fini_cb_arg = NULL; 159 static struct spdk_thread *g_fini_thread = NULL; 160 161 struct spdk_bdev_qos_limit { 162 /** IOs or bytes allowed per second (i.e., 1s). */ 163 uint64_t limit; 164 165 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 166 * For remaining bytes, allowed to run negative if an I/O is submitted when 167 * some bytes are remaining, but the I/O is bigger than that amount. The 168 * excess will be deducted from the next timeslice. 169 */ 170 int64_t remaining_this_timeslice; 171 172 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 173 uint32_t min_per_timeslice; 174 175 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 176 uint32_t max_per_timeslice; 177 178 /** Function to check whether to queue the IO. */ 179 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 180 181 /** Function to update for the submitted IO. */ 182 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 183 }; 184 185 struct spdk_bdev_qos { 186 /** Types of structure of rate limits. */ 187 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 188 189 /** The channel that all I/O are funneled through. */ 190 struct spdk_bdev_channel *ch; 191 192 /** The thread on which the poller is running. 
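 * All I/O to a QoS-enabled bdev is funneled to this thread via the channel
 * above. As a rough illustration of the timeslice scheme: with
 * SPDK_BDEV_QOS_TIMESLICE_IN_USEC == 1000 there are ~1000 timeslices per
 * second, so an rw_ios_per_sec limit of 10000 allows on the order of 10 I/O
 * per timeslice; once remaining_this_timeslice is exhausted, further I/O is
 * queued until the poller starts the next timeslice.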
*/ 193 struct spdk_thread *thread; 194 195 /** Queue of I/O waiting to be issued. */ 196 bdev_io_tailq_t queued; 197 198 /** Size of a timeslice in tsc ticks. */ 199 uint64_t timeslice_size; 200 201 /** Timestamp of start of last timeslice. */ 202 uint64_t last_timeslice; 203 204 /** Poller that processes queued I/O commands each time slice. */ 205 struct spdk_poller *poller; 206 }; 207 208 struct spdk_bdev_mgmt_channel { 209 bdev_io_stailq_t need_buf_small; 210 bdev_io_stailq_t need_buf_large; 211 212 /* 213 * Each thread keeps a cache of bdev_io - this allows 214 * bdev threads which are *not* DPDK threads to still 215 * benefit from a per-thread bdev_io cache. Without 216 * this, non-DPDK threads fetching from the mempool 217 * incur a cmpxchg on get and put. 218 */ 219 bdev_io_stailq_t per_thread_cache; 220 uint32_t per_thread_cache_count; 221 uint32_t bdev_io_cache_size; 222 223 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 224 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 225 }; 226 227 /* 228 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 229 * will queue here their IO that awaits retry. It makes it possible to retry sending 230 * IO to one bdev after IO from other bdev completes. 231 */ 232 struct spdk_bdev_shared_resource { 233 /* The bdev management channel */ 234 struct spdk_bdev_mgmt_channel *mgmt_ch; 235 236 /* 237 * Count of I/O submitted to bdev module and waiting for completion. 238 * Incremented before submit_request() is called on an spdk_bdev_io. 239 */ 240 uint64_t io_outstanding; 241 242 /* 243 * Queue of IO awaiting retry because of a previous NOMEM status returned 244 * on this channel. 245 */ 246 bdev_io_tailq_t nomem_io; 247 248 /* 249 * Threshold which io_outstanding must drop to before retrying nomem_io. 250 */ 251 uint64_t nomem_threshold; 252 253 /* I/O channel allocated by a bdev module */ 254 struct spdk_io_channel *shared_ch; 255 256 /* Refcount of bdev channels using this resource */ 257 uint32_t ref; 258 259 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 260 }; 261 262 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 263 #define BDEV_CH_QOS_ENABLED (1 << 1) 264 265 struct spdk_bdev_channel { 266 struct spdk_bdev *bdev; 267 268 /* The channel for the underlying device */ 269 struct spdk_io_channel *channel; 270 271 /* Per io_device per thread data */ 272 struct spdk_bdev_shared_resource *shared_resource; 273 274 struct spdk_bdev_io_stat stat; 275 276 /* 277 * Count of I/O submitted to the underlying dev module through this channel 278 * and waiting for completion. 279 */ 280 uint64_t io_outstanding; 281 282 /* 283 * List of all submitted I/Os including I/O that are generated via splitting. 284 */ 285 bdev_io_tailq_t io_submitted; 286 287 /* 288 * List of spdk_bdev_io that are currently queued because they write to a locked 289 * LBA range. 
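 * Ranges are taken with bdev_lock_lba_range() and released with
 * bdev_unlock_lba_range() (declared later in this file); a write that
 * overlaps a currently locked range is parked here and resubmitted once
 * the range is unlocked.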
290 */ 291 bdev_io_tailq_t io_locked; 292 293 uint32_t flags; 294 295 struct spdk_histogram_data *histogram; 296 297 #ifdef SPDK_CONFIG_VTUNE 298 uint64_t start_tsc; 299 uint64_t interval_tsc; 300 __itt_string_handle *handle; 301 struct spdk_bdev_io_stat prev_stat; 302 #endif 303 304 bdev_io_tailq_t queued_resets; 305 306 lba_range_tailq_t locked_ranges; 307 }; 308 309 struct media_event_entry { 310 struct spdk_bdev_media_event event; 311 TAILQ_ENTRY(media_event_entry) tailq; 312 }; 313 314 #define MEDIA_EVENT_POOL_SIZE 64 315 316 struct spdk_bdev_desc { 317 struct spdk_bdev *bdev; 318 struct spdk_thread *thread; 319 struct { 320 spdk_bdev_event_cb_t event_fn; 321 void *ctx; 322 } callback; 323 bool closed; 324 bool write; 325 bool memory_domains_supported; 326 pthread_mutex_t mutex; 327 uint32_t refs; 328 TAILQ_HEAD(, media_event_entry) pending_media_events; 329 TAILQ_HEAD(, media_event_entry) free_media_events; 330 struct media_event_entry *media_events_buffer; 331 TAILQ_ENTRY(spdk_bdev_desc) link; 332 333 uint64_t timeout_in_sec; 334 spdk_bdev_io_timeout_cb cb_fn; 335 void *cb_arg; 336 struct spdk_poller *io_timeout_poller; 337 }; 338 339 struct spdk_bdev_iostat_ctx { 340 struct spdk_bdev_io_stat *stat; 341 spdk_bdev_get_device_stat_cb cb; 342 void *cb_arg; 343 }; 344 345 struct set_qos_limit_ctx { 346 void (*cb_fn)(void *cb_arg, int status); 347 void *cb_arg; 348 struct spdk_bdev *bdev; 349 }; 350 351 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 352 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 353 354 static inline void bdev_io_complete(void *ctx); 355 356 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 357 static void bdev_write_zero_buffer_next(void *_bdev_io); 358 359 static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i); 360 static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status); 361 362 static int 363 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 364 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 365 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 366 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 367 static int 368 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 369 struct iovec *iov, int iovcnt, void *md_buf, 370 uint64_t offset_blocks, uint64_t num_blocks, 371 spdk_bdev_io_completion_cb cb, void *cb_arg, 372 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 373 374 static int 375 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 376 uint64_t offset, uint64_t length, 377 lock_range_cb cb_fn, void *cb_arg); 378 379 static int 380 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 381 uint64_t offset, uint64_t length, 382 lock_range_cb cb_fn, void *cb_arg); 383 384 static inline void bdev_io_complete(void *ctx); 385 386 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 387 static bool bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort); 388 389 void 390 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 391 { 392 if (!opts) { 393 SPDK_ERRLOG("opts should not be NULL\n"); 394 return; 395 } 396 397 if (!opts_size) { 398 SPDK_ERRLOG("opts_size should not be zero value\n"); 399 return; 400 } 401 402 opts->opts_size = opts_size; 403 404 #define SET_FIELD(field) \ 405 if (offsetof(struct spdk_bdev_opts, 
field) + sizeof(opts->field) <= opts_size) { \ 406 opts->field = g_bdev_opts.field; \ 407 } \ 408 409 SET_FIELD(bdev_io_pool_size); 410 SET_FIELD(bdev_io_cache_size); 411 SET_FIELD(bdev_auto_examine); 412 SET_FIELD(small_buf_pool_size); 413 SET_FIELD(large_buf_pool_size); 414 415 /* Do not remove this statement, you should always update this statement when you adding a new field, 416 * and do not forget to add the SET_FIELD statement for your added field. */ 417 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 418 419 #undef SET_FIELD 420 } 421 422 int 423 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 424 { 425 uint32_t min_pool_size; 426 427 if (!opts) { 428 SPDK_ERRLOG("opts cannot be NULL\n"); 429 return -1; 430 } 431 432 if (!opts->opts_size) { 433 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 434 return -1; 435 } 436 437 /* 438 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 439 * initialization. A second mgmt_ch will be created on the same thread when the application starts 440 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 441 */ 442 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 443 if (opts->bdev_io_pool_size < min_pool_size) { 444 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 445 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 446 spdk_thread_get_count()); 447 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 448 return -1; 449 } 450 451 if (opts->small_buf_pool_size < BUF_SMALL_POOL_SIZE) { 452 SPDK_ERRLOG("small_buf_pool_size must be at least %" PRIu32 "\n", BUF_SMALL_POOL_SIZE); 453 return -1; 454 } 455 456 if (opts->large_buf_pool_size < BUF_LARGE_POOL_SIZE) { 457 SPDK_ERRLOG("large_buf_pool_size must be at least %" PRIu32 "\n", BUF_LARGE_POOL_SIZE); 458 return -1; 459 } 460 461 #define SET_FIELD(field) \ 462 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 463 g_bdev_opts.field = opts->field; \ 464 } \ 465 466 SET_FIELD(bdev_io_pool_size); 467 SET_FIELD(bdev_io_cache_size); 468 SET_FIELD(bdev_auto_examine); 469 SET_FIELD(small_buf_pool_size); 470 SET_FIELD(large_buf_pool_size); 471 472 g_bdev_opts.opts_size = opts->opts_size; 473 474 #undef SET_FIELD 475 476 return 0; 477 } 478 479 static struct spdk_bdev * 480 bdev_get_by_name(const char *bdev_name) 481 { 482 struct spdk_bdev_name find; 483 struct spdk_bdev_name *res; 484 485 find.name = (char *)bdev_name; 486 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 487 if (res != NULL) { 488 return res->bdev; 489 } 490 491 return NULL; 492 } 493 494 struct spdk_bdev * 495 spdk_bdev_get_by_name(const char *bdev_name) 496 { 497 struct spdk_bdev *bdev; 498 499 pthread_mutex_lock(&g_bdev_mgr.mutex); 500 bdev = bdev_get_by_name(bdev_name); 501 pthread_mutex_unlock(&g_bdev_mgr.mutex); 502 503 return bdev; 504 } 505 506 struct spdk_bdev_wait_for_examine_ctx { 507 struct spdk_poller *poller; 508 spdk_bdev_wait_for_examine_cb cb_fn; 509 void *cb_arg; 510 }; 511 512 static bool 513 bdev_module_all_actions_completed(void); 514 515 static int 516 bdev_wait_for_examine_cb(void *arg) 517 { 518 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 519 520 if (!bdev_module_all_actions_completed()) { 521 return SPDK_POLLER_IDLE; 522 } 523 524 spdk_poller_unregister(&ctx->poller); 525 ctx->cb_fn(ctx->cb_arg); 526 free(ctx); 527 528 return 
SPDK_POLLER_BUSY; 529 } 530 531 int 532 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 533 { 534 struct spdk_bdev_wait_for_examine_ctx *ctx; 535 536 ctx = calloc(1, sizeof(*ctx)); 537 if (ctx == NULL) { 538 return -ENOMEM; 539 } 540 ctx->cb_fn = cb_fn; 541 ctx->cb_arg = cb_arg; 542 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 543 544 return 0; 545 } 546 547 struct spdk_bdev_examine_item { 548 char *name; 549 TAILQ_ENTRY(spdk_bdev_examine_item) link; 550 }; 551 552 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 553 554 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 555 g_bdev_examine_allowlist); 556 557 static inline bool 558 bdev_examine_allowlist_check(const char *name) 559 { 560 struct spdk_bdev_examine_item *item; 561 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 562 if (strcmp(name, item->name) == 0) { 563 return true; 564 } 565 } 566 return false; 567 } 568 569 static inline void 570 bdev_examine_allowlist_free(void) 571 { 572 struct spdk_bdev_examine_item *item; 573 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 574 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 575 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 576 free(item->name); 577 free(item); 578 } 579 } 580 581 static inline bool 582 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 583 { 584 struct spdk_bdev_alias *tmp; 585 if (bdev_examine_allowlist_check(bdev->name)) { 586 return true; 587 } 588 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 589 if (bdev_examine_allowlist_check(tmp->alias.name)) { 590 return true; 591 } 592 } 593 return false; 594 } 595 596 static inline bool 597 bdev_ok_to_examine(struct spdk_bdev *bdev) 598 { 599 if (g_bdev_opts.bdev_auto_examine) { 600 return true; 601 } else { 602 return bdev_in_examine_allowlist(bdev); 603 } 604 } 605 606 static void 607 bdev_examine(struct spdk_bdev *bdev) 608 { 609 struct spdk_bdev_module *module; 610 uint32_t action; 611 612 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 613 if (module->examine_config && bdev_ok_to_examine(bdev)) { 614 action = module->internal.action_in_progress; 615 module->internal.action_in_progress++; 616 module->examine_config(bdev); 617 if (action != module->internal.action_in_progress) { 618 SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", 619 module->name); 620 } 621 } 622 } 623 624 if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) { 625 if (bdev->internal.claim_module->examine_disk) { 626 bdev->internal.claim_module->internal.action_in_progress++; 627 bdev->internal.claim_module->examine_disk(bdev); 628 } 629 return; 630 } 631 632 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 633 if (module->examine_disk && bdev_ok_to_examine(bdev)) { 634 module->internal.action_in_progress++; 635 module->examine_disk(bdev); 636 } 637 } 638 } 639 640 int 641 spdk_bdev_examine(const char *name) 642 { 643 struct spdk_bdev *bdev; 644 struct spdk_bdev_examine_item *item; 645 646 if (g_bdev_opts.bdev_auto_examine) { 647 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 648 return -EINVAL; 649 } 650 651 if (bdev_examine_allowlist_check(name)) { 652 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 653 return -EEXIST; 654 } 655 656 item = calloc(1, sizeof(*item)); 657 if (!item) { 658 return -ENOMEM; 659 } 660 item->name = strdup(name); 661 if (!item->name) { 662 free(item); 663 return -ENOMEM; 664 } 665 
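	/* Illustrative usage sketch (the callback and bdev names below are
	 * hypothetical): manual examine is only permitted when auto examine
	 * has been disabled via the options API above, e.g.
	 *
	 *   struct spdk_bdev_opts opts = {};
	 *
	 *   spdk_bdev_get_opts(&opts, sizeof(opts));
	 *   opts.bdev_auto_examine = false;
	 *   spdk_bdev_set_opts(&opts);
	 *   ...
	 *   spdk_bdev_examine("Nvme0n1");
	 *   spdk_bdev_wait_for_examine(examine_done_cb, cb_ctx);
	 *
	 * The name is remembered on g_bdev_examine_allowlist below, so the bdev
	 * is also examined if it is registered only after this call.
	 */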
TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 666 667 bdev = spdk_bdev_get_by_name(name); 668 if (bdev) { 669 bdev_examine(bdev); 670 } 671 return 0; 672 } 673 674 static inline void 675 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 676 { 677 struct spdk_bdev_examine_item *item; 678 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 679 spdk_json_write_object_begin(w); 680 spdk_json_write_named_string(w, "method", "bdev_examine"); 681 spdk_json_write_named_object_begin(w, "params"); 682 spdk_json_write_named_string(w, "name", item->name); 683 spdk_json_write_object_end(w); 684 spdk_json_write_object_end(w); 685 } 686 } 687 688 struct spdk_bdev * 689 spdk_bdev_first(void) 690 { 691 struct spdk_bdev *bdev; 692 693 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 694 if (bdev) { 695 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 696 } 697 698 return bdev; 699 } 700 701 struct spdk_bdev * 702 spdk_bdev_next(struct spdk_bdev *prev) 703 { 704 struct spdk_bdev *bdev; 705 706 bdev = TAILQ_NEXT(prev, internal.link); 707 if (bdev) { 708 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 709 } 710 711 return bdev; 712 } 713 714 static struct spdk_bdev * 715 _bdev_next_leaf(struct spdk_bdev *bdev) 716 { 717 while (bdev != NULL) { 718 if (bdev->internal.claim_module == NULL) { 719 return bdev; 720 } else { 721 bdev = TAILQ_NEXT(bdev, internal.link); 722 } 723 } 724 725 return bdev; 726 } 727 728 struct spdk_bdev * 729 spdk_bdev_first_leaf(void) 730 { 731 struct spdk_bdev *bdev; 732 733 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 734 735 if (bdev) { 736 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 737 } 738 739 return bdev; 740 } 741 742 struct spdk_bdev * 743 spdk_bdev_next_leaf(struct spdk_bdev *prev) 744 { 745 struct spdk_bdev *bdev; 746 747 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 748 749 if (bdev) { 750 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 751 } 752 753 return bdev; 754 } 755 756 static inline bool 757 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 758 { 759 return bdev_io->internal.ext_opts && bdev_io->internal.ext_opts->memory_domain; 760 } 761 762 void 763 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 764 { 765 struct iovec *iovs; 766 767 if (bdev_io->u.bdev.iovs == NULL) { 768 bdev_io->u.bdev.iovs = &bdev_io->iov; 769 bdev_io->u.bdev.iovcnt = 1; 770 } 771 772 iovs = bdev_io->u.bdev.iovs; 773 774 assert(iovs != NULL); 775 assert(bdev_io->u.bdev.iovcnt >= 1); 776 777 iovs[0].iov_base = buf; 778 iovs[0].iov_len = len; 779 } 780 781 void 782 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 783 { 784 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 785 bdev_io->u.bdev.md_buf = md_buf; 786 } 787 788 static bool 789 _is_buf_allocated(const struct iovec *iovs) 790 { 791 if (iovs == NULL) { 792 return false; 793 } 794 795 return iovs[0].iov_base != NULL; 796 } 797 798 static bool 799 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 800 { 801 int i; 802 uintptr_t iov_base; 803 804 if (spdk_likely(alignment == 1)) { 805 return true; 806 } 807 808 for (i = 0; i < iovcnt; i++) { 809 iov_base = (uintptr_t)iovs[i].iov_base; 810 if ((iov_base & (alignment - 1)) != 0) { 811 return false; 812 } 813 } 814 815 return true; 816 } 817 818 static void 819 _copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt) 820 { 821 int i; 
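	/* This helper flattens the caller's iovec list into a contiguous bounce
	 * buffer, e.g. two 512-byte iovs fill the first 1024 bytes of 'buf'. It
	 * is used on the write path before a bounce buffer is submitted; the
	 * mirror helper _copy_buf_to_iovs() below scatters data back into the
	 * original iovecs on the read completion path.
	 */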
822 size_t len; 823 824 for (i = 0; i < iovcnt; i++) { 825 len = spdk_min(iovs[i].iov_len, buf_len); 826 memcpy(buf, iovs[i].iov_base, len); 827 buf += len; 828 buf_len -= len; 829 } 830 } 831 832 static void 833 _copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len) 834 { 835 int i; 836 size_t len; 837 838 for (i = 0; i < iovcnt; i++) { 839 len = spdk_min(iovs[i].iov_len, buf_len); 840 memcpy(iovs[i].iov_base, buf, len); 841 buf += len; 842 buf_len -= len; 843 } 844 } 845 846 static void 847 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 848 { 849 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 850 void *buf; 851 852 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 853 buf = bdev_io->internal.buf; 854 bdev_io->internal.buf = NULL; 855 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 856 bdev_io->internal.get_aux_buf_cb = NULL; 857 } else { 858 assert(bdev_io->internal.get_buf_cb != NULL); 859 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 860 bdev_io->internal.get_buf_cb = NULL; 861 } 862 } 863 864 static void 865 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 866 { 867 struct spdk_bdev_io *bdev_io = ctx; 868 869 if (rc) { 870 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 871 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 872 } 873 bdev_io_get_buf_complete(bdev_io, !rc); 874 } 875 876 static void 877 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 878 { 879 int rc = 0; 880 881 /* save original md_buf */ 882 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 883 bdev_io->internal.orig_md_iov.iov_len = len; 884 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 885 bdev_io->internal.bounce_md_iov.iov_len = len; 886 /* set bounce md_buf */ 887 bdev_io->u.bdev.md_buf = md_buf; 888 889 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 890 if (bdev_io_use_memory_domain(bdev_io)) { 891 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 892 bdev_io->internal.ext_opts->memory_domain_ctx, 893 &bdev_io->internal.orig_md_iov, 1, 894 &bdev_io->internal.bounce_md_iov, 1, 895 bdev_io->internal.data_transfer_cpl, 896 bdev_io); 897 if (rc == 0) { 898 /* Continue to submit IO in completion callback */ 899 return; 900 } 901 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 902 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain), rc); 903 } else { 904 memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); 905 } 906 } 907 908 assert(bdev_io->internal.data_transfer_cpl); 909 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 910 } 911 912 static void 913 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 914 { 915 struct spdk_bdev *bdev = bdev_io->bdev; 916 uint64_t md_len; 917 void *buf; 918 919 if (spdk_bdev_is_md_separate(bdev)) { 920 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 921 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 922 923 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 924 925 if (bdev_io->u.bdev.md_buf != NULL) { 926 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 927 return; 928 } else { 929 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 930 } 931 } 932 933 bdev_io_get_buf_complete(bdev_io, true); 934 } 935 936 static void 937 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) 938 { 939 struct spdk_bdev_io *bdev_io = ctx; 940 941 if (rc) { 942 SPDK_ERRLOG("Failed to get 
data buffer\n"); 943 assert(bdev_io->internal.data_transfer_cpl); 944 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 945 return; 946 } 947 948 _bdev_io_set_md_buf(bdev_io); 949 } 950 951 static void 952 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 953 bdev_copy_bounce_buffer_cpl cpl_cb) 954 { 955 int rc = 0; 956 957 bdev_io->internal.data_transfer_cpl = cpl_cb; 958 /* save original iovec */ 959 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 960 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 961 /* set bounce iov */ 962 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 963 bdev_io->u.bdev.iovcnt = 1; 964 /* set bounce buffer for this operation */ 965 bdev_io->u.bdev.iovs[0].iov_base = buf; 966 bdev_io->u.bdev.iovs[0].iov_len = len; 967 /* if this is write path, copy data from original buffer to bounce buffer */ 968 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 969 if (bdev_io_use_memory_domain(bdev_io)) { 970 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 971 bdev_io->internal.ext_opts->memory_domain_ctx, 972 bdev_io->internal.orig_iovs, 973 (uint32_t) bdev_io->internal.orig_iovcnt, 974 bdev_io->u.bdev.iovs, 1, 975 _bdev_io_pull_bounce_data_buf_done, 976 bdev_io); 977 if (rc == 0) { 978 /* Continue to submit IO in completion callback */ 979 return; 980 } 981 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 982 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 983 } else { 984 _copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 985 } 986 } 987 988 _bdev_io_pull_bounce_data_buf_done(bdev_io, rc); 989 } 990 991 static void 992 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 993 { 994 struct spdk_bdev *bdev = bdev_io->bdev; 995 bool buf_allocated; 996 uint64_t alignment; 997 void *aligned_buf; 998 999 bdev_io->internal.buf = buf; 1000 1001 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1002 bdev_io_get_buf_complete(bdev_io, true); 1003 return; 1004 } 1005 1006 alignment = spdk_bdev_get_buf_align(bdev); 1007 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1008 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1009 1010 if (buf_allocated) { 1011 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1012 /* Continue in completion callback */ 1013 return; 1014 } else { 1015 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1016 } 1017 1018 _bdev_io_set_md_buf(bdev_io); 1019 } 1020 1021 static void 1022 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1023 { 1024 struct spdk_bdev *bdev = bdev_io->bdev; 1025 struct spdk_mempool *pool; 1026 struct spdk_bdev_io *tmp; 1027 bdev_io_stailq_t *stailq; 1028 struct spdk_bdev_mgmt_channel *ch; 1029 uint64_t md_len, alignment; 1030 1031 md_len = spdk_bdev_is_md_separate(bdev) ? 
bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1032 alignment = spdk_bdev_get_buf_align(bdev); 1033 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1034 1035 if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 1036 SPDK_BDEV_POOL_ALIGNMENT) { 1037 pool = g_bdev_mgr.buf_small_pool; 1038 stailq = &ch->need_buf_small; 1039 } else { 1040 pool = g_bdev_mgr.buf_large_pool; 1041 stailq = &ch->need_buf_large; 1042 } 1043 1044 if (STAILQ_EMPTY(stailq)) { 1045 spdk_mempool_put(pool, buf); 1046 } else { 1047 tmp = STAILQ_FIRST(stailq); 1048 STAILQ_REMOVE_HEAD(stailq, internal.buf_link); 1049 _bdev_io_set_buf(tmp, buf, tmp->internal.buf_len); 1050 } 1051 } 1052 1053 static void 1054 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1055 { 1056 assert(bdev_io->internal.buf != NULL); 1057 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1058 bdev_io->internal.buf = NULL; 1059 } 1060 1061 void 1062 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1063 { 1064 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1065 1066 assert(buf != NULL); 1067 _bdev_io_put_buf(bdev_io, buf, len); 1068 } 1069 1070 static void 1071 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1072 { 1073 struct spdk_bdev *bdev = bdev_ch->bdev; 1074 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1075 struct spdk_bdev_io *bdev_io; 1076 1077 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1078 /* 1079 * Allow some more I/O to complete before retrying the nomem_io queue. 1080 * Some drivers (such as nvme) cannot immediately take a new I/O in 1081 * the context of a completion, because the resources for the I/O are 1082 * not released until control returns to the bdev poller. Also, we 1083 * may require several small I/O to complete before a larger I/O 1084 * (that requires splitting) can be submitted. 1085 */ 1086 return; 1087 } 1088 1089 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1090 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1091 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1092 bdev_io->internal.ch->io_outstanding++; 1093 shared_resource->io_outstanding++; 1094 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1095 bdev_io->internal.error.nvme.cdw0 = 0; 1096 bdev_io->num_retries++; 1097 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1098 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 1099 break; 1100 } 1101 } 1102 } 1103 1104 static inline void 1105 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1106 struct spdk_bdev_shared_resource *shared_resource) 1107 { 1108 assert(bdev_ch->io_outstanding > 0); 1109 assert(shared_resource->io_outstanding > 0); 1110 bdev_ch->io_outstanding--; 1111 shared_resource->io_outstanding--; 1112 } 1113 1114 static inline bool 1115 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io) 1116 { 1117 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1118 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1119 1120 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1121 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 1122 /* 1123 * Wait for some of the outstanding I/O to complete before we 1124 * retry any of the nomem_io. 
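 * As a concrete example of the spdk_max() calculation below: with 100 I/O
 * outstanding the threshold becomes max(100 / 2, 100 - 8) = 92, i.e. wait
 * for roughly NOMEM_THRESHOLD_COUNT completions; with 10 outstanding it
 * becomes max(5, 2) = 5, i.e. wait for half of them.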
Normally we will wait for 1125 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 1126 * depth channels we will instead wait for half to complete. 1127 */ 1128 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 1129 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 1130 return true; 1131 } 1132 1133 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1134 bdev_ch_retry_io(bdev_ch); 1135 } 1136 1137 return false; 1138 } 1139 1140 static void 1141 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1142 { 1143 struct spdk_bdev_io *bdev_io = ctx; 1144 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1145 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1146 1147 if (rc) { 1148 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1149 } 1150 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1151 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 1152 */ 1153 bdev_io_put_buf(bdev_io); 1154 1155 /* Continue with IO completion flow */ 1156 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 1157 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 1158 return; 1159 } 1160 1161 bdev_io_complete(bdev_io); 1162 } 1163 1164 static inline void 1165 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) 1166 { 1167 int rc = 0; 1168 1169 /* do the same for metadata buffer */ 1170 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1171 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1172 1173 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1174 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1175 if (bdev_io_use_memory_domain(bdev_io)) { 1176 /* If memory domain is used then we need to call async push function */ 1177 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1178 bdev_io->internal.ext_opts->memory_domain_ctx, 1179 &bdev_io->internal.orig_md_iov, 1180 (uint32_t)bdev_io->internal.orig_iovcnt, 1181 &bdev_io->internal.bounce_md_iov, 1, 1182 bdev_io->internal.data_transfer_cpl, 1183 bdev_io); 1184 if (rc == 0) { 1185 /* Continue IO completion in async callback */ 1186 return; 1187 } 1188 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1189 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1190 } else { 1191 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1192 bdev_io->internal.orig_md_iov.iov_len); 1193 } 1194 } 1195 } 1196 1197 assert(bdev_io->internal.data_transfer_cpl); 1198 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1199 } 1200 1201 static void 1202 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) 1203 { 1204 struct spdk_bdev_io *bdev_io = ctx; 1205 1206 assert(bdev_io->internal.data_transfer_cpl); 1207 1208 if (rc) { 1209 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1210 return; 1211 } 1212 1213 /* set original buffer for this io */ 1214 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1215 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1216 /* disable bouncing buffer for this io */ 1217 bdev_io->internal.orig_iovcnt = 0; 1218 bdev_io->internal.orig_iovs = NULL; 1219 1220 _bdev_io_push_bounce_md_buffer(bdev_io); 1221 } 1222 1223 static inline void 1224 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1225 { 1226 int rc = 0; 1227 1228 bdev_io->internal.data_transfer_cpl = cpl_cb; 1229 1230 /* if this is read 
path, copy data from bounce buffer to original buffer */ 1231 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1232 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1233 if (bdev_io_use_memory_domain(bdev_io)) { 1234 /* If memory domain is used then we need to call async push function */ 1235 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1236 bdev_io->internal.ext_opts->memory_domain_ctx, 1237 bdev_io->internal.orig_iovs, 1238 (uint32_t)bdev_io->internal.orig_iovcnt, 1239 &bdev_io->internal.bounce_iov, 1, 1240 _bdev_io_push_bounce_data_buffer_done, 1241 bdev_io); 1242 if (rc == 0) { 1243 /* Continue IO completion in async callback */ 1244 return; 1245 } 1246 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1247 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1248 } else { 1249 _copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1250 bdev_io->internal.orig_iovcnt, 1251 bdev_io->internal.bounce_iov.iov_base, 1252 bdev_io->internal.bounce_iov.iov_len); 1253 } 1254 } 1255 1256 _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); 1257 } 1258 1259 static void 1260 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1261 { 1262 struct spdk_bdev *bdev = bdev_io->bdev; 1263 struct spdk_mempool *pool; 1264 bdev_io_stailq_t *stailq; 1265 struct spdk_bdev_mgmt_channel *mgmt_ch; 1266 uint64_t alignment, md_len; 1267 void *buf; 1268 1269 alignment = spdk_bdev_get_buf_align(bdev); 1270 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1271 1272 if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + 1273 SPDK_BDEV_POOL_ALIGNMENT) { 1274 SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n", 1275 len + alignment); 1276 bdev_io_get_buf_complete(bdev_io, false); 1277 return; 1278 } 1279 1280 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1281 1282 bdev_io->internal.buf_len = len; 1283 1284 if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 1285 SPDK_BDEV_POOL_ALIGNMENT) { 1286 pool = g_bdev_mgr.buf_small_pool; 1287 stailq = &mgmt_ch->need_buf_small; 1288 } else { 1289 pool = g_bdev_mgr.buf_large_pool; 1290 stailq = &mgmt_ch->need_buf_large; 1291 } 1292 1293 buf = spdk_mempool_get(pool); 1294 if (!buf) { 1295 STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link); 1296 } else { 1297 _bdev_io_set_buf(bdev_io, buf, len); 1298 } 1299 } 1300 1301 void 1302 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1303 { 1304 struct spdk_bdev *bdev = bdev_io->bdev; 1305 uint64_t alignment; 1306 1307 assert(cb != NULL); 1308 bdev_io->internal.get_buf_cb = cb; 1309 1310 alignment = spdk_bdev_get_buf_align(bdev); 1311 1312 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1313 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1314 /* Buffer already present and aligned */ 1315 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1316 return; 1317 } 1318 1319 bdev_io_get_buf(bdev_io, len); 1320 } 1321 1322 static void 1323 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1324 bool success) 1325 { 1326 if (!success) { 1327 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1328 bdev_io_complete(bdev_io); 1329 } else { 1330 bdev_io_submit(bdev_io); 1331 } 1332 } 1333 1334 static void 1335 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1336 uint64_t 
len) 1337 { 1338 assert(cb != NULL); 1339 bdev_io->internal.get_buf_cb = cb; 1340 1341 bdev_io_get_buf(bdev_io, len); 1342 } 1343 1344 void 1345 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1346 { 1347 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1348 1349 assert(cb != NULL); 1350 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1351 bdev_io->internal.get_aux_buf_cb = cb; 1352 bdev_io_get_buf(bdev_io, len); 1353 } 1354 1355 static int 1356 bdev_module_get_max_ctx_size(void) 1357 { 1358 struct spdk_bdev_module *bdev_module; 1359 int max_bdev_module_size = 0; 1360 1361 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1362 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1363 max_bdev_module_size = bdev_module->get_ctx_size(); 1364 } 1365 } 1366 1367 return max_bdev_module_size; 1368 } 1369 1370 static void 1371 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1372 { 1373 int i; 1374 struct spdk_bdev_qos *qos = bdev->internal.qos; 1375 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1376 1377 if (!qos) { 1378 return; 1379 } 1380 1381 spdk_bdev_get_qos_rate_limits(bdev, limits); 1382 1383 spdk_json_write_object_begin(w); 1384 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1385 1386 spdk_json_write_named_object_begin(w, "params"); 1387 spdk_json_write_named_string(w, "name", bdev->name); 1388 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1389 if (limits[i] > 0) { 1390 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1391 } 1392 } 1393 spdk_json_write_object_end(w); 1394 1395 spdk_json_write_object_end(w); 1396 } 1397 1398 void 1399 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1400 { 1401 struct spdk_bdev_module *bdev_module; 1402 struct spdk_bdev *bdev; 1403 1404 assert(w != NULL); 1405 1406 spdk_json_write_array_begin(w); 1407 1408 spdk_json_write_object_begin(w); 1409 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1410 spdk_json_write_named_object_begin(w, "params"); 1411 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1412 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1413 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1414 spdk_json_write_object_end(w); 1415 spdk_json_write_object_end(w); 1416 1417 bdev_examine_allowlist_config_json(w); 1418 1419 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1420 if (bdev_module->config_json) { 1421 bdev_module->config_json(w); 1422 } 1423 } 1424 1425 pthread_mutex_lock(&g_bdev_mgr.mutex); 1426 1427 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1428 if (bdev->fn_table->write_config_json) { 1429 bdev->fn_table->write_config_json(bdev, w); 1430 } 1431 1432 bdev_qos_config_json(bdev, w); 1433 } 1434 1435 pthread_mutex_unlock(&g_bdev_mgr.mutex); 1436 1437 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1438 spdk_json_write_object_begin(w); 1439 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1440 spdk_json_write_object_end(w); 1441 1442 spdk_json_write_array_end(w); 1443 } 1444 1445 static int 1446 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1447 { 1448 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1449 struct spdk_bdev_io *bdev_io; 1450 uint32_t i; 1451 1452 STAILQ_INIT(&ch->need_buf_small); 1453 
STAILQ_INIT(&ch->need_buf_large); 1454 1455 STAILQ_INIT(&ch->per_thread_cache); 1456 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1457 1458 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1459 ch->per_thread_cache_count = 0; 1460 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1461 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1462 assert(bdev_io != NULL); 1463 ch->per_thread_cache_count++; 1464 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1465 } 1466 1467 TAILQ_INIT(&ch->shared_resources); 1468 TAILQ_INIT(&ch->io_wait_queue); 1469 1470 return 0; 1471 } 1472 1473 static void 1474 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1475 { 1476 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1477 struct spdk_bdev_io *bdev_io; 1478 1479 if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) { 1480 SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n"); 1481 } 1482 1483 if (!TAILQ_EMPTY(&ch->shared_resources)) { 1484 SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n"); 1485 } 1486 1487 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1488 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1489 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1490 ch->per_thread_cache_count--; 1491 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1492 } 1493 1494 assert(ch->per_thread_cache_count == 0); 1495 } 1496 1497 static void 1498 bdev_init_complete(int rc) 1499 { 1500 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1501 void *cb_arg = g_init_cb_arg; 1502 struct spdk_bdev_module *m; 1503 1504 g_bdev_mgr.init_complete = true; 1505 g_init_cb_fn = NULL; 1506 g_init_cb_arg = NULL; 1507 1508 /* 1509 * For modules that need to know when subsystem init is complete, 1510 * inform them now. 1511 */ 1512 if (rc == 0) { 1513 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1514 if (m->init_complete) { 1515 m->init_complete(); 1516 } 1517 } 1518 } 1519 1520 cb_fn(cb_arg, rc); 1521 } 1522 1523 static bool 1524 bdev_module_all_actions_completed(void) 1525 { 1526 struct spdk_bdev_module *m; 1527 1528 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1529 if (m->internal.action_in_progress > 0) { 1530 return false; 1531 } 1532 } 1533 return true; 1534 } 1535 1536 static void 1537 bdev_module_action_complete(void) 1538 { 1539 /* 1540 * Don't finish bdev subsystem initialization if 1541 * module pre-initialization is still in progress, or 1542 * the subsystem been already initialized. 1543 */ 1544 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 1545 return; 1546 } 1547 1548 /* 1549 * Check all bdev modules for inits/examinations in progress. If any 1550 * exist, return immediately since we cannot finish bdev subsystem 1551 * initialization until all are completed. 1552 */ 1553 if (!bdev_module_all_actions_completed()) { 1554 return; 1555 } 1556 1557 /* 1558 * Modules already finished initialization - now that all 1559 * the bdev modules have finished their asynchronous I/O 1560 * processing, the entire bdev layer can be marked as complete. 
1561 */ 1562 bdev_init_complete(0); 1563 } 1564 1565 static void 1566 bdev_module_action_done(struct spdk_bdev_module *module) 1567 { 1568 assert(module->internal.action_in_progress > 0); 1569 module->internal.action_in_progress--; 1570 bdev_module_action_complete(); 1571 } 1572 1573 void 1574 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1575 { 1576 bdev_module_action_done(module); 1577 } 1578 1579 void 1580 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1581 { 1582 bdev_module_action_done(module); 1583 } 1584 1585 /** The last initialized bdev module */ 1586 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1587 1588 static void 1589 bdev_init_failed(void *cb_arg) 1590 { 1591 struct spdk_bdev_module *module = cb_arg; 1592 1593 module->internal.action_in_progress--; 1594 bdev_init_complete(-1); 1595 } 1596 1597 static int 1598 bdev_modules_init(void) 1599 { 1600 struct spdk_bdev_module *module; 1601 int rc = 0; 1602 1603 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1604 g_resume_bdev_module = module; 1605 if (module->async_init) { 1606 module->internal.action_in_progress = 1; 1607 } 1608 rc = module->module_init(); 1609 if (rc != 0) { 1610 /* Bump action_in_progress to prevent other modules from completion of modules_init 1611 * Send message to defer application shutdown until resources are cleaned up */ 1612 module->internal.action_in_progress = 1; 1613 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1614 return rc; 1615 } 1616 } 1617 1618 g_resume_bdev_module = NULL; 1619 return 0; 1620 } 1621 1622 void 1623 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1624 { 1625 int cache_size; 1626 int rc = 0; 1627 char mempool_name[32]; 1628 1629 assert(cb_fn != NULL); 1630 1631 g_init_cb_fn = cb_fn; 1632 g_init_cb_arg = cb_arg; 1633 1634 spdk_notify_type_register("bdev_register"); 1635 spdk_notify_type_register("bdev_unregister"); 1636 1637 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1638 1639 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1640 g_bdev_opts.bdev_io_pool_size, 1641 sizeof(struct spdk_bdev_io) + 1642 bdev_module_get_max_ctx_size(), 1643 0, 1644 SPDK_ENV_SOCKET_ID_ANY); 1645 1646 if (g_bdev_mgr.bdev_io_pool == NULL) { 1647 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 1648 bdev_init_complete(-1); 1649 return; 1650 } 1651 1652 /** 1653 * Ensure no more than half of the total buffers end up local caches, by 1654 * using spdk_env_get_core_count() to determine how many local caches we need 1655 * to account for. 
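 * For example, with BUF_SMALL_POOL_SIZE == 8191 and 4 cores, the per-core
 * cache computed below is 8191 / (2 * 4) = 1023 buffers, so the caches
 * together can hold at most 4 * 1023 = 4092 buffers, i.e. no more than
 * half of the pool.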
1656 */ 1657 cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count()); 1658 snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid()); 1659 1660 g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name, 1661 g_bdev_opts.small_buf_pool_size, 1662 SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 1663 SPDK_BDEV_POOL_ALIGNMENT, 1664 cache_size, 1665 SPDK_ENV_SOCKET_ID_ANY); 1666 if (!g_bdev_mgr.buf_small_pool) { 1667 SPDK_ERRLOG("create rbuf small pool failed\n"); 1668 bdev_init_complete(-1); 1669 return; 1670 } 1671 1672 cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count()); 1673 snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid()); 1674 1675 g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name, 1676 g_bdev_opts.large_buf_pool_size, 1677 SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + 1678 SPDK_BDEV_POOL_ALIGNMENT, 1679 cache_size, 1680 SPDK_ENV_SOCKET_ID_ANY); 1681 if (!g_bdev_mgr.buf_large_pool) { 1682 SPDK_ERRLOG("create rbuf large pool failed\n"); 1683 bdev_init_complete(-1); 1684 return; 1685 } 1686 1687 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1688 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1689 if (!g_bdev_mgr.zero_buffer) { 1690 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1691 bdev_init_complete(-1); 1692 return; 1693 } 1694 1695 #ifdef SPDK_CONFIG_VTUNE 1696 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1697 #endif 1698 1699 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 1700 bdev_mgmt_channel_destroy, 1701 sizeof(struct spdk_bdev_mgmt_channel), 1702 "bdev_mgr"); 1703 1704 rc = bdev_modules_init(); 1705 g_bdev_mgr.module_init_complete = true; 1706 if (rc != 0) { 1707 SPDK_ERRLOG("bdev modules init failed\n"); 1708 return; 1709 } 1710 1711 bdev_module_action_complete(); 1712 } 1713 1714 static void 1715 bdev_mgr_unregister_cb(void *io_device) 1716 { 1717 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1718 1719 if (g_bdev_mgr.bdev_io_pool) { 1720 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1721 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1722 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1723 g_bdev_opts.bdev_io_pool_size); 1724 } 1725 1726 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1727 } 1728 1729 if (g_bdev_mgr.buf_small_pool) { 1730 if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != g_bdev_opts.small_buf_pool_size) { 1731 SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n", 1732 spdk_mempool_count(g_bdev_mgr.buf_small_pool), 1733 g_bdev_opts.small_buf_pool_size); 1734 assert(false); 1735 } 1736 1737 spdk_mempool_free(g_bdev_mgr.buf_small_pool); 1738 } 1739 1740 if (g_bdev_mgr.buf_large_pool) { 1741 if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != g_bdev_opts.large_buf_pool_size) { 1742 SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n", 1743 spdk_mempool_count(g_bdev_mgr.buf_large_pool), 1744 g_bdev_opts.large_buf_pool_size); 1745 assert(false); 1746 } 1747 1748 spdk_mempool_free(g_bdev_mgr.buf_large_pool); 1749 } 1750 1751 spdk_free(g_bdev_mgr.zero_buffer); 1752 1753 bdev_examine_allowlist_free(); 1754 1755 cb_fn(g_fini_cb_arg); 1756 g_fini_cb_fn = NULL; 1757 g_fini_cb_arg = NULL; 1758 g_bdev_mgr.init_complete = false; 1759 g_bdev_mgr.module_init_complete = false; 1760 } 1761 1762 static void 1763 bdev_module_fini_iter(void *arg) 1764 { 1765 struct spdk_bdev_module *bdev_module; 1766 1767 /* FIXME: Handling initialization failures is broken 
now, 1768 * so we won't even try cleaning up after successfully 1769 * initialized modules. if module_init_complete is false, 1770 * just call spdk_bdev_mgr_unregister_cb 1771 */ 1772 if (!g_bdev_mgr.module_init_complete) { 1773 bdev_mgr_unregister_cb(NULL); 1774 return; 1775 } 1776 1777 /* Start iterating from the last touched module */ 1778 if (!g_resume_bdev_module) { 1779 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1780 } else { 1781 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 1782 internal.tailq); 1783 } 1784 1785 while (bdev_module) { 1786 if (bdev_module->async_fini) { 1787 /* Save our place so we can resume later. We must 1788 * save the variable here, before calling module_fini() 1789 * below, because in some cases the module may immediately 1790 * call spdk_bdev_module_fini_done() and re-enter 1791 * this function to continue iterating. */ 1792 g_resume_bdev_module = bdev_module; 1793 } 1794 1795 if (bdev_module->module_fini) { 1796 bdev_module->module_fini(); 1797 } 1798 1799 if (bdev_module->async_fini) { 1800 return; 1801 } 1802 1803 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 1804 internal.tailq); 1805 } 1806 1807 g_resume_bdev_module = NULL; 1808 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 1809 } 1810 1811 void 1812 spdk_bdev_module_fini_done(void) 1813 { 1814 if (spdk_get_thread() != g_fini_thread) { 1815 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 1816 } else { 1817 bdev_module_fini_iter(NULL); 1818 } 1819 } 1820 1821 static void 1822 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 1823 { 1824 struct spdk_bdev *bdev = cb_arg; 1825 1826 if (bdeverrno && bdev) { 1827 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 1828 bdev->name); 1829 1830 /* 1831 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 1832 * bdev; try to continue by manually removing this bdev from the list and continue 1833 * with the next bdev in the list. 1834 */ 1835 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 1836 } 1837 1838 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 1839 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 1840 /* 1841 * Bdev module finish need to be deferred as we might be in the middle of some context 1842 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 1843 * after returning. 1844 */ 1845 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 1846 return; 1847 } 1848 1849 /* 1850 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 1851 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 1852 * to detect clean shutdown as opposed to run-time hot removal of the underlying 1853 * base bdevs. 1854 * 1855 * Also, walk the list in the reverse order. 
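 * For example, a RAID or logical volume bdev stacked on top of NVMe bdevs
 * is typically registered after its base bdevs, so the reverse walk
 * unregisters the virtual bdev first and lets it release its claims before
 * the base bdevs go away.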
1856 */ 1857 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1858 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1859 if (bdev->internal.claim_module != NULL) { 1860 SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n", 1861 bdev->name, bdev->internal.claim_module->name); 1862 continue; 1863 } 1864 1865 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 1866 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1867 return; 1868 } 1869 1870 /* 1871 * If any bdev fails to unclaim underlying bdev properly, we may face the 1872 * case of bdev list consisting of claimed bdevs only (if claims are managed 1873 * correctly, this would mean there's a loop in the claims graph which is 1874 * clearly impossible). Warn and unregister last bdev on the list then. 1875 */ 1876 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1877 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1878 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1879 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1880 return; 1881 } 1882 } 1883 1884 static void 1885 bdev_module_fini_start_iter(void *arg) 1886 { 1887 struct spdk_bdev_module *bdev_module; 1888 1889 if (!g_resume_bdev_module) { 1890 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1891 } else { 1892 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 1893 } 1894 1895 while (bdev_module) { 1896 if (bdev_module->async_fini_start) { 1897 /* Save our place so we can resume later. We must 1898 * save the variable here, before calling fini_start() 1899 * below, because in some cases the module may immediately 1900 * call spdk_bdev_module_fini_start_done() and re-enter 1901 * this function to continue iterating. */ 1902 g_resume_bdev_module = bdev_module; 1903 } 1904 1905 if (bdev_module->fini_start) { 1906 bdev_module->fini_start(); 1907 } 1908 1909 if (bdev_module->async_fini_start) { 1910 return; 1911 } 1912 1913 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1914 } 1915 1916 g_resume_bdev_module = NULL; 1917 1918 bdev_finish_unregister_bdevs_iter(NULL, 0); 1919 } 1920 1921 void 1922 spdk_bdev_module_fini_start_done(void) 1923 { 1924 if (spdk_get_thread() != g_fini_thread) { 1925 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1926 } else { 1927 bdev_module_fini_start_iter(NULL); 1928 } 1929 } 1930 1931 void 1932 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1933 { 1934 assert(cb_fn != NULL); 1935 1936 g_fini_thread = spdk_get_thread(); 1937 1938 g_fini_cb_fn = cb_fn; 1939 g_fini_cb_arg = cb_arg; 1940 1941 bdev_module_fini_start_iter(NULL); 1942 } 1943 1944 struct spdk_bdev_io * 1945 bdev_channel_get_io(struct spdk_bdev_channel *channel) 1946 { 1947 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 1948 struct spdk_bdev_io *bdev_io; 1949 1950 if (ch->per_thread_cache_count > 0) { 1951 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1952 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1953 ch->per_thread_cache_count--; 1954 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 1955 /* 1956 * Don't try to look for bdev_ios in the global pool if there are 1957 * waiters on bdev_ios - we don't want this caller to jump the line. 
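 * Callers observe this as spdk_bdev_read()/spdk_bdev_write() returning
 * -ENOMEM and typically park themselves on io_wait_queue with
 * spdk_bdev_queue_io_wait(). A rough caller-side sketch (ctx, wait_entry,
 * io_done and retry_read are hypothetical names):
 *
 *   rc = spdk_bdev_read_blocks(desc, ch, buf, offset, num_blocks, io_done, ctx);
 *   if (rc == -ENOMEM) {
 *       ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(desc);
 *       ctx->wait_entry.cb_fn = retry_read;
 *       ctx->wait_entry.cb_arg = ctx;
 *       spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, ch, &ctx->wait_entry);
 *   }
 *
 * Waiters are then serviced in FIFO order from bdev_ios returned in
 * spdk_bdev_free_io(), instead of letting a new caller dip into the global
 * pool ahead of them.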
1958 */ 1959 bdev_io = NULL; 1960 } else { 1961 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1962 } 1963 1964 return bdev_io; 1965 } 1966 1967 void 1968 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 1969 { 1970 struct spdk_bdev_mgmt_channel *ch; 1971 1972 assert(bdev_io != NULL); 1973 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 1974 1975 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1976 1977 if (bdev_io->internal.buf != NULL) { 1978 bdev_io_put_buf(bdev_io); 1979 } 1980 1981 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 1982 ch->per_thread_cache_count++; 1983 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1984 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 1985 struct spdk_bdev_io_wait_entry *entry; 1986 1987 entry = TAILQ_FIRST(&ch->io_wait_queue); 1988 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 1989 entry->cb_fn(entry->cb_arg); 1990 } 1991 } else { 1992 /* We should never have a full cache with entries on the io wait queue. */ 1993 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 1994 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1995 } 1996 } 1997 1998 static bool 1999 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2000 { 2001 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2002 2003 switch (limit) { 2004 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2005 return true; 2006 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2007 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2008 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2009 return false; 2010 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2011 default: 2012 return false; 2013 } 2014 } 2015 2016 static bool 2017 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2018 { 2019 switch (bdev_io->type) { 2020 case SPDK_BDEV_IO_TYPE_NVME_IO: 2021 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2022 case SPDK_BDEV_IO_TYPE_READ: 2023 case SPDK_BDEV_IO_TYPE_WRITE: 2024 return true; 2025 case SPDK_BDEV_IO_TYPE_ZCOPY: 2026 if (bdev_io->u.bdev.zcopy.start) { 2027 return true; 2028 } else { 2029 return false; 2030 } 2031 default: 2032 return false; 2033 } 2034 } 2035 2036 static bool 2037 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2038 { 2039 switch (bdev_io->type) { 2040 case SPDK_BDEV_IO_TYPE_NVME_IO: 2041 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2042 /* Bit 1 (0x2) set for read operation */ 2043 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2044 return true; 2045 } else { 2046 return false; 2047 } 2048 case SPDK_BDEV_IO_TYPE_READ: 2049 return true; 2050 case SPDK_BDEV_IO_TYPE_ZCOPY: 2051 /* Populate to read from disk */ 2052 if (bdev_io->u.bdev.zcopy.populate) { 2053 return true; 2054 } else { 2055 return false; 2056 } 2057 default: 2058 return false; 2059 } 2060 } 2061 2062 static uint64_t 2063 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2064 { 2065 struct spdk_bdev *bdev = bdev_io->bdev; 2066 2067 switch (bdev_io->type) { 2068 case SPDK_BDEV_IO_TYPE_NVME_IO: 2069 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2070 return bdev_io->u.nvme_passthru.nbytes; 2071 case SPDK_BDEV_IO_TYPE_READ: 2072 case SPDK_BDEV_IO_TYPE_WRITE: 2073 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2074 case SPDK_BDEV_IO_TYPE_ZCOPY: 2075 /* Track the data in the start phase only */ 2076 if (bdev_io->u.bdev.zcopy.start) { 2077 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2078 } else { 2079 return 0; 2080 } 2081 default: 2082 return 0; 2083 } 2084 } 2085 2086 static bool 2087 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 
2088 { 2089 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2090 return true; 2091 } else { 2092 return false; 2093 } 2094 } 2095 2096 static bool 2097 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2098 { 2099 if (bdev_is_read_io(io) == false) { 2100 return false; 2101 } 2102 2103 return bdev_qos_rw_queue_io(limit, io); 2104 } 2105 2106 static bool 2107 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2108 { 2109 if (bdev_is_read_io(io) == true) { 2110 return false; 2111 } 2112 2113 return bdev_qos_rw_queue_io(limit, io); 2114 } 2115 2116 static void 2117 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2118 { 2119 limit->remaining_this_timeslice--; 2120 } 2121 2122 static void 2123 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2124 { 2125 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2126 } 2127 2128 static void 2129 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2130 { 2131 if (bdev_is_read_io(io) == false) { 2132 return; 2133 } 2134 2135 return bdev_qos_rw_bps_update_quota(limit, io); 2136 } 2137 2138 static void 2139 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2140 { 2141 if (bdev_is_read_io(io) == true) { 2142 return; 2143 } 2144 2145 return bdev_qos_rw_bps_update_quota(limit, io); 2146 } 2147 2148 static void 2149 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2150 { 2151 int i; 2152 2153 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2154 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2155 qos->rate_limits[i].queue_io = NULL; 2156 qos->rate_limits[i].update_quota = NULL; 2157 continue; 2158 } 2159 2160 switch (i) { 2161 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2162 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2163 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2164 break; 2165 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2166 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2167 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2168 break; 2169 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2170 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2171 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2172 break; 2173 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2174 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2175 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2176 break; 2177 default: 2178 break; 2179 } 2180 } 2181 } 2182 2183 static void 2184 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2185 struct spdk_bdev_io *bdev_io, 2186 enum spdk_bdev_io_status status) 2187 { 2188 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2189 2190 bdev_io->internal.in_submit_request = true; 2191 bdev_ch->io_outstanding++; 2192 shared_resource->io_outstanding++; 2193 spdk_bdev_io_complete(bdev_io, status); 2194 bdev_io->internal.in_submit_request = false; 2195 } 2196 2197 static inline void 2198 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2199 { 2200 struct spdk_bdev *bdev = bdev_io->bdev; 2201 struct spdk_io_channel *ch = bdev_ch->channel; 2202 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2203 2204 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2205 struct spdk_bdev_mgmt_channel *mgmt_channel = 
shared_resource->mgmt_ch; 2206 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2207 2208 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2209 bdev_abort_buf_io(&mgmt_channel->need_buf_small, bio_to_abort) || 2210 bdev_abort_buf_io(&mgmt_channel->need_buf_large, bio_to_abort)) { 2211 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2212 SPDK_BDEV_IO_STATUS_SUCCESS); 2213 return; 2214 } 2215 } 2216 2217 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2218 bdev_ch->io_outstanding++; 2219 shared_resource->io_outstanding++; 2220 bdev_io->internal.in_submit_request = true; 2221 bdev->fn_table->submit_request(ch, bdev_io); 2222 bdev_io->internal.in_submit_request = false; 2223 } else { 2224 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2225 } 2226 } 2227 2228 static int 2229 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2230 { 2231 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2232 int i, submitted_ios = 0; 2233 2234 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2235 if (bdev_qos_io_to_limit(bdev_io) == true) { 2236 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2237 if (!qos->rate_limits[i].queue_io) { 2238 continue; 2239 } 2240 2241 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2242 bdev_io) == true) { 2243 return submitted_ios; 2244 } 2245 } 2246 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2247 if (!qos->rate_limits[i].update_quota) { 2248 continue; 2249 } 2250 2251 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2252 } 2253 } 2254 2255 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2256 bdev_io_do_submit(ch, bdev_io); 2257 submitted_ios++; 2258 } 2259 2260 return submitted_ios; 2261 } 2262 2263 static void 2264 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2265 { 2266 int rc; 2267 2268 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2269 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2270 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2271 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2272 &bdev_io->internal.waitq_entry); 2273 if (rc != 0) { 2274 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2275 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2276 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2277 } 2278 } 2279 2280 static bool 2281 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2282 { 2283 uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary; 2284 uint32_t max_size = bdev_io->bdev->max_segment_size; 2285 int max_segs = bdev_io->bdev->max_num_segments; 2286 2287 io_boundary = bdev_io->bdev->split_on_optimal_io_boundary ? io_boundary : 0; 2288 2289 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2290 return false; 2291 } 2292 2293 if (io_boundary) { 2294 uint64_t start_stripe, end_stripe; 2295 2296 start_stripe = bdev_io->u.bdev.offset_blocks; 2297 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2298 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
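 * For example, with io_boundary = 128 the shift amount is spdk_u32log2(128) = 7,
 * so an I/O covering blocks 120..135 maps to stripes 0 and 1 and must be split.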
*/ 2299 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2300 start_stripe >>= spdk_u32log2(io_boundary); 2301 end_stripe >>= spdk_u32log2(io_boundary); 2302 } else { 2303 start_stripe /= io_boundary; 2304 end_stripe /= io_boundary; 2305 } 2306 2307 if (start_stripe != end_stripe) { 2308 return true; 2309 } 2310 } 2311 2312 if (max_segs) { 2313 if (bdev_io->u.bdev.iovcnt > max_segs) { 2314 return true; 2315 } 2316 } 2317 2318 if (max_size) { 2319 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2320 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2321 return true; 2322 } 2323 } 2324 } 2325 2326 return false; 2327 } 2328 2329 static bool 2330 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2331 { 2332 uint32_t num_unmap_segments; 2333 2334 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2335 return false; 2336 } 2337 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2338 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2339 return true; 2340 } 2341 2342 return false; 2343 } 2344 2345 static bool 2346 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2347 { 2348 if (!bdev_io->bdev->max_write_zeroes) { 2349 return false; 2350 } 2351 2352 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2353 return true; 2354 } 2355 2356 return false; 2357 } 2358 2359 static bool 2360 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2361 { 2362 switch (bdev_io->type) { 2363 case SPDK_BDEV_IO_TYPE_READ: 2364 case SPDK_BDEV_IO_TYPE_WRITE: 2365 return bdev_rw_should_split(bdev_io); 2366 case SPDK_BDEV_IO_TYPE_UNMAP: 2367 return bdev_unmap_should_split(bdev_io); 2368 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2369 return bdev_write_zeroes_should_split(bdev_io); 2370 default: 2371 return false; 2372 } 2373 } 2374 2375 static uint32_t 2376 _to_next_boundary(uint64_t offset, uint32_t boundary) 2377 { 2378 return (boundary - (offset % boundary)); 2379 } 2380 2381 static void 2382 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2383 2384 static void 2385 _bdev_rw_split(void *_bdev_io); 2386 2387 static void 2388 bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2389 2390 static void 2391 _bdev_unmap_split(void *_bdev_io) 2392 { 2393 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2394 } 2395 2396 static void 2397 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2398 2399 static void 2400 _bdev_write_zeroes_split(void *_bdev_io) 2401 { 2402 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2403 } 2404 2405 static int 2406 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2407 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2408 { 2409 int rc; 2410 uint64_t current_offset, current_remaining; 2411 spdk_bdev_io_wait_cb io_wait_fn; 2412 2413 current_offset = *offset; 2414 current_remaining = *remaining; 2415 2416 bdev_io->u.bdev.split_outstanding++; 2417 2418 io_wait_fn = _bdev_rw_split; 2419 switch (bdev_io->type) { 2420 case SPDK_BDEV_IO_TYPE_READ: 2421 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2422 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2423 iov, iovcnt, md_buf, current_offset, 2424 num_blocks, 2425 bdev_io_split_done, bdev_io, 2426 bdev_io->internal.ext_opts, true); 2427 break; 2428 case SPDK_BDEV_IO_TYPE_WRITE: 2429 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2430 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2431 iov, iovcnt, md_buf, current_offset, 
2432 num_blocks, 2433 bdev_io_split_done, bdev_io, 2434 bdev_io->internal.ext_opts, true); 2435 break; 2436 case SPDK_BDEV_IO_TYPE_UNMAP: 2437 io_wait_fn = _bdev_unmap_split; 2438 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2439 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2440 current_offset, num_blocks, 2441 bdev_io_split_done, bdev_io); 2442 break; 2443 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2444 io_wait_fn = _bdev_write_zeroes_split; 2445 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2446 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2447 current_offset, num_blocks, 2448 bdev_io_split_done, bdev_io); 2449 break; 2450 default: 2451 assert(false); 2452 rc = -EINVAL; 2453 break; 2454 } 2455 2456 if (rc == 0) { 2457 current_offset += num_blocks; 2458 current_remaining -= num_blocks; 2459 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2460 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2461 *offset = current_offset; 2462 *remaining = current_remaining; 2463 } else { 2464 bdev_io->u.bdev.split_outstanding--; 2465 if (rc == -ENOMEM) { 2466 if (bdev_io->u.bdev.split_outstanding == 0) { 2467 /* No I/O is outstanding. Hence we should wait here. */ 2468 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2469 } 2470 } else { 2471 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2472 if (bdev_io->u.bdev.split_outstanding == 0) { 2473 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2474 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2475 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2476 } 2477 } 2478 } 2479 2480 return rc; 2481 } 2482 2483 static void 2484 _bdev_rw_split(void *_bdev_io) 2485 { 2486 struct iovec *parent_iov, *iov; 2487 struct spdk_bdev_io *bdev_io = _bdev_io; 2488 struct spdk_bdev *bdev = bdev_io->bdev; 2489 uint64_t parent_offset, current_offset, remaining; 2490 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2491 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2492 uint32_t iovcnt, iov_len, child_iovsize; 2493 uint32_t blocklen = bdev->blocklen; 2494 uint32_t io_boundary = bdev->optimal_io_boundary; 2495 uint32_t max_segment_size = bdev->max_segment_size; 2496 uint32_t max_child_iovcnt = bdev->max_num_segments; 2497 void *md_buf = NULL; 2498 int rc; 2499 2500 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2501 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, BDEV_IO_NUM_CHILD_IOV) : 2502 BDEV_IO_NUM_CHILD_IOV; 2503 io_boundary = bdev->split_on_optimal_io_boundary ? 
io_boundary : UINT32_MAX; 2504 2505 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2506 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2507 parent_offset = bdev_io->u.bdev.offset_blocks; 2508 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2509 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2510 2511 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2512 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2513 if (parent_iov_offset < parent_iov->iov_len) { 2514 break; 2515 } 2516 parent_iov_offset -= parent_iov->iov_len; 2517 } 2518 2519 child_iovcnt = 0; 2520 while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 2521 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2522 to_next_boundary = spdk_min(remaining, to_next_boundary); 2523 to_next_boundary_bytes = to_next_boundary * blocklen; 2524 2525 iov = &bdev_io->child_iov[child_iovcnt]; 2526 iovcnt = 0; 2527 2528 if (bdev_io->u.bdev.md_buf) { 2529 md_buf = (char *)bdev_io->u.bdev.md_buf + 2530 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2531 } 2532 2533 child_iovsize = spdk_min(BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2534 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2535 iovcnt < child_iovsize) { 2536 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2537 iov_len = parent_iov->iov_len - parent_iov_offset; 2538 2539 iov_len = spdk_min(iov_len, max_segment_size); 2540 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2541 to_next_boundary_bytes -= iov_len; 2542 2543 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2544 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2545 2546 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2547 parent_iov_offset += iov_len; 2548 } else { 2549 parent_iovpos++; 2550 parent_iov_offset = 0; 2551 } 2552 child_iovcnt++; 2553 iovcnt++; 2554 } 2555 2556 if (to_next_boundary_bytes > 0) { 2557 /* We had to stop this child I/O early because we ran out of 2558 * child_iov space or were limited by max_num_segments. 2559 * Ensure the iovs to be aligned with block size and 2560 * then adjust to_next_boundary before starting the 2561 * child I/O. 2562 */ 2563 assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV || 2564 iovcnt == child_iovsize); 2565 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2566 if (to_last_block_bytes != 0) { 2567 uint32_t child_iovpos = child_iovcnt - 1; 2568 /* don't decrease child_iovcnt when it equals to BDEV_IO_NUM_CHILD_IOV 2569 * so the loop will naturally end 2570 */ 2571 2572 to_last_block_bytes = blocklen - to_last_block_bytes; 2573 to_next_boundary_bytes += to_last_block_bytes; 2574 while (to_last_block_bytes > 0 && iovcnt > 0) { 2575 iov_len = spdk_min(to_last_block_bytes, 2576 bdev_io->child_iov[child_iovpos].iov_len); 2577 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2578 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2579 child_iovpos--; 2580 if (--iovcnt == 0) { 2581 /* If the child IO is less than a block size just return. 2582 * If the first child IO of any split round is less than 2583 * a block size, an error exit. 
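 * ("error exit" meaning the parent I/O is failed: not even one block-aligned
 * child iov could be built from what remains.)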
2584 */
2585 if (bdev_io->u.bdev.split_outstanding == 0) {
2586 SPDK_ERRLOG("The first child io was less than a block size\n");
2587 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2588 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
2589 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
2590 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2591 }
2592
2593 return;
2594 }
2595 }
2596
2597 to_last_block_bytes -= iov_len;
2598
2599 if (parent_iov_offset == 0) {
2600 parent_iovpos--;
2601 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
2602 }
2603 parent_iov_offset -= iov_len;
2604 }
2605
2606 assert(to_last_block_bytes == 0);
2607 }
2608 to_next_boundary -= to_next_boundary_bytes / blocklen;
2609 }
2610
2611 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
2612 &current_offset, &remaining);
2613 if (spdk_unlikely(rc)) {
2614 return;
2615 }
2616 }
2617 }
2618
2619 static void
2620 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
2621 {
2622 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
2623 uint32_t num_children_reqs = 0;
2624 int rc;
2625
2626 offset = bdev_io->u.bdev.split_current_offset_blocks;
2627 remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2628 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
2629
2630 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
2631 unmap_blocks = spdk_min(remaining, max_unmap_blocks);
2632
2633 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
2634 &offset, &remaining);
2635 if (spdk_likely(rc == 0)) {
2636 num_children_reqs++;
2637 } else {
2638 return;
2639 }
2640 }
2641 }
2642
2643 static void
2644 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
2645 {
2646 uint64_t offset, write_zeroes_blocks, remaining;
2647 uint32_t num_children_reqs = 0;
2648 int rc;
2649
2650 offset = bdev_io->u.bdev.split_current_offset_blocks;
2651 remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2652
2653 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
2654 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
2655
2656 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
2657 &offset, &remaining);
2658 if (spdk_likely(rc == 0)) {
2659 num_children_reqs++;
2660 } else {
2661 return;
2662 }
2663 }
2664 }
2665
2666 static void
2667 parent_bdev_io_complete(void *ctx, int rc)
2668 {
2669 struct spdk_bdev_io *parent_io = ctx;
2670
2671 if (rc) {
2672 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2673 }
2674
2675 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
2676 parent_io->internal.caller_ctx);
2677 }
2678
2679 static void
2680 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
2681 {
2682 struct spdk_bdev_io *parent_io = cb_arg;
2683
2684 spdk_bdev_free_io(bdev_io);
2685
2686 if (!success) {
2687 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2688 /* If any child I/O failed, stop further splitting process. */
2689 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks;
2690 parent_io->u.bdev.split_remaining_num_blocks = 0;
2691 }
2692 parent_io->u.bdev.split_outstanding--;
2693 if (parent_io->u.bdev.split_outstanding != 0) {
2694 return;
2695 }
2696
2697 /*
2698 * Parent I/O finishes when all blocks are consumed.
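 * (i.e. split_remaining_num_blocks has reached zero and, per the check above,
 * no child I/O is still outstanding).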
2699 */ 2700 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2701 assert(parent_io->internal.cb != bdev_io_split_done); 2702 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2703 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2704 2705 if (parent_io->internal.orig_iovcnt != 0) { 2706 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 2707 /* bdev IO will be completed in the callback */ 2708 } else { 2709 parent_bdev_io_complete(parent_io, 0); 2710 } 2711 return; 2712 } 2713 2714 /* 2715 * Continue with the splitting process. This function will complete the parent I/O if the 2716 * splitting is done. 2717 */ 2718 switch (parent_io->type) { 2719 case SPDK_BDEV_IO_TYPE_READ: 2720 case SPDK_BDEV_IO_TYPE_WRITE: 2721 _bdev_rw_split(parent_io); 2722 break; 2723 case SPDK_BDEV_IO_TYPE_UNMAP: 2724 bdev_unmap_split(parent_io); 2725 break; 2726 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2727 bdev_write_zeroes_split(parent_io); 2728 break; 2729 default: 2730 assert(false); 2731 break; 2732 } 2733 } 2734 2735 static void 2736 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success); 2737 2738 static void 2739 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2740 { 2741 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2742 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2743 bdev_io->u.bdev.split_outstanding = 0; 2744 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2745 2746 switch (bdev_io->type) { 2747 case SPDK_BDEV_IO_TYPE_READ: 2748 case SPDK_BDEV_IO_TYPE_WRITE: 2749 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2750 _bdev_rw_split(bdev_io); 2751 } else { 2752 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2753 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2754 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2755 } 2756 break; 2757 case SPDK_BDEV_IO_TYPE_UNMAP: 2758 bdev_unmap_split(bdev_io); 2759 break; 2760 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2761 bdev_write_zeroes_split(bdev_io); 2762 break; 2763 default: 2764 assert(false); 2765 break; 2766 } 2767 } 2768 2769 static void 2770 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2771 { 2772 if (!success) { 2773 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2774 return; 2775 } 2776 2777 _bdev_rw_split(bdev_io); 2778 } 2779 2780 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 2781 * be inlined, at least on some compilers. 
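 * (For example, it is passed as the spdk_msg_fn to spdk_thread_send_msg() when
 * an I/O has to hop over to the QoS thread in bdev_io_submit().)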
2782 */ 2783 static inline void 2784 _bdev_io_submit(void *ctx) 2785 { 2786 struct spdk_bdev_io *bdev_io = ctx; 2787 struct spdk_bdev *bdev = bdev_io->bdev; 2788 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2789 uint64_t tsc; 2790 2791 tsc = spdk_get_ticks(); 2792 bdev_io->internal.submit_tsc = tsc; 2793 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, 2794 (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 2795 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks); 2796 2797 if (spdk_likely(bdev_ch->flags == 0)) { 2798 bdev_io_do_submit(bdev_ch, bdev_io); 2799 return; 2800 } 2801 2802 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2803 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2804 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2805 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2806 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2807 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2808 } else { 2809 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2810 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2811 } 2812 } else { 2813 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2814 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2815 } 2816 } 2817 2818 bool 2819 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2820 2821 bool 2822 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2823 { 2824 if (range1->length == 0 || range2->length == 0) { 2825 return false; 2826 } 2827 2828 if (range1->offset + range1->length <= range2->offset) { 2829 return false; 2830 } 2831 2832 if (range2->offset + range2->length <= range1->offset) { 2833 return false; 2834 } 2835 2836 return true; 2837 } 2838 2839 static bool 2840 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2841 { 2842 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2843 struct lba_range r; 2844 2845 switch (bdev_io->type) { 2846 case SPDK_BDEV_IO_TYPE_NVME_IO: 2847 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2848 /* Don't try to decode the NVMe command - just assume worst-case and that 2849 * it overlaps a locked range. 2850 */ 2851 return true; 2852 case SPDK_BDEV_IO_TYPE_WRITE: 2853 case SPDK_BDEV_IO_TYPE_UNMAP: 2854 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2855 case SPDK_BDEV_IO_TYPE_ZCOPY: 2856 r.offset = bdev_io->u.bdev.offset_blocks; 2857 r.length = bdev_io->u.bdev.num_blocks; 2858 if (!bdev_lba_range_overlapped(range, &r)) { 2859 /* This I/O doesn't overlap the specified LBA range. */ 2860 return false; 2861 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2862 /* This I/O overlaps, but the I/O is on the same channel that locked this 2863 * range, and the caller_ctx is the same as the locked_ctx. This means 2864 * that this I/O is associated with the lock, and is allowed to execute. 
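 * Any other write-class I/O overlapping the range is reported as locked and is
 * parked on the channel's io_locked list until the range is unlocked.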
2865 */ 2866 return false; 2867 } else { 2868 return true; 2869 } 2870 default: 2871 return false; 2872 } 2873 } 2874 2875 void 2876 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2877 { 2878 struct spdk_bdev *bdev = bdev_io->bdev; 2879 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2880 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2881 2882 assert(thread != NULL); 2883 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2884 2885 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2886 struct lba_range *range; 2887 2888 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2889 if (bdev_io_range_is_locked(bdev_io, range)) { 2890 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2891 return; 2892 } 2893 } 2894 } 2895 2896 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2897 2898 if (bdev_io_should_split(bdev_io)) { 2899 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2900 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2901 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 2902 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks); 2903 bdev_io_split(NULL, bdev_io); 2904 return; 2905 } 2906 2907 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2908 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2909 _bdev_io_submit(bdev_io); 2910 } else { 2911 bdev_io->internal.io_submit_ch = ch; 2912 bdev_io->internal.ch = bdev->internal.qos->ch; 2913 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2914 } 2915 } else { 2916 _bdev_io_submit(bdev_io); 2917 } 2918 } 2919 2920 static inline void 2921 _bdev_io_copy_ext_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts) 2922 { 2923 struct spdk_bdev_ext_io_opts *opts_copy = &bdev_io->internal.ext_opts_copy; 2924 2925 /* Zero part we don't copy */ 2926 memset(((char *)opts_copy) + opts->size, 0, sizeof(*opts) - opts->size); 2927 memcpy(opts_copy, opts, opts->size); 2928 opts_copy->size = sizeof(*opts_copy); 2929 opts_copy->metadata = bdev_io->u.bdev.md_buf; 2930 /* Save pointer to the copied ext_opts which will be used by bdev modules */ 2931 bdev_io->u.bdev.ext_opts = opts_copy; 2932 } 2933 2934 static inline void 2935 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 2936 { 2937 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 2938 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 2939 * For write operation we need to pull buffers from memory domain before submitting IO. 
2940 * Once read operation completes, we need to use memory_domain push functionality to 2941 * update data in original memory domain IO buffer 2942 * This IO request will go through a regular IO flow, so clear memory domains pointers in 2943 * the copied ext_opts */ 2944 bdev_io->internal.ext_opts_copy.memory_domain = NULL; 2945 bdev_io->internal.ext_opts_copy.memory_domain_ctx = NULL; 2946 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 2947 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2948 } 2949 2950 static inline void 2951 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io, 2952 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 2953 { 2954 if (opts) { 2955 bool use_pull_push = opts->memory_domain && !desc->memory_domains_supported; 2956 assert(opts->size <= sizeof(*opts)); 2957 /* 2958 * copy if size is smaller than opts struct to avoid having to check size 2959 * on every access to bdev_io->u.bdev.ext_opts 2960 */ 2961 if (copy_opts || use_pull_push || opts->size < sizeof(*opts)) { 2962 _bdev_io_copy_ext_opts(bdev_io, opts); 2963 if (use_pull_push) { 2964 _bdev_io_ext_use_bounce_buffer(bdev_io); 2965 return; 2966 } 2967 } 2968 } 2969 bdev_io_submit(bdev_io); 2970 } 2971 2972 static void 2973 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 2974 { 2975 struct spdk_bdev *bdev = bdev_io->bdev; 2976 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2977 struct spdk_io_channel *ch = bdev_ch->channel; 2978 2979 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2980 2981 bdev_io->internal.in_submit_request = true; 2982 bdev->fn_table->submit_request(ch, bdev_io); 2983 bdev_io->internal.in_submit_request = false; 2984 } 2985 2986 void 2987 bdev_io_init(struct spdk_bdev_io *bdev_io, 2988 struct spdk_bdev *bdev, void *cb_arg, 2989 spdk_bdev_io_completion_cb cb) 2990 { 2991 bdev_io->bdev = bdev; 2992 bdev_io->internal.caller_ctx = cb_arg; 2993 bdev_io->internal.cb = cb; 2994 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 2995 bdev_io->internal.in_submit_request = false; 2996 bdev_io->internal.buf = NULL; 2997 bdev_io->internal.io_submit_ch = NULL; 2998 bdev_io->internal.orig_iovs = NULL; 2999 bdev_io->internal.orig_iovcnt = 0; 3000 bdev_io->internal.orig_md_iov.iov_base = NULL; 3001 bdev_io->internal.error.nvme.cdw0 = 0; 3002 bdev_io->num_retries = 0; 3003 bdev_io->internal.get_buf_cb = NULL; 3004 bdev_io->internal.get_aux_buf_cb = NULL; 3005 bdev_io->internal.ext_opts = NULL; 3006 bdev_io->internal.data_transfer_cpl = NULL; 3007 } 3008 3009 static bool 3010 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3011 { 3012 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3013 } 3014 3015 bool 3016 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3017 { 3018 bool supported; 3019 3020 supported = bdev_io_type_supported(bdev, io_type); 3021 3022 if (!supported) { 3023 switch (io_type) { 3024 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3025 /* The bdev layer will emulate write zeroes as long as write is supported. 
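 * (The emulation submits regular writes of an internal zero buffer on the
 * module's behalf.)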
*/ 3026 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3027 break; 3028 default: 3029 break; 3030 } 3031 } 3032 3033 return supported; 3034 } 3035 3036 int 3037 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3038 { 3039 if (bdev->fn_table->dump_info_json) { 3040 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3041 } 3042 3043 return 0; 3044 } 3045 3046 static void 3047 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3048 { 3049 uint32_t max_per_timeslice = 0; 3050 int i; 3051 3052 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3053 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3054 qos->rate_limits[i].max_per_timeslice = 0; 3055 continue; 3056 } 3057 3058 max_per_timeslice = qos->rate_limits[i].limit * 3059 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3060 3061 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3062 qos->rate_limits[i].min_per_timeslice); 3063 3064 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3065 } 3066 3067 bdev_qos_set_ops(qos); 3068 } 3069 3070 static int 3071 bdev_channel_poll_qos(void *arg) 3072 { 3073 struct spdk_bdev_qos *qos = arg; 3074 uint64_t now = spdk_get_ticks(); 3075 int i; 3076 3077 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3078 /* We received our callback earlier than expected - return 3079 * immediately and wait to do accounting until at least one 3080 * timeslice has actually expired. This should never happen 3081 * with a well-behaved timer implementation. 3082 */ 3083 return SPDK_POLLER_IDLE; 3084 } 3085 3086 /* Reset for next round of rate limiting */ 3087 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3088 /* We may have allowed the IOs or bytes to slightly overrun in the last 3089 * timeslice. remaining_this_timeslice is signed, so if it's negative 3090 * here, we'll account for the overrun so that the next timeslice will 3091 * be appropriately reduced. 3092 */ 3093 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3094 qos->rate_limits[i].remaining_this_timeslice = 0; 3095 } 3096 } 3097 3098 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3099 qos->last_timeslice += qos->timeslice_size; 3100 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3101 qos->rate_limits[i].remaining_this_timeslice += 3102 qos->rate_limits[i].max_per_timeslice; 3103 } 3104 } 3105 3106 return bdev_qos_io_submit(qos->ch, qos); 3107 } 3108 3109 static void 3110 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3111 { 3112 struct spdk_bdev_shared_resource *shared_resource; 3113 struct lba_range *range; 3114 3115 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3116 range = TAILQ_FIRST(&ch->locked_ranges); 3117 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3118 free(range); 3119 } 3120 3121 spdk_put_io_channel(ch->channel); 3122 3123 shared_resource = ch->shared_resource; 3124 3125 assert(TAILQ_EMPTY(&ch->io_locked)); 3126 assert(TAILQ_EMPTY(&ch->io_submitted)); 3127 assert(ch->io_outstanding == 0); 3128 assert(shared_resource->ref > 0); 3129 shared_resource->ref--; 3130 if (shared_resource->ref == 0) { 3131 assert(shared_resource->io_outstanding == 0); 3132 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3133 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3134 free(shared_resource); 3135 } 3136 } 3137 3138 /* Caller must hold bdev->internal.mutex. 
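 * bdev->internal.qos is shared by every channel of this bdev and may be set up
 * here by the first channel that sees QoS enabled.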
*/ 3139 static void 3140 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3141 { 3142 struct spdk_bdev_qos *qos = bdev->internal.qos; 3143 int i; 3144 3145 /* Rate limiting on this bdev enabled */ 3146 if (qos) { 3147 if (qos->ch == NULL) { 3148 struct spdk_io_channel *io_ch; 3149 3150 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3151 bdev->name, spdk_get_thread()); 3152 3153 /* No qos channel has been selected, so set one up */ 3154 3155 /* Take another reference to ch */ 3156 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3157 assert(io_ch != NULL); 3158 qos->ch = ch; 3159 3160 qos->thread = spdk_io_channel_get_thread(io_ch); 3161 3162 TAILQ_INIT(&qos->queued); 3163 3164 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3165 if (bdev_qos_is_iops_rate_limit(i) == true) { 3166 qos->rate_limits[i].min_per_timeslice = 3167 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3168 } else { 3169 qos->rate_limits[i].min_per_timeslice = 3170 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3171 } 3172 3173 if (qos->rate_limits[i].limit == 0) { 3174 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3175 } 3176 } 3177 bdev_qos_update_max_quota_per_timeslice(qos); 3178 qos->timeslice_size = 3179 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3180 qos->last_timeslice = spdk_get_ticks(); 3181 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3182 qos, 3183 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3184 } 3185 3186 ch->flags |= BDEV_CH_QOS_ENABLED; 3187 } 3188 } 3189 3190 struct poll_timeout_ctx { 3191 struct spdk_bdev_desc *desc; 3192 uint64_t timeout_in_sec; 3193 spdk_bdev_io_timeout_cb cb_fn; 3194 void *cb_arg; 3195 }; 3196 3197 static void 3198 bdev_desc_free(struct spdk_bdev_desc *desc) 3199 { 3200 pthread_mutex_destroy(&desc->mutex); 3201 free(desc->media_events_buffer); 3202 free(desc); 3203 } 3204 3205 static void 3206 bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status) 3207 { 3208 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3209 struct spdk_bdev_desc *desc = ctx->desc; 3210 3211 free(ctx); 3212 3213 pthread_mutex_lock(&desc->mutex); 3214 desc->refs--; 3215 if (desc->closed == true && desc->refs == 0) { 3216 pthread_mutex_unlock(&desc->mutex); 3217 bdev_desc_free(desc); 3218 return; 3219 } 3220 pthread_mutex_unlock(&desc->mutex); 3221 } 3222 3223 static void 3224 bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i) 3225 { 3226 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3227 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 3228 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 3229 struct spdk_bdev_desc *desc = ctx->desc; 3230 struct spdk_bdev_io *bdev_io; 3231 uint64_t now; 3232 3233 pthread_mutex_lock(&desc->mutex); 3234 if (desc->closed == true) { 3235 pthread_mutex_unlock(&desc->mutex); 3236 spdk_for_each_channel_continue(i, -1); 3237 return; 3238 } 3239 pthread_mutex_unlock(&desc->mutex); 3240 3241 now = spdk_get_ticks(); 3242 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3243 /* Exclude any I/O that are generated via splitting. */ 3244 if (bdev_io->internal.cb == bdev_io_split_done) { 3245 continue; 3246 } 3247 3248 /* Once we find an I/O that has not timed out, we can immediately 3249 * exit the loop. 
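 * io_submitted is kept in submission order, so every I/O after this one was
 * submitted later and cannot have exceeded the timeout either.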
3250 */ 3251 if (now < (bdev_io->internal.submit_tsc + 3252 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3253 goto end; 3254 } 3255 3256 if (bdev_io->internal.desc == desc) { 3257 ctx->cb_fn(ctx->cb_arg, bdev_io); 3258 } 3259 } 3260 3261 end: 3262 spdk_for_each_channel_continue(i, 0); 3263 } 3264 3265 static int 3266 bdev_poll_timeout_io(void *arg) 3267 { 3268 struct spdk_bdev_desc *desc = arg; 3269 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3270 struct poll_timeout_ctx *ctx; 3271 3272 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3273 if (!ctx) { 3274 SPDK_ERRLOG("failed to allocate memory\n"); 3275 return SPDK_POLLER_BUSY; 3276 } 3277 ctx->desc = desc; 3278 ctx->cb_arg = desc->cb_arg; 3279 ctx->cb_fn = desc->cb_fn; 3280 ctx->timeout_in_sec = desc->timeout_in_sec; 3281 3282 /* Take a ref on the descriptor in case it gets closed while we are checking 3283 * all of the channels. 3284 */ 3285 pthread_mutex_lock(&desc->mutex); 3286 desc->refs++; 3287 pthread_mutex_unlock(&desc->mutex); 3288 3289 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3290 bdev_channel_poll_timeout_io, 3291 ctx, 3292 bdev_channel_poll_timeout_io_done); 3293 3294 return SPDK_POLLER_BUSY; 3295 } 3296 3297 int 3298 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3299 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3300 { 3301 assert(desc->thread == spdk_get_thread()); 3302 3303 spdk_poller_unregister(&desc->io_timeout_poller); 3304 3305 if (timeout_in_sec) { 3306 assert(cb_fn != NULL); 3307 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3308 desc, 3309 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3310 1000); 3311 if (desc->io_timeout_poller == NULL) { 3312 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3313 return -1; 3314 } 3315 } 3316 3317 desc->cb_fn = cb_fn; 3318 desc->cb_arg = cb_arg; 3319 desc->timeout_in_sec = timeout_in_sec; 3320 3321 return 0; 3322 } 3323 3324 static int 3325 bdev_channel_create(void *io_device, void *ctx_buf) 3326 { 3327 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3328 struct spdk_bdev_channel *ch = ctx_buf; 3329 struct spdk_io_channel *mgmt_io_ch; 3330 struct spdk_bdev_mgmt_channel *mgmt_ch; 3331 struct spdk_bdev_shared_resource *shared_resource; 3332 struct lba_range *range; 3333 3334 ch->bdev = bdev; 3335 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3336 if (!ch->channel) { 3337 return -1; 3338 } 3339 3340 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3341 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3342 3343 assert(ch->histogram == NULL); 3344 if (bdev->internal.histogram_enabled) { 3345 ch->histogram = spdk_histogram_data_alloc(); 3346 if (ch->histogram == NULL) { 3347 SPDK_ERRLOG("Could not allocate histogram\n"); 3348 } 3349 } 3350 3351 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3352 if (!mgmt_io_ch) { 3353 spdk_put_io_channel(ch->channel); 3354 return -1; 3355 } 3356 3357 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 3358 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3359 if (shared_resource->shared_ch == ch->channel) { 3360 spdk_put_io_channel(mgmt_io_ch); 3361 shared_resource->ref++; 3362 break; 3363 } 3364 } 3365 3366 if (shared_resource == NULL) { 3367 shared_resource = calloc(1, sizeof(*shared_resource)); 3368 if (shared_resource == NULL) { 3369 spdk_put_io_channel(ch->channel); 3370 spdk_put_io_channel(mgmt_io_ch); 3371 return -1; 3372 } 3373 3374 shared_resource->mgmt_ch = mgmt_ch; 3375 
shared_resource->io_outstanding = 0; 3376 TAILQ_INIT(&shared_resource->nomem_io); 3377 shared_resource->nomem_threshold = 0; 3378 shared_resource->shared_ch = ch->channel; 3379 shared_resource->ref = 1; 3380 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3381 } 3382 3383 memset(&ch->stat, 0, sizeof(ch->stat)); 3384 ch->stat.ticks_rate = spdk_get_ticks_hz(); 3385 ch->io_outstanding = 0; 3386 TAILQ_INIT(&ch->queued_resets); 3387 TAILQ_INIT(&ch->locked_ranges); 3388 ch->flags = 0; 3389 ch->shared_resource = shared_resource; 3390 3391 TAILQ_INIT(&ch->io_submitted); 3392 TAILQ_INIT(&ch->io_locked); 3393 3394 #ifdef SPDK_CONFIG_VTUNE 3395 { 3396 char *name; 3397 __itt_init_ittlib(NULL, 0); 3398 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3399 if (!name) { 3400 bdev_channel_destroy_resource(ch); 3401 return -1; 3402 } 3403 ch->handle = __itt_string_handle_create(name); 3404 free(name); 3405 ch->start_tsc = spdk_get_ticks(); 3406 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3407 memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); 3408 } 3409 #endif 3410 3411 pthread_mutex_lock(&bdev->internal.mutex); 3412 bdev_enable_qos(bdev, ch); 3413 3414 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3415 struct lba_range *new_range; 3416 3417 new_range = calloc(1, sizeof(*new_range)); 3418 if (new_range == NULL) { 3419 pthread_mutex_unlock(&bdev->internal.mutex); 3420 bdev_channel_destroy_resource(ch); 3421 return -1; 3422 } 3423 new_range->length = range->length; 3424 new_range->offset = range->offset; 3425 new_range->locked_ctx = range->locked_ctx; 3426 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3427 } 3428 3429 pthread_mutex_unlock(&bdev->internal.mutex); 3430 3431 return 0; 3432 } 3433 3434 /* 3435 * Abort I/O that are waiting on a data buffer. These types of I/O are 3436 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 3437 */ 3438 static void 3439 bdev_abort_all_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 3440 { 3441 bdev_io_stailq_t tmp; 3442 struct spdk_bdev_io *bdev_io; 3443 3444 STAILQ_INIT(&tmp); 3445 3446 while (!STAILQ_EMPTY(queue)) { 3447 bdev_io = STAILQ_FIRST(queue); 3448 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 3449 if (bdev_io->internal.ch == ch) { 3450 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3451 } else { 3452 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 3453 } 3454 } 3455 3456 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 3457 } 3458 3459 /* 3460 * Abort I/O that are queued waiting for submission. These types of I/O are 3461 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3462 */ 3463 static void 3464 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3465 { 3466 struct spdk_bdev_io *bdev_io, *tmp; 3467 3468 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3469 if (bdev_io->internal.ch == ch) { 3470 TAILQ_REMOVE(queue, bdev_io, internal.link); 3471 /* 3472 * spdk_bdev_io_complete() assumes that the completed I/O had 3473 * been submitted to the bdev module. Since in this case it 3474 * hadn't, bump io_outstanding to account for the decrement 3475 * that spdk_bdev_io_complete() will do. 
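 * Resets are not counted in io_outstanding, hence the type check below.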
3476 */ 3477 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3478 ch->io_outstanding++; 3479 ch->shared_resource->io_outstanding++; 3480 } 3481 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3482 } 3483 } 3484 } 3485 3486 static bool 3487 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3488 { 3489 struct spdk_bdev_io *bdev_io; 3490 3491 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3492 if (bdev_io == bio_to_abort) { 3493 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3494 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3495 return true; 3496 } 3497 } 3498 3499 return false; 3500 } 3501 3502 static bool 3503 bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3504 { 3505 struct spdk_bdev_io *bdev_io; 3506 3507 STAILQ_FOREACH(bdev_io, queue, internal.buf_link) { 3508 if (bdev_io == bio_to_abort) { 3509 STAILQ_REMOVE(queue, bio_to_abort, spdk_bdev_io, internal.buf_link); 3510 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3511 return true; 3512 } 3513 } 3514 3515 return false; 3516 } 3517 3518 static void 3519 bdev_qos_channel_destroy(void *cb_arg) 3520 { 3521 struct spdk_bdev_qos *qos = cb_arg; 3522 3523 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3524 spdk_poller_unregister(&qos->poller); 3525 3526 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3527 3528 free(qos); 3529 } 3530 3531 static int 3532 bdev_qos_destroy(struct spdk_bdev *bdev) 3533 { 3534 int i; 3535 3536 /* 3537 * Cleanly shutting down the QoS poller is tricky, because 3538 * during the asynchronous operation the user could open 3539 * a new descriptor and create a new channel, spawning 3540 * a new QoS poller. 3541 * 3542 * The strategy is to create a new QoS structure here and swap it 3543 * in. The shutdown path then continues to refer to the old one 3544 * until it completes and then releases it. 3545 */ 3546 struct spdk_bdev_qos *new_qos, *old_qos; 3547 3548 old_qos = bdev->internal.qos; 3549 3550 new_qos = calloc(1, sizeof(*new_qos)); 3551 if (!new_qos) { 3552 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3553 return -ENOMEM; 3554 } 3555 3556 /* Copy the old QoS data into the newly allocated structure */ 3557 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3558 3559 /* Zero out the key parts of the QoS structure */ 3560 new_qos->ch = NULL; 3561 new_qos->thread = NULL; 3562 new_qos->poller = NULL; 3563 TAILQ_INIT(&new_qos->queued); 3564 /* 3565 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3566 * It will be used later for the new QoS structure. 3567 */ 3568 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3569 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3570 new_qos->rate_limits[i].min_per_timeslice = 0; 3571 new_qos->rate_limits[i].max_per_timeslice = 0; 3572 } 3573 3574 bdev->internal.qos = new_qos; 3575 3576 if (old_qos->thread == NULL) { 3577 free(old_qos); 3578 } else { 3579 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3580 } 3581 3582 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3583 * been destroyed yet. The destruction path will end up waiting for the final 3584 * channel to be put before it releases resources. 
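 * (bdev_qos_channel_destroy runs on old_qos->thread and drops the QoS channel
 * reference that was taken in bdev_enable_qos.)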
*/ 3585 3586 return 0; 3587 } 3588 3589 static void 3590 bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3591 { 3592 total->bytes_read += add->bytes_read; 3593 total->num_read_ops += add->num_read_ops; 3594 total->bytes_written += add->bytes_written; 3595 total->num_write_ops += add->num_write_ops; 3596 total->bytes_unmapped += add->bytes_unmapped; 3597 total->num_unmap_ops += add->num_unmap_ops; 3598 total->read_latency_ticks += add->read_latency_ticks; 3599 total->write_latency_ticks += add->write_latency_ticks; 3600 total->unmap_latency_ticks += add->unmap_latency_ticks; 3601 } 3602 3603 static void 3604 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 3605 { 3606 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3607 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 3608 3609 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3610 bdev_abort_all_buf_io(&mgmt_ch->need_buf_small, ch); 3611 bdev_abort_all_buf_io(&mgmt_ch->need_buf_large, ch); 3612 } 3613 3614 static void 3615 bdev_channel_destroy(void *io_device, void *ctx_buf) 3616 { 3617 struct spdk_bdev_channel *ch = ctx_buf; 3618 3619 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3620 spdk_get_thread()); 3621 3622 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 3623 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3624 3625 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3626 pthread_mutex_lock(&ch->bdev->internal.mutex); 3627 bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); 3628 pthread_mutex_unlock(&ch->bdev->internal.mutex); 3629 3630 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3631 3632 bdev_channel_abort_queued_ios(ch); 3633 3634 if (ch->histogram) { 3635 spdk_histogram_data_free(ch->histogram); 3636 } 3637 3638 bdev_channel_destroy_resource(ch); 3639 } 3640 3641 /* 3642 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 3643 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
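 * The insertion and the duplicate check therefore happen atomically under
 * g_bdev_mgr.mutex.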
3644 */ 3645 static int 3646 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3647 { 3648 struct spdk_bdev_name *tmp; 3649 3650 bdev_name->name = strdup(name); 3651 if (bdev_name->name == NULL) { 3652 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3653 return -ENOMEM; 3654 } 3655 3656 bdev_name->bdev = bdev; 3657 3658 pthread_mutex_lock(&g_bdev_mgr.mutex); 3659 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3660 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3661 3662 if (tmp != NULL) { 3663 SPDK_ERRLOG("Bdev name %s already exists\n", name); 3664 free(bdev_name->name); 3665 return -EEXIST; 3666 } 3667 3668 return 0; 3669 } 3670 3671 static void 3672 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 3673 { 3674 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3675 free(bdev_name->name); 3676 } 3677 3678 static void 3679 bdev_name_del(struct spdk_bdev_name *bdev_name) 3680 { 3681 pthread_mutex_lock(&g_bdev_mgr.mutex); 3682 bdev_name_del_unsafe(bdev_name); 3683 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3684 } 3685 3686 int 3687 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 3688 { 3689 struct spdk_bdev_alias *tmp; 3690 int ret; 3691 3692 if (alias == NULL) { 3693 SPDK_ERRLOG("Empty alias passed\n"); 3694 return -EINVAL; 3695 } 3696 3697 tmp = calloc(1, sizeof(*tmp)); 3698 if (tmp == NULL) { 3699 SPDK_ERRLOG("Unable to allocate alias\n"); 3700 return -ENOMEM; 3701 } 3702 3703 ret = bdev_name_add(&tmp->alias, bdev, alias); 3704 if (ret != 0) { 3705 free(tmp); 3706 return ret; 3707 } 3708 3709 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 3710 3711 return 0; 3712 } 3713 3714 static int 3715 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 3716 void (*alias_del_fn)(struct spdk_bdev_name *n)) 3717 { 3718 struct spdk_bdev_alias *tmp; 3719 3720 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 3721 if (strcmp(alias, tmp->alias.name) == 0) { 3722 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 3723 alias_del_fn(&tmp->alias); 3724 free(tmp); 3725 return 0; 3726 } 3727 } 3728 3729 return -ENOENT; 3730 } 3731 3732 int 3733 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 3734 { 3735 int rc; 3736 3737 rc = bdev_alias_del(bdev, alias, bdev_name_del); 3738 if (rc == -ENOENT) { 3739 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 3740 } 3741 3742 return rc; 3743 } 3744 3745 void 3746 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 3747 { 3748 struct spdk_bdev_alias *p, *tmp; 3749 3750 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 3751 TAILQ_REMOVE(&bdev->aliases, p, tailq); 3752 bdev_name_del(&p->alias); 3753 free(p); 3754 } 3755 } 3756 3757 struct spdk_io_channel * 3758 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 3759 { 3760 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 3761 } 3762 3763 void * 3764 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 3765 { 3766 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3767 void *ctx = NULL; 3768 3769 if (bdev->fn_table->get_module_ctx) { 3770 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 3771 } 3772 3773 return ctx; 3774 } 3775 3776 const char * 3777 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 3778 { 3779 return bdev->module->name; 3780 } 3781 3782 const char * 3783 spdk_bdev_get_name(const struct spdk_bdev *bdev) 3784 { 3785 return bdev->name; 3786 } 3787 3788 const char * 3789 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 3790 { 3791 return bdev->product_name; 3792 } 3793 
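/*
 * Illustrative sketch, not part of the upstream implementation: how a consumer
 * of the accessors in this file might dump a bdev's basic geometry. The
 * function name is hypothetical and it assumes "desc" was opened beforehand
 * (e.g. with spdk_bdev_open_ext()); kept under #if 0 so it does not affect the
 * build.
 */
#if 0
static void
example_dump_bdev_geometry(struct spdk_bdev_desc *desc)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);

	/* Logical block count and size of the device. */
	printf("%s: %" PRIu64 " blocks of %" PRIu32 " bytes",
	       spdk_bdev_get_name(bdev),
	       spdk_bdev_get_num_blocks(bdev),
	       spdk_bdev_get_block_size(bdev));

	/* Data buffers passed to read/write must honor this alignment. */
	printf(", required buffer alignment %zu bytes\n",
	       spdk_bdev_get_buf_align(bdev));
}
#endif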
3794 const struct spdk_bdev_aliases_list * 3795 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 3796 { 3797 return &bdev->aliases; 3798 } 3799 3800 uint32_t 3801 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 3802 { 3803 return bdev->blocklen; 3804 } 3805 3806 uint32_t 3807 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 3808 { 3809 return bdev->write_unit_size; 3810 } 3811 3812 uint64_t 3813 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 3814 { 3815 return bdev->blockcnt; 3816 } 3817 3818 const char * 3819 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 3820 { 3821 return qos_rpc_type[type]; 3822 } 3823 3824 void 3825 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 3826 { 3827 int i; 3828 3829 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 3830 3831 pthread_mutex_lock(&bdev->internal.mutex); 3832 if (bdev->internal.qos) { 3833 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3834 if (bdev->internal.qos->rate_limits[i].limit != 3835 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3836 limits[i] = bdev->internal.qos->rate_limits[i].limit; 3837 if (bdev_qos_is_iops_rate_limit(i) == false) { 3838 /* Change from Byte to Megabyte which is user visible. */ 3839 limits[i] = limits[i] / 1024 / 1024; 3840 } 3841 } 3842 } 3843 } 3844 pthread_mutex_unlock(&bdev->internal.mutex); 3845 } 3846 3847 size_t 3848 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 3849 { 3850 return 1 << bdev->required_alignment; 3851 } 3852 3853 uint32_t 3854 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 3855 { 3856 return bdev->optimal_io_boundary; 3857 } 3858 3859 bool 3860 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 3861 { 3862 return bdev->write_cache; 3863 } 3864 3865 const struct spdk_uuid * 3866 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 3867 { 3868 return &bdev->uuid; 3869 } 3870 3871 uint16_t 3872 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 3873 { 3874 return bdev->acwu; 3875 } 3876 3877 uint32_t 3878 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 3879 { 3880 return bdev->md_len; 3881 } 3882 3883 bool 3884 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 3885 { 3886 return (bdev->md_len != 0) && bdev->md_interleave; 3887 } 3888 3889 bool 3890 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 3891 { 3892 return (bdev->md_len != 0) && !bdev->md_interleave; 3893 } 3894 3895 bool 3896 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 3897 { 3898 return bdev->zoned; 3899 } 3900 3901 uint32_t 3902 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 3903 { 3904 if (spdk_bdev_is_md_interleaved(bdev)) { 3905 return bdev->blocklen - bdev->md_len; 3906 } else { 3907 return bdev->blocklen; 3908 } 3909 } 3910 3911 uint32_t 3912 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 3913 { 3914 return bdev->phys_blocklen; 3915 } 3916 3917 static uint32_t 3918 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 3919 { 3920 if (!spdk_bdev_is_md_interleaved(bdev)) { 3921 return bdev->blocklen + bdev->md_len; 3922 } else { 3923 return bdev->blocklen; 3924 } 3925 } 3926 3927 enum spdk_dif_type spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 3928 { 3929 if (bdev->md_len != 0) { 3930 return bdev->dif_type; 3931 } else { 3932 return SPDK_DIF_DISABLE; 3933 } 3934 } 3935 3936 bool 3937 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 3938 { 3939 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 3940 return bdev->dif_is_head_of_md; 3941 
} else { 3942 return false; 3943 } 3944 } 3945 3946 bool 3947 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 3948 enum spdk_dif_check_type check_type) 3949 { 3950 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 3951 return false; 3952 } 3953 3954 switch (check_type) { 3955 case SPDK_DIF_CHECK_TYPE_REFTAG: 3956 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 3957 case SPDK_DIF_CHECK_TYPE_APPTAG: 3958 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 3959 case SPDK_DIF_CHECK_TYPE_GUARD: 3960 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 3961 default: 3962 return false; 3963 } 3964 } 3965 3966 uint64_t 3967 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 3968 { 3969 return bdev->internal.measured_queue_depth; 3970 } 3971 3972 uint64_t 3973 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 3974 { 3975 return bdev->internal.period; 3976 } 3977 3978 uint64_t 3979 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 3980 { 3981 return bdev->internal.weighted_io_time; 3982 } 3983 3984 uint64_t 3985 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 3986 { 3987 return bdev->internal.io_time; 3988 } 3989 3990 static void 3991 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 3992 { 3993 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3994 3995 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 3996 3997 if (bdev->internal.measured_queue_depth) { 3998 bdev->internal.io_time += bdev->internal.period; 3999 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4000 } 4001 } 4002 4003 static void 4004 _calculate_measured_qd(struct spdk_io_channel_iter *i) 4005 { 4006 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 4007 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 4008 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 4009 4010 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4011 spdk_for_each_channel_continue(i, 0); 4012 } 4013 4014 static int 4015 bdev_calculate_measured_queue_depth(void *ctx) 4016 { 4017 struct spdk_bdev *bdev = ctx; 4018 bdev->internal.temporary_queue_depth = 0; 4019 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 4020 _calculate_measured_qd_cpl); 4021 return SPDK_POLLER_BUSY; 4022 } 4023 4024 void 4025 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4026 { 4027 bdev->internal.period = period; 4028 4029 if (bdev->internal.qd_poller != NULL) { 4030 spdk_poller_unregister(&bdev->internal.qd_poller); 4031 bdev->internal.measured_queue_depth = UINT64_MAX; 4032 } 4033 4034 if (period != 0) { 4035 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, bdev, 4036 period); 4037 } 4038 } 4039 4040 static void 4041 _resize_notify(void *arg) 4042 { 4043 struct spdk_bdev_desc *desc = arg; 4044 4045 pthread_mutex_lock(&desc->mutex); 4046 desc->refs--; 4047 if (!desc->closed) { 4048 pthread_mutex_unlock(&desc->mutex); 4049 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 4050 desc->bdev, 4051 desc->callback.ctx); 4052 return; 4053 } else if (0 == desc->refs) { 4054 /* This descriptor was closed after this resize_notify message was sent. 4055 * spdk_bdev_close() could not free the descriptor since this message was 4056 * in flight, so we free it now using bdev_desc_free(). 
4057 */ 4058 pthread_mutex_unlock(&desc->mutex); 4059 bdev_desc_free(desc); 4060 return; 4061 } 4062 pthread_mutex_unlock(&desc->mutex); 4063 } 4064 4065 int 4066 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4067 { 4068 struct spdk_bdev_desc *desc; 4069 int ret; 4070 4071 if (size == bdev->blockcnt) { 4072 return 0; 4073 } 4074 4075 pthread_mutex_lock(&bdev->internal.mutex); 4076 4077 /* bdev has open descriptors */ 4078 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4079 bdev->blockcnt > size) { 4080 ret = -EBUSY; 4081 } else { 4082 bdev->blockcnt = size; 4083 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4084 pthread_mutex_lock(&desc->mutex); 4085 if (!desc->closed) { 4086 desc->refs++; 4087 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 4088 } 4089 pthread_mutex_unlock(&desc->mutex); 4090 } 4091 ret = 0; 4092 } 4093 4094 pthread_mutex_unlock(&bdev->internal.mutex); 4095 4096 return ret; 4097 } 4098 4099 /* 4100 * Convert I/O offset and length from bytes to blocks. 4101 * 4102 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4103 */ 4104 static uint64_t 4105 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4106 uint64_t num_bytes, uint64_t *num_blocks) 4107 { 4108 uint32_t block_size = bdev->blocklen; 4109 uint8_t shift_cnt; 4110 4111 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 4112 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4113 shift_cnt = spdk_u32log2(block_size); 4114 *offset_blocks = offset_bytes >> shift_cnt; 4115 *num_blocks = num_bytes >> shift_cnt; 4116 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4117 (num_bytes - (*num_blocks << shift_cnt)); 4118 } else { 4119 *offset_blocks = offset_bytes / block_size; 4120 *num_blocks = num_bytes / block_size; 4121 return (offset_bytes % block_size) | (num_bytes % block_size); 4122 } 4123 } 4124 4125 static bool 4126 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4127 { 4128 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4129 * has been an overflow and hence the offset has been wrapped around */ 4130 if (offset_blocks + num_blocks < offset_blocks) { 4131 return false; 4132 } 4133 4134 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4135 if (offset_blocks + num_blocks > bdev->blockcnt) { 4136 return false; 4137 } 4138 4139 return true; 4140 } 4141 4142 static bool 4143 _bdev_io_check_md_buf(const struct iovec *iovs, const void *md_buf) 4144 { 4145 return _is_buf_allocated(iovs) == (md_buf != NULL); 4146 } 4147 4148 static int 4149 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4150 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4151 spdk_bdev_io_completion_cb cb, void *cb_arg) 4152 { 4153 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4154 struct spdk_bdev_io *bdev_io; 4155 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4156 4157 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4158 return -EINVAL; 4159 } 4160 4161 bdev_io = bdev_channel_get_io(channel); 4162 if (!bdev_io) { 4163 return -ENOMEM; 4164 } 4165 4166 bdev_io->internal.ch = channel; 4167 bdev_io->internal.desc = desc; 4168 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4169 bdev_io->u.bdev.iovs = &bdev_io->iov; 4170 bdev_io->u.bdev.iovs[0].iov_base = buf; 4171 
bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4172 bdev_io->u.bdev.iovcnt = 1; 4173 bdev_io->u.bdev.md_buf = md_buf; 4174 bdev_io->u.bdev.num_blocks = num_blocks; 4175 bdev_io->u.bdev.offset_blocks = offset_blocks; 4176 bdev_io->u.bdev.ext_opts = NULL; 4177 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4178 4179 bdev_io_submit(bdev_io); 4180 return 0; 4181 } 4182 4183 int 4184 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4185 void *buf, uint64_t offset, uint64_t nbytes, 4186 spdk_bdev_io_completion_cb cb, void *cb_arg) 4187 { 4188 uint64_t offset_blocks, num_blocks; 4189 4190 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4191 nbytes, &num_blocks) != 0) { 4192 return -EINVAL; 4193 } 4194 4195 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4196 } 4197 4198 int 4199 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4200 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4201 spdk_bdev_io_completion_cb cb, void *cb_arg) 4202 { 4203 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4204 } 4205 4206 int 4207 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4208 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4209 spdk_bdev_io_completion_cb cb, void *cb_arg) 4210 { 4211 struct iovec iov = { 4212 .iov_base = buf, 4213 }; 4214 4215 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4216 return -EINVAL; 4217 } 4218 4219 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4220 return -EINVAL; 4221 } 4222 4223 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4224 cb, cb_arg); 4225 } 4226 4227 int 4228 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4229 struct iovec *iov, int iovcnt, 4230 uint64_t offset, uint64_t nbytes, 4231 spdk_bdev_io_completion_cb cb, void *cb_arg) 4232 { 4233 uint64_t offset_blocks, num_blocks; 4234 4235 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4236 nbytes, &num_blocks) != 0) { 4237 return -EINVAL; 4238 } 4239 4240 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4241 } 4242 4243 static int 4244 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4245 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4246 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 4247 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4248 { 4249 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4250 struct spdk_bdev_io *bdev_io; 4251 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4252 4253 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4254 return -EINVAL; 4255 } 4256 4257 bdev_io = bdev_channel_get_io(channel); 4258 if (!bdev_io) { 4259 return -ENOMEM; 4260 } 4261 4262 bdev_io->internal.ch = channel; 4263 bdev_io->internal.desc = desc; 4264 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4265 bdev_io->u.bdev.iovs = iov; 4266 bdev_io->u.bdev.iovcnt = iovcnt; 4267 bdev_io->u.bdev.md_buf = md_buf; 4268 bdev_io->u.bdev.num_blocks = num_blocks; 4269 bdev_io->u.bdev.offset_blocks = offset_blocks; 4270 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4271 bdev_io->internal.ext_opts = opts; 4272 bdev_io->u.bdev.ext_opts = opts; 4273 4274 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4275 4276 return 0; 4277 } 4278 4279 int 
spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4280 struct iovec *iov, int iovcnt, 4281 uint64_t offset_blocks, uint64_t num_blocks, 4282 spdk_bdev_io_completion_cb cb, void *cb_arg) 4283 { 4284 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4285 num_blocks, cb, cb_arg, NULL, false); 4286 } 4287 4288 int 4289 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4290 struct iovec *iov, int iovcnt, void *md_buf, 4291 uint64_t offset_blocks, uint64_t num_blocks, 4292 spdk_bdev_io_completion_cb cb, void *cb_arg) 4293 { 4294 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4295 return -EINVAL; 4296 } 4297 4298 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4299 return -EINVAL; 4300 } 4301 4302 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4303 num_blocks, cb, cb_arg, NULL, false); 4304 } 4305 4306 static inline bool 4307 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 4308 { 4309 /* 4310 * We check that the opts size is at least as large as it was when we first 4311 * introduced spdk_bdev_ext_io_opts (ac6f2bdd8d), since access to those 4312 * members is not checked internally. 4313 */ 4314 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 4315 sizeof(opts->metadata) && 4316 opts->size <= sizeof(*opts) && 4317 /* When memory domain is used, the user must provide data buffers */ 4318 (!opts->memory_domain || (iov && iov[0].iov_base)); 4319 } 4320 4321 int 4322 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4323 struct iovec *iov, int iovcnt, 4324 uint64_t offset_blocks, uint64_t num_blocks, 4325 spdk_bdev_io_completion_cb cb, void *cb_arg, 4326 struct spdk_bdev_ext_io_opts *opts) 4327 { 4328 void *md = NULL; 4329 4330 if (opts) { 4331 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4332 return -EINVAL; 4333 } 4334 md = opts->metadata; 4335 } 4336 4337 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4338 return -EINVAL; 4339 } 4340 4341 if (md && !_bdev_io_check_md_buf(iov, md)) { 4342 return -EINVAL; 4343 } 4344 4345 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4346 num_blocks, cb, cb_arg, opts, false); 4347 } 4348 4349 static int 4350 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4351 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4352 spdk_bdev_io_completion_cb cb, void *cb_arg) 4353 { 4354 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4355 struct spdk_bdev_io *bdev_io; 4356 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4357 4358 if (!desc->write) { 4359 return -EBADF; 4360 } 4361 4362 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4363 return -EINVAL; 4364 } 4365 4366 bdev_io = bdev_channel_get_io(channel); 4367 if (!bdev_io) { 4368 return -ENOMEM; 4369 } 4370 4371 bdev_io->internal.ch = channel; 4372 bdev_io->internal.desc = desc; 4373 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4374 bdev_io->u.bdev.iovs = &bdev_io->iov; 4375 bdev_io->u.bdev.iovs[0].iov_base = buf; 4376 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4377 bdev_io->u.bdev.iovcnt = 1; 4378 bdev_io->u.bdev.md_buf = md_buf; 4379 bdev_io->u.bdev.num_blocks = num_blocks; 4380 bdev_io->u.bdev.offset_blocks = offset_blocks; 4381 bdev_io->u.bdev.ext_opts = NULL; 4382 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4383 4384 bdev_io_submit(bdev_io); 4385 return 0;
4386 } 4387 4388 int 4389 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4390 void *buf, uint64_t offset, uint64_t nbytes, 4391 spdk_bdev_io_completion_cb cb, void *cb_arg) 4392 { 4393 uint64_t offset_blocks, num_blocks; 4394 4395 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4396 nbytes, &num_blocks) != 0) { 4397 return -EINVAL; 4398 } 4399 4400 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4401 } 4402 4403 int 4404 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4405 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4406 spdk_bdev_io_completion_cb cb, void *cb_arg) 4407 { 4408 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4409 cb, cb_arg); 4410 } 4411 4412 int 4413 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4414 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4415 spdk_bdev_io_completion_cb cb, void *cb_arg) 4416 { 4417 struct iovec iov = { 4418 .iov_base = buf, 4419 }; 4420 4421 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4422 return -EINVAL; 4423 } 4424 4425 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4426 return -EINVAL; 4427 } 4428 4429 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4430 cb, cb_arg); 4431 } 4432 4433 static int 4434 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4435 struct iovec *iov, int iovcnt, void *md_buf, 4436 uint64_t offset_blocks, uint64_t num_blocks, 4437 spdk_bdev_io_completion_cb cb, void *cb_arg, 4438 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4439 { 4440 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4441 struct spdk_bdev_io *bdev_io; 4442 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4443 4444 if (!desc->write) { 4445 return -EBADF; 4446 } 4447 4448 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4449 return -EINVAL; 4450 } 4451 4452 bdev_io = bdev_channel_get_io(channel); 4453 if (!bdev_io) { 4454 return -ENOMEM; 4455 } 4456 4457 bdev_io->internal.ch = channel; 4458 bdev_io->internal.desc = desc; 4459 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4460 bdev_io->u.bdev.iovs = iov; 4461 bdev_io->u.bdev.iovcnt = iovcnt; 4462 bdev_io->u.bdev.md_buf = md_buf; 4463 bdev_io->u.bdev.num_blocks = num_blocks; 4464 bdev_io->u.bdev.offset_blocks = offset_blocks; 4465 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4466 bdev_io->internal.ext_opts = opts; 4467 bdev_io->u.bdev.ext_opts = opts; 4468 4469 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4470 4471 return 0; 4472 } 4473 4474 int 4475 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4476 struct iovec *iov, int iovcnt, 4477 uint64_t offset, uint64_t len, 4478 spdk_bdev_io_completion_cb cb, void *cb_arg) 4479 { 4480 uint64_t offset_blocks, num_blocks; 4481 4482 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4483 len, &num_blocks) != 0) { 4484 return -EINVAL; 4485 } 4486 4487 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4488 } 4489 4490 int 4491 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4492 struct iovec *iov, int iovcnt, 4493 uint64_t offset_blocks, uint64_t num_blocks, 4494 spdk_bdev_io_completion_cb cb, void *cb_arg) 4495 { 4496 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, 
offset_blocks, 4497 num_blocks, cb, cb_arg, NULL, false); 4498 } 4499 4500 int 4501 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4502 struct iovec *iov, int iovcnt, void *md_buf, 4503 uint64_t offset_blocks, uint64_t num_blocks, 4504 spdk_bdev_io_completion_cb cb, void *cb_arg) 4505 { 4506 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4507 return -EINVAL; 4508 } 4509 4510 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4511 return -EINVAL; 4512 } 4513 4514 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4515 num_blocks, cb, cb_arg, NULL, false); 4516 } 4517 4518 int 4519 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4520 struct iovec *iov, int iovcnt, 4521 uint64_t offset_blocks, uint64_t num_blocks, 4522 spdk_bdev_io_completion_cb cb, void *cb_arg, 4523 struct spdk_bdev_ext_io_opts *opts) 4524 { 4525 void *md = NULL; 4526 4527 if (opts) { 4528 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4529 return -EINVAL; 4530 } 4531 md = opts->metadata; 4532 } 4533 4534 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4535 return -EINVAL; 4536 } 4537 4538 if (md && !_bdev_io_check_md_buf(iov, md)) { 4539 return -EINVAL; 4540 } 4541 4542 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4543 num_blocks, cb, cb_arg, opts, false); 4544 } 4545 4546 static void 4547 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4548 { 4549 struct spdk_bdev_io *parent_io = cb_arg; 4550 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 4551 int i, rc = 0; 4552 4553 if (!success) { 4554 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4555 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4556 spdk_bdev_free_io(bdev_io); 4557 return; 4558 } 4559 4560 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 4561 rc = memcmp(read_buf, 4562 parent_io->u.bdev.iovs[i].iov_base, 4563 parent_io->u.bdev.iovs[i].iov_len); 4564 if (rc) { 4565 break; 4566 } 4567 read_buf += parent_io->u.bdev.iovs[i].iov_len; 4568 } 4569 4570 spdk_bdev_free_io(bdev_io); 4571 4572 if (rc == 0) { 4573 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4574 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 4575 } else { 4576 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 4577 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4578 } 4579 } 4580 4581 static void 4582 bdev_compare_do_read(void *_bdev_io) 4583 { 4584 struct spdk_bdev_io *bdev_io = _bdev_io; 4585 int rc; 4586 4587 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 4588 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 4589 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4590 bdev_compare_do_read_done, bdev_io); 4591 4592 if (rc == -ENOMEM) { 4593 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 4594 } else if (rc != 0) { 4595 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4596 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4597 } 4598 } 4599 4600 static int 4601 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4602 struct iovec *iov, int iovcnt, void *md_buf, 4603 uint64_t offset_blocks, uint64_t num_blocks, 4604 spdk_bdev_io_completion_cb cb, void *cb_arg) 4605 { 4606 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4607 struct spdk_bdev_io *bdev_io; 4608 struct spdk_bdev_channel 
*channel = spdk_io_channel_get_ctx(ch); 4609 4610 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4611 return -EINVAL; 4612 } 4613 4614 bdev_io = bdev_channel_get_io(channel); 4615 if (!bdev_io) { 4616 return -ENOMEM; 4617 } 4618 4619 bdev_io->internal.ch = channel; 4620 bdev_io->internal.desc = desc; 4621 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4622 bdev_io->u.bdev.iovs = iov; 4623 bdev_io->u.bdev.iovcnt = iovcnt; 4624 bdev_io->u.bdev.md_buf = md_buf; 4625 bdev_io->u.bdev.num_blocks = num_blocks; 4626 bdev_io->u.bdev.offset_blocks = offset_blocks; 4627 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4628 4629 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4630 bdev_io_submit(bdev_io); 4631 return 0; 4632 } 4633 4634 bdev_compare_do_read(bdev_io); 4635 4636 return 0; 4637 } 4638 4639 int 4640 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4641 struct iovec *iov, int iovcnt, 4642 uint64_t offset_blocks, uint64_t num_blocks, 4643 spdk_bdev_io_completion_cb cb, void *cb_arg) 4644 { 4645 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4646 num_blocks, cb, cb_arg); 4647 } 4648 4649 int 4650 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4651 struct iovec *iov, int iovcnt, void *md_buf, 4652 uint64_t offset_blocks, uint64_t num_blocks, 4653 spdk_bdev_io_completion_cb cb, void *cb_arg) 4654 { 4655 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4656 return -EINVAL; 4657 } 4658 4659 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4660 return -EINVAL; 4661 } 4662 4663 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4664 num_blocks, cb, cb_arg); 4665 } 4666 4667 static int 4668 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4669 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4670 spdk_bdev_io_completion_cb cb, void *cb_arg) 4671 { 4672 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4673 struct spdk_bdev_io *bdev_io; 4674 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4675 4676 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4677 return -EINVAL; 4678 } 4679 4680 bdev_io = bdev_channel_get_io(channel); 4681 if (!bdev_io) { 4682 return -ENOMEM; 4683 } 4684 4685 bdev_io->internal.ch = channel; 4686 bdev_io->internal.desc = desc; 4687 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4688 bdev_io->u.bdev.iovs = &bdev_io->iov; 4689 bdev_io->u.bdev.iovs[0].iov_base = buf; 4690 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4691 bdev_io->u.bdev.iovcnt = 1; 4692 bdev_io->u.bdev.md_buf = md_buf; 4693 bdev_io->u.bdev.num_blocks = num_blocks; 4694 bdev_io->u.bdev.offset_blocks = offset_blocks; 4695 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4696 4697 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4698 bdev_io_submit(bdev_io); 4699 return 0; 4700 } 4701 4702 bdev_compare_do_read(bdev_io); 4703 4704 return 0; 4705 } 4706 4707 int 4708 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4709 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4710 spdk_bdev_io_completion_cb cb, void *cb_arg) 4711 { 4712 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4713 cb, cb_arg); 4714 } 4715 4716 int 4717 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4718 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t 
num_blocks, 4719 spdk_bdev_io_completion_cb cb, void *cb_arg) 4720 { 4721 struct iovec iov = { 4722 .iov_base = buf, 4723 }; 4724 4725 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4726 return -EINVAL; 4727 } 4728 4729 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4730 return -EINVAL; 4731 } 4732 4733 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4734 cb, cb_arg); 4735 } 4736 4737 static void 4738 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 4739 { 4740 struct spdk_bdev_io *bdev_io = ctx; 4741 4742 if (unlock_status) { 4743 SPDK_ERRLOG("LBA range unlock failed\n"); 4744 } 4745 4746 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 4747 false, bdev_io->internal.caller_ctx); 4748 } 4749 4750 static void 4751 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 4752 { 4753 bdev_io->internal.status = status; 4754 4755 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 4756 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4757 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 4758 } 4759 4760 static void 4761 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4762 { 4763 struct spdk_bdev_io *parent_io = cb_arg; 4764 4765 if (!success) { 4766 SPDK_ERRLOG("Compare and write operation failed\n"); 4767 } 4768 4769 spdk_bdev_free_io(bdev_io); 4770 4771 bdev_comparev_and_writev_blocks_unlock(parent_io, 4772 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 4773 } 4774 4775 static void 4776 bdev_compare_and_write_do_write(void *_bdev_io) 4777 { 4778 struct spdk_bdev_io *bdev_io = _bdev_io; 4779 int rc; 4780 4781 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 4782 spdk_io_channel_from_ctx(bdev_io->internal.ch), 4783 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 4784 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4785 bdev_compare_and_write_do_write_done, bdev_io); 4786 4787 4788 if (rc == -ENOMEM) { 4789 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 4790 } else if (rc != 0) { 4791 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 4792 } 4793 } 4794 4795 static void 4796 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4797 { 4798 struct spdk_bdev_io *parent_io = cb_arg; 4799 4800 spdk_bdev_free_io(bdev_io); 4801 4802 if (!success) { 4803 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 4804 return; 4805 } 4806 4807 bdev_compare_and_write_do_write(parent_io); 4808 } 4809 4810 static void 4811 bdev_compare_and_write_do_compare(void *_bdev_io) 4812 { 4813 struct spdk_bdev_io *bdev_io = _bdev_io; 4814 int rc; 4815 4816 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 4817 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 4818 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4819 bdev_compare_and_write_do_compare_done, bdev_io); 4820 4821 if (rc == -ENOMEM) { 4822 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 4823 } else if (rc != 0) { 4824 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 4825 } 4826 } 4827 4828 static void 4829 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 4830 { 4831 struct spdk_bdev_io *bdev_io = ctx; 4832 4833 if 
(status) { 4834 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 4835 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4836 return; 4837 } 4838 4839 bdev_compare_and_write_do_compare(bdev_io); 4840 } 4841 4842 int 4843 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4844 struct iovec *compare_iov, int compare_iovcnt, 4845 struct iovec *write_iov, int write_iovcnt, 4846 uint64_t offset_blocks, uint64_t num_blocks, 4847 spdk_bdev_io_completion_cb cb, void *cb_arg) 4848 { 4849 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4850 struct spdk_bdev_io *bdev_io; 4851 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4852 4853 if (!desc->write) { 4854 return -EBADF; 4855 } 4856 4857 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4858 return -EINVAL; 4859 } 4860 4861 if (num_blocks > bdev->acwu) { 4862 return -EINVAL; 4863 } 4864 4865 bdev_io = bdev_channel_get_io(channel); 4866 if (!bdev_io) { 4867 return -ENOMEM; 4868 } 4869 4870 bdev_io->internal.ch = channel; 4871 bdev_io->internal.desc = desc; 4872 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 4873 bdev_io->u.bdev.iovs = compare_iov; 4874 bdev_io->u.bdev.iovcnt = compare_iovcnt; 4875 bdev_io->u.bdev.fused_iovs = write_iov; 4876 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 4877 bdev_io->u.bdev.md_buf = NULL; 4878 bdev_io->u.bdev.num_blocks = num_blocks; 4879 bdev_io->u.bdev.offset_blocks = offset_blocks; 4880 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4881 4882 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 4883 bdev_io_submit(bdev_io); 4884 return 0; 4885 } 4886 4887 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 4888 bdev_comparev_and_writev_blocks_locked, bdev_io); 4889 } 4890 4891 int 4892 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4893 struct iovec *iov, int iovcnt, 4894 uint64_t offset_blocks, uint64_t num_blocks, 4895 bool populate, 4896 spdk_bdev_io_completion_cb cb, void *cb_arg) 4897 { 4898 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4899 struct spdk_bdev_io *bdev_io; 4900 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4901 4902 if (!desc->write) { 4903 return -EBADF; 4904 } 4905 4906 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4907 return -EINVAL; 4908 } 4909 4910 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 4911 return -ENOTSUP; 4912 } 4913 4914 bdev_io = bdev_channel_get_io(channel); 4915 if (!bdev_io) { 4916 return -ENOMEM; 4917 } 4918 4919 bdev_io->internal.ch = channel; 4920 bdev_io->internal.desc = desc; 4921 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 4922 bdev_io->u.bdev.num_blocks = num_blocks; 4923 bdev_io->u.bdev.offset_blocks = offset_blocks; 4924 bdev_io->u.bdev.iovs = iov; 4925 bdev_io->u.bdev.iovcnt = iovcnt; 4926 bdev_io->u.bdev.md_buf = NULL; 4927 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 4928 bdev_io->u.bdev.zcopy.commit = 0; 4929 bdev_io->u.bdev.zcopy.start = 1; 4930 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4931 4932 bdev_io_submit(bdev_io); 4933 4934 return 0; 4935 } 4936 4937 int 4938 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 4939 spdk_bdev_io_completion_cb cb, void *cb_arg) 4940 { 4941 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 4942 return -EINVAL; 4943 } 4944 4945 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 4946 bdev_io->u.bdev.zcopy.start = 0; 4947 bdev_io->internal.caller_ctx = cb_arg; 4948 bdev_io->internal.cb = cb; 4949 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 4950 4951 bdev_io_submit(bdev_io); 4952 4953 return 0; 4954 } 4955 4956 int 4957 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4958 uint64_t offset, uint64_t len, 4959 spdk_bdev_io_completion_cb cb, void *cb_arg) 4960 { 4961 uint64_t offset_blocks, num_blocks; 4962 4963 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4964 len, &num_blocks) != 0) { 4965 return -EINVAL; 4966 } 4967 4968 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4969 } 4970 4971 int 4972 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4973 uint64_t offset_blocks, uint64_t num_blocks, 4974 spdk_bdev_io_completion_cb cb, void *cb_arg) 4975 { 4976 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4977 struct spdk_bdev_io *bdev_io; 4978 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4979 4980 if (!desc->write) { 4981 return -EBADF; 4982 } 4983 4984 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4985 return -EINVAL; 4986 } 4987 4988 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 4989 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 4990 return -ENOTSUP; 4991 } 4992 4993 bdev_io = bdev_channel_get_io(channel); 4994 4995 if (!bdev_io) { 4996 return -ENOMEM; 4997 } 4998 4999 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5000 bdev_io->internal.ch = channel; 5001 bdev_io->internal.desc = desc; 5002 bdev_io->u.bdev.offset_blocks = offset_blocks; 5003 bdev_io->u.bdev.num_blocks = num_blocks; 5004 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5005 5006 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 5007 bdev_io_submit(bdev_io); 5008 return 0; 5009 } 5010 5011 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 5012 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 5013 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 5014 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 5015 bdev_write_zero_buffer_next(bdev_io); 5016 5017 return 0; 5018 } 5019 5020 int 5021 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5022 uint64_t offset, uint64_t nbytes, 5023 spdk_bdev_io_completion_cb cb, void *cb_arg) 5024 { 5025 uint64_t offset_blocks, num_blocks; 5026 5027 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5028 nbytes, &num_blocks) != 0) { 5029 return -EINVAL; 5030 } 5031 5032 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5033 } 5034 5035 int 5036 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5037 uint64_t offset_blocks, uint64_t num_blocks, 5038 spdk_bdev_io_completion_cb cb, void *cb_arg) 5039 { 5040 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5041 struct spdk_bdev_io *bdev_io; 5042 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5043 5044 if (!desc->write) { 5045 return -EBADF; 5046 } 5047 5048 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5049 return -EINVAL; 5050 } 5051 5052 if (num_blocks == 0) { 5053 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5054 return -EINVAL; 5055 } 5056 5057 bdev_io = bdev_channel_get_io(channel); 5058 if (!bdev_io) { 5059 return -ENOMEM; 5060 } 5061 5062 bdev_io->internal.ch = channel; 5063 
bdev_io->internal.desc = desc; 5064 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5065 5066 bdev_io->u.bdev.iovs = &bdev_io->iov; 5067 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5068 bdev_io->u.bdev.iovs[0].iov_len = 0; 5069 bdev_io->u.bdev.iovcnt = 1; 5070 5071 bdev_io->u.bdev.offset_blocks = offset_blocks; 5072 bdev_io->u.bdev.num_blocks = num_blocks; 5073 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5074 5075 bdev_io_submit(bdev_io); 5076 return 0; 5077 } 5078 5079 int 5080 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5081 uint64_t offset, uint64_t length, 5082 spdk_bdev_io_completion_cb cb, void *cb_arg) 5083 { 5084 uint64_t offset_blocks, num_blocks; 5085 5086 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5087 length, &num_blocks) != 0) { 5088 return -EINVAL; 5089 } 5090 5091 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5092 } 5093 5094 int 5095 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5096 uint64_t offset_blocks, uint64_t num_blocks, 5097 spdk_bdev_io_completion_cb cb, void *cb_arg) 5098 { 5099 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5100 struct spdk_bdev_io *bdev_io; 5101 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5102 5103 if (!desc->write) { 5104 return -EBADF; 5105 } 5106 5107 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5108 return -EINVAL; 5109 } 5110 5111 bdev_io = bdev_channel_get_io(channel); 5112 if (!bdev_io) { 5113 return -ENOMEM; 5114 } 5115 5116 bdev_io->internal.ch = channel; 5117 bdev_io->internal.desc = desc; 5118 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5119 bdev_io->u.bdev.iovs = NULL; 5120 bdev_io->u.bdev.iovcnt = 0; 5121 bdev_io->u.bdev.offset_blocks = offset_blocks; 5122 bdev_io->u.bdev.num_blocks = num_blocks; 5123 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5124 5125 bdev_io_submit(bdev_io); 5126 return 0; 5127 } 5128 5129 static void 5130 bdev_reset_dev(struct spdk_io_channel_iter *i, int status) 5131 { 5132 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 5133 struct spdk_bdev_io *bdev_io; 5134 5135 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5136 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5137 bdev_io_submit_reset(bdev_io); 5138 } 5139 5140 static void 5141 bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 5142 { 5143 struct spdk_io_channel *ch; 5144 struct spdk_bdev_channel *channel; 5145 struct spdk_bdev_mgmt_channel *mgmt_channel; 5146 struct spdk_bdev_shared_resource *shared_resource; 5147 bdev_io_tailq_t tmp_queued; 5148 5149 TAILQ_INIT(&tmp_queued); 5150 5151 ch = spdk_io_channel_iter_get_channel(i); 5152 channel = spdk_io_channel_get_ctx(ch); 5153 shared_resource = channel->shared_resource; 5154 mgmt_channel = shared_resource->mgmt_ch; 5155 5156 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 5157 5158 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 5159 /* The QoS object is always valid and readable while 5160 * the channel flag is set, so the lock here should not 5161 * be necessary. We're not in the fast path though, so 5162 * just take it anyway. 
*/ 5163 pthread_mutex_lock(&channel->bdev->internal.mutex); 5164 if (channel->bdev->internal.qos->ch == channel) { 5165 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 5166 } 5167 pthread_mutex_unlock(&channel->bdev->internal.mutex); 5168 } 5169 5170 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 5171 bdev_abort_all_buf_io(&mgmt_channel->need_buf_small, channel); 5172 bdev_abort_all_buf_io(&mgmt_channel->need_buf_large, channel); 5173 bdev_abort_all_queued_io(&tmp_queued, channel); 5174 5175 spdk_for_each_channel_continue(i, 0); 5176 } 5177 5178 static void 5179 bdev_start_reset(void *ctx) 5180 { 5181 struct spdk_bdev_channel *ch = ctx; 5182 5183 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel, 5184 ch, bdev_reset_dev); 5185 } 5186 5187 static void 5188 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 5189 { 5190 struct spdk_bdev *bdev = ch->bdev; 5191 5192 assert(!TAILQ_EMPTY(&ch->queued_resets)); 5193 5194 pthread_mutex_lock(&bdev->internal.mutex); 5195 if (bdev->internal.reset_in_progress == NULL) { 5196 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 5197 /* 5198 * Take a channel reference for the target bdev for the life of this 5199 * reset. This guards against the channel getting destroyed while 5200 * spdk_for_each_channel() calls related to this reset IO are in 5201 * progress. We will release the reference when this reset is 5202 * completed. 5203 */ 5204 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 5205 bdev_start_reset(ch); 5206 } 5207 pthread_mutex_unlock(&bdev->internal.mutex); 5208 } 5209 5210 int 5211 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5212 spdk_bdev_io_completion_cb cb, void *cb_arg) 5213 { 5214 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5215 struct spdk_bdev_io *bdev_io; 5216 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5217 5218 bdev_io = bdev_channel_get_io(channel); 5219 if (!bdev_io) { 5220 return -ENOMEM; 5221 } 5222 5223 bdev_io->internal.ch = channel; 5224 bdev_io->internal.desc = desc; 5225 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5226 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 5227 bdev_io->u.reset.ch_ref = NULL; 5228 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5229 5230 pthread_mutex_lock(&bdev->internal.mutex); 5231 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 5232 pthread_mutex_unlock(&bdev->internal.mutex); 5233 5234 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 5235 internal.ch_link); 5236 5237 bdev_channel_start_reset(channel); 5238 5239 return 0; 5240 } 5241 5242 void 5243 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5244 struct spdk_bdev_io_stat *stat) 5245 { 5246 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5247 5248 *stat = channel->stat; 5249 } 5250 5251 static void 5252 bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 5253 { 5254 void *io_device = spdk_io_channel_iter_get_io_device(i); 5255 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 5256 5257 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 5258 bdev_iostat_ctx->cb_arg, 0); 5259 free(bdev_iostat_ctx); 5260 } 5261 5262 static void 5263 bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 5264 { 5265 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 5266 struct 
spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 5267 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5268 5269 bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 5270 spdk_for_each_channel_continue(i, 0); 5271 } 5272 5273 void 5274 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5275 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5276 { 5277 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5278 5279 assert(bdev != NULL); 5280 assert(stat != NULL); 5281 assert(cb != NULL); 5282 5283 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5284 if (bdev_iostat_ctx == NULL) { 5285 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5286 cb(bdev, stat, cb_arg, -ENOMEM); 5287 return; 5288 } 5289 5290 bdev_iostat_ctx->stat = stat; 5291 bdev_iostat_ctx->cb = cb; 5292 bdev_iostat_ctx->cb_arg = cb_arg; 5293 5294 /* Start with the statistics from previously deleted channels. */ 5295 pthread_mutex_lock(&bdev->internal.mutex); 5296 bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 5297 pthread_mutex_unlock(&bdev->internal.mutex); 5298 5299 /* Then iterate and add the statistics from each existing channel. */ 5300 spdk_for_each_channel(__bdev_to_io_dev(bdev), 5301 bdev_get_each_channel_stat, 5302 bdev_iostat_ctx, 5303 bdev_get_device_stat_done); 5304 } 5305 5306 int 5307 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5308 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5309 spdk_bdev_io_completion_cb cb, void *cb_arg) 5310 { 5311 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5312 struct spdk_bdev_io *bdev_io; 5313 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5314 5315 if (!desc->write) { 5316 return -EBADF; 5317 } 5318 5319 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 5320 return -ENOTSUP; 5321 } 5322 5323 bdev_io = bdev_channel_get_io(channel); 5324 if (!bdev_io) { 5325 return -ENOMEM; 5326 } 5327 5328 bdev_io->internal.ch = channel; 5329 bdev_io->internal.desc = desc; 5330 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 5331 bdev_io->u.nvme_passthru.cmd = *cmd; 5332 bdev_io->u.nvme_passthru.buf = buf; 5333 bdev_io->u.nvme_passthru.nbytes = nbytes; 5334 bdev_io->u.nvme_passthru.md_buf = NULL; 5335 bdev_io->u.nvme_passthru.md_len = 0; 5336 5337 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5338 5339 bdev_io_submit(bdev_io); 5340 return 0; 5341 } 5342 5343 int 5344 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5345 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5346 spdk_bdev_io_completion_cb cb, void *cb_arg) 5347 { 5348 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5349 struct spdk_bdev_io *bdev_io; 5350 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5351 5352 if (!desc->write) { 5353 /* 5354 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5355 * to easily determine if the command is a read or write, but for now just 5356 * do not allow io_passthru with a read-only descriptor. 
5357 */ 5358 return -EBADF; 5359 } 5360 5361 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 5362 return -ENOTSUP; 5363 } 5364 5365 bdev_io = bdev_channel_get_io(channel); 5366 if (!bdev_io) { 5367 return -ENOMEM; 5368 } 5369 5370 bdev_io->internal.ch = channel; 5371 bdev_io->internal.desc = desc; 5372 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 5373 bdev_io->u.nvme_passthru.cmd = *cmd; 5374 bdev_io->u.nvme_passthru.buf = buf; 5375 bdev_io->u.nvme_passthru.nbytes = nbytes; 5376 bdev_io->u.nvme_passthru.md_buf = NULL; 5377 bdev_io->u.nvme_passthru.md_len = 0; 5378 5379 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5380 5381 bdev_io_submit(bdev_io); 5382 return 0; 5383 } 5384 5385 int 5386 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5387 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 5388 spdk_bdev_io_completion_cb cb, void *cb_arg) 5389 { 5390 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5391 struct spdk_bdev_io *bdev_io; 5392 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5393 5394 if (!desc->write) { 5395 /* 5396 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5397 * to easily determine if the command is a read or write, but for now just 5398 * do not allow io_passthru with a read-only descriptor. 5399 */ 5400 return -EBADF; 5401 } 5402 5403 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 5404 return -ENOTSUP; 5405 } 5406 5407 bdev_io = bdev_channel_get_io(channel); 5408 if (!bdev_io) { 5409 return -ENOMEM; 5410 } 5411 5412 bdev_io->internal.ch = channel; 5413 bdev_io->internal.desc = desc; 5414 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 5415 bdev_io->u.nvme_passthru.cmd = *cmd; 5416 bdev_io->u.nvme_passthru.buf = buf; 5417 bdev_io->u.nvme_passthru.nbytes = nbytes; 5418 bdev_io->u.nvme_passthru.md_buf = md_buf; 5419 bdev_io->u.nvme_passthru.md_len = md_len; 5420 5421 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5422 5423 bdev_io_submit(bdev_io); 5424 return 0; 5425 } 5426 5427 static void bdev_abort_retry(void *ctx); 5428 static void bdev_abort(struct spdk_bdev_io *parent_io); 5429 5430 static void 5431 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5432 { 5433 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 5434 struct spdk_bdev_io *parent_io = cb_arg; 5435 struct spdk_bdev_io *bio_to_abort, *tmp_io; 5436 5437 bio_to_abort = bdev_io->u.abort.bio_to_abort; 5438 5439 spdk_bdev_free_io(bdev_io); 5440 5441 if (!success) { 5442 /* Check if the target I/O completed in the meantime. */ 5443 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 5444 if (tmp_io == bio_to_abort) { 5445 break; 5446 } 5447 } 5448 5449 /* If the target I/O still exists, set the parent to failed. 
*/ 5450 if (tmp_io != NULL) { 5451 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5452 } 5453 } 5454 5455 parent_io->u.bdev.split_outstanding--; 5456 if (parent_io->u.bdev.split_outstanding == 0) { 5457 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5458 bdev_abort_retry(parent_io); 5459 } else { 5460 bdev_io_complete(parent_io); 5461 } 5462 } 5463 } 5464 5465 static int 5466 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 5467 struct spdk_bdev_io *bio_to_abort, 5468 spdk_bdev_io_completion_cb cb, void *cb_arg) 5469 { 5470 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5471 struct spdk_bdev_io *bdev_io; 5472 5473 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 5474 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 5475 /* TODO: Abort reset or abort request. */ 5476 return -ENOTSUP; 5477 } 5478 5479 bdev_io = bdev_channel_get_io(channel); 5480 if (bdev_io == NULL) { 5481 return -ENOMEM; 5482 } 5483 5484 bdev_io->internal.ch = channel; 5485 bdev_io->internal.desc = desc; 5486 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 5487 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5488 5489 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 5490 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 5491 5492 /* Parent abort request is not submitted directly, but to manage its 5493 * execution, add it to the submitted list here. 5494 */ 5495 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5496 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5497 5498 bdev_abort(bdev_io); 5499 5500 return 0; 5501 } 5502 5503 bdev_io->u.abort.bio_to_abort = bio_to_abort; 5504 5505 /* Submit the abort request to the underlying bdev module. */ 5506 bdev_io_submit(bdev_io); 5507 5508 return 0; 5509 } 5510 5511 static uint32_t 5512 _bdev_abort(struct spdk_bdev_io *parent_io) 5513 { 5514 struct spdk_bdev_desc *desc = parent_io->internal.desc; 5515 struct spdk_bdev_channel *channel = parent_io->internal.ch; 5516 void *bio_cb_arg; 5517 struct spdk_bdev_io *bio_to_abort; 5518 uint32_t matched_ios; 5519 int rc; 5520 5521 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 5522 5523 /* matched_ios is returned and will be kept by the caller. 5524 * 5525 * This function is used for two cases: 1) the same cb_arg is used for 5526 * multiple I/Os, and 2) a single large I/O is split into smaller ones. 5527 * Incrementing split_outstanding directly here may confuse readers, especially 5528 * in the 1st case. 5529 * 5530 * Completion of I/O abort is processed after stack unwinding. Hence this trick 5531 * works as expected. 5532 */ 5533 matched_ios = 0; 5534 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5535 5536 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 5537 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 5538 continue; 5539 } 5540 5541 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 5542 /* Any I/O which was submitted after this abort command should be excluded.
*/ 5543 continue; 5544 } 5545 5546 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 5547 if (rc != 0) { 5548 if (rc == -ENOMEM) { 5549 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 5550 } else { 5551 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5552 } 5553 break; 5554 } 5555 matched_ios++; 5556 } 5557 5558 return matched_ios; 5559 } 5560 5561 static void 5562 bdev_abort_retry(void *ctx) 5563 { 5564 struct spdk_bdev_io *parent_io = ctx; 5565 uint32_t matched_ios; 5566 5567 matched_ios = _bdev_abort(parent_io); 5568 5569 if (matched_ios == 0) { 5570 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5571 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 5572 } else { 5573 /* For retry, the case that no target I/O was found is success 5574 * because it means target I/Os completed in the meantime. 5575 */ 5576 bdev_io_complete(parent_io); 5577 } 5578 return; 5579 } 5580 5581 /* Use split_outstanding to manage the progress of aborting I/Os. */ 5582 parent_io->u.bdev.split_outstanding = matched_ios; 5583 } 5584 5585 static void 5586 bdev_abort(struct spdk_bdev_io *parent_io) 5587 { 5588 uint32_t matched_ios; 5589 5590 matched_ios = _bdev_abort(parent_io); 5591 5592 if (matched_ios == 0) { 5593 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5594 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 5595 } else { 5596 /* The case where no target I/O was found is a failure. */ 5597 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5598 bdev_io_complete(parent_io); 5599 } 5600 return; 5601 } 5602 5603 /* Use split_outstanding to manage the progress of aborting I/Os. */ 5604 parent_io->u.bdev.split_outstanding = matched_ios; 5605 } 5606 5607 int 5608 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5609 void *bio_cb_arg, 5610 spdk_bdev_io_completion_cb cb, void *cb_arg) 5611 { 5612 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5613 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5614 struct spdk_bdev_io *bdev_io; 5615 5616 if (bio_cb_arg == NULL) { 5617 return -EINVAL; 5618 } 5619 5620 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 5621 return -ENOTSUP; 5622 } 5623 5624 bdev_io = bdev_channel_get_io(channel); 5625 if (bdev_io == NULL) { 5626 return -ENOMEM; 5627 } 5628 5629 bdev_io->internal.ch = channel; 5630 bdev_io->internal.desc = desc; 5631 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5632 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 5633 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5634 5635 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 5636 5637 /* Parent abort request is not submitted directly, but to manage its execution, 5638 * add it to the submitted list here.
5639 */ 5640 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5641 5642 bdev_abort(bdev_io); 5643 5644 return 0; 5645 } 5646 5647 int 5648 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5649 struct spdk_bdev_io_wait_entry *entry) 5650 { 5651 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5652 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 5653 5654 if (bdev != entry->bdev) { 5655 SPDK_ERRLOG("bdevs do not match\n"); 5656 return -EINVAL; 5657 } 5658 5659 if (mgmt_ch->per_thread_cache_count > 0) { 5660 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 5661 return -EINVAL; 5662 } 5663 5664 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 5665 return 0; 5666 } 5667 5668 static inline void 5669 bdev_io_complete(void *ctx) 5670 { 5671 struct spdk_bdev_io *bdev_io = ctx; 5672 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5673 uint64_t tsc, tsc_diff; 5674 5675 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 5676 /* 5677 * Send the completion to the thread that originally submitted the I/O, 5678 * which may not be the current thread in the case of QoS. 5679 */ 5680 if (bdev_io->internal.io_submit_ch) { 5681 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 5682 bdev_io->internal.io_submit_ch = NULL; 5683 } 5684 5685 /* 5686 * Defer completion to avoid potential infinite recursion if the 5687 * user's completion callback issues a new I/O. 5688 */ 5689 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 5690 bdev_io_complete, bdev_io); 5691 return; 5692 } 5693 5694 tsc = spdk_get_ticks(); 5695 tsc_diff = tsc - bdev_io->internal.submit_tsc; 5696 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 5697 bdev_io->internal.caller_ctx); 5698 5699 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 5700 5701 if (bdev_io->internal.ch->histogram) { 5702 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 5703 } 5704 5705 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5706 switch (bdev_io->type) { 5707 case SPDK_BDEV_IO_TYPE_READ: 5708 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5709 bdev_io->internal.ch->stat.num_read_ops++; 5710 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5711 break; 5712 case SPDK_BDEV_IO_TYPE_WRITE: 5713 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5714 bdev_io->internal.ch->stat.num_write_ops++; 5715 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5716 break; 5717 case SPDK_BDEV_IO_TYPE_UNMAP: 5718 bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5719 bdev_io->internal.ch->stat.num_unmap_ops++; 5720 bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; 5721 break; 5722 case SPDK_BDEV_IO_TYPE_ZCOPY: 5723 /* Track the data in the start phase only */ 5724 if (bdev_io->u.bdev.zcopy.start) { 5725 if (bdev_io->u.bdev.zcopy.populate) { 5726 bdev_io->internal.ch->stat.bytes_read += 5727 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5728 bdev_io->internal.ch->stat.num_read_ops++; 5729 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5730 } else { 5731 bdev_io->internal.ch->stat.bytes_written += 5732 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5733 bdev_io->internal.ch->stat.num_write_ops++; 5734 
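/* Note: tsc_diff above is measured from internal.submit_tsc, captured when the I/O
 * entered the bdev layer, so the per-type latency counters and the optional
 * per-channel histogram reflect the full round trip, including any time the I/O
 * spent queued inside the bdev layer.
 */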
bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5735 } 5736 } 5737 break; 5738 default: 5739 break; 5740 } 5741 } 5742 5743 #ifdef SPDK_CONFIG_VTUNE 5744 uint64_t now_tsc = spdk_get_ticks(); 5745 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 5746 uint64_t data[5]; 5747 5748 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 5749 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 5750 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 5751 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 5752 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 5753 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 5754 5755 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 5756 __itt_metadata_u64, 5, data); 5757 5758 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 5759 bdev_io->internal.ch->start_tsc = now_tsc; 5760 } 5761 #endif 5762 5763 assert(bdev_io->internal.cb != NULL); 5764 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 5765 5766 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 5767 bdev_io->internal.caller_ctx); 5768 } 5769 5770 static void bdev_destroy_cb(void *io_device); 5771 5772 static void 5773 bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 5774 { 5775 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 5776 struct spdk_bdev *bdev = bdev_io->bdev; 5777 5778 if (bdev_io->u.reset.ch_ref != NULL) { 5779 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 5780 bdev_io->u.reset.ch_ref = NULL; 5781 } 5782 5783 bdev_io_complete(bdev_io); 5784 5785 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 5786 TAILQ_EMPTY(&bdev->internal.open_descs)) { 5787 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 5788 } 5789 } 5790 5791 static void 5792 bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 5793 { 5794 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 5795 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5796 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 5797 struct spdk_bdev_io *queued_reset; 5798 5799 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 5800 while (!TAILQ_EMPTY(&ch->queued_resets)) { 5801 queued_reset = TAILQ_FIRST(&ch->queued_resets); 5802 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 5803 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 5804 } 5805 5806 spdk_for_each_channel_continue(i, 0); 5807 } 5808 5809 void 5810 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 5811 { 5812 struct spdk_bdev *bdev = bdev_io->bdev; 5813 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5814 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 5815 5816 bdev_io->internal.status = status; 5817 5818 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 5819 bool unlock_channels = false; 5820 5821 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 5822 SPDK_ERRLOG("NOMEM returned for reset\n"); 5823 } 5824 pthread_mutex_lock(&bdev->internal.mutex); 5825 if (bdev_io == bdev->internal.reset_in_progress) { 5826 bdev->internal.reset_in_progress = NULL; 5827 unlock_channels = true; 5828 } 5829 pthread_mutex_unlock(&bdev->internal.mutex); 5830 5831 if 
(unlock_channels) { 5832 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel, 5833 bdev_io, bdev_reset_complete); 5834 return; 5835 } 5836 } else { 5837 if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 5838 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 5839 /* bdev IO will be completed in the callback */ 5840 return; 5841 } 5842 5843 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 5844 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 5845 return; 5846 } 5847 } 5848 5849 bdev_io_complete(bdev_io); 5850 } 5851 5852 void 5853 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 5854 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 5855 { 5856 if (sc == SPDK_SCSI_STATUS_GOOD) { 5857 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5858 } else { 5859 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 5860 bdev_io->internal.error.scsi.sc = sc; 5861 bdev_io->internal.error.scsi.sk = sk; 5862 bdev_io->internal.error.scsi.asc = asc; 5863 bdev_io->internal.error.scsi.ascq = ascq; 5864 } 5865 5866 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5867 } 5868 5869 void 5870 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 5871 int *sc, int *sk, int *asc, int *ascq) 5872 { 5873 assert(sc != NULL); 5874 assert(sk != NULL); 5875 assert(asc != NULL); 5876 assert(ascq != NULL); 5877 5878 switch (bdev_io->internal.status) { 5879 case SPDK_BDEV_IO_STATUS_SUCCESS: 5880 *sc = SPDK_SCSI_STATUS_GOOD; 5881 *sk = SPDK_SCSI_SENSE_NO_SENSE; 5882 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5883 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5884 break; 5885 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 5886 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 5887 break; 5888 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 5889 *sc = bdev_io->internal.error.scsi.sc; 5890 *sk = bdev_io->internal.error.scsi.sk; 5891 *asc = bdev_io->internal.error.scsi.asc; 5892 *ascq = bdev_io->internal.error.scsi.ascq; 5893 break; 5894 default: 5895 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 5896 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 5897 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5898 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5899 break; 5900 } 5901 } 5902 5903 void 5904 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 5905 { 5906 if (aio_result == 0) { 5907 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5908 } else { 5909 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 5910 } 5911 5912 bdev_io->internal.error.aio_result = aio_result; 5913 5914 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5915 } 5916 5917 void 5918 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 5919 { 5920 assert(aio_result != NULL); 5921 5922 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 5923 *aio_result = bdev_io->internal.error.aio_result; 5924 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5925 *aio_result = 0; 5926 } else { 5927 *aio_result = -EIO; 5928 } 5929 } 5930 5931 void 5932 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 5933 { 5934 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 5935 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5936 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 5937 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 5938 } else { 5939 
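/* Any other (sct, sc) combination is treated as a generic NVMe error. The raw
 * codes are stored below so that spdk_bdev_io_get_nvme_status() can hand them back
 * to the caller unchanged. For example, a backend module reporting an internal
 * failure might complete an I/O with (illustrative sketch only):
 *
 *   spdk_bdev_io_complete_nvme_status(bdev_io, 0,
 *                                     SPDK_NVME_SCT_GENERIC,
 *                                     SPDK_NVME_SC_INTERNAL_DEVICE_ERROR);
 */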
bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 5940 } 5941 5942 bdev_io->internal.error.nvme.cdw0 = cdw0; 5943 bdev_io->internal.error.nvme.sct = sct; 5944 bdev_io->internal.error.nvme.sc = sc; 5945 5946 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5947 } 5948 5949 void 5950 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 5951 { 5952 assert(sct != NULL); 5953 assert(sc != NULL); 5954 assert(cdw0 != NULL); 5955 5956 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 5957 *sct = SPDK_NVME_SCT_GENERIC; 5958 *sc = SPDK_NVME_SC_SUCCESS; 5959 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5960 *cdw0 = 0; 5961 } else { 5962 *cdw0 = 1U; 5963 } 5964 return; 5965 } 5966 5967 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 5968 *sct = bdev_io->internal.error.nvme.sct; 5969 *sc = bdev_io->internal.error.nvme.sc; 5970 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5971 *sct = SPDK_NVME_SCT_GENERIC; 5972 *sc = SPDK_NVME_SC_SUCCESS; 5973 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 5974 *sct = SPDK_NVME_SCT_GENERIC; 5975 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 5976 } else { 5977 *sct = SPDK_NVME_SCT_GENERIC; 5978 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5979 } 5980 5981 *cdw0 = bdev_io->internal.error.nvme.cdw0; 5982 } 5983 5984 void 5985 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 5986 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 5987 { 5988 assert(first_sct != NULL); 5989 assert(first_sc != NULL); 5990 assert(second_sct != NULL); 5991 assert(second_sc != NULL); 5992 assert(cdw0 != NULL); 5993 5994 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 5995 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 5996 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 5997 *first_sct = bdev_io->internal.error.nvme.sct; 5998 *first_sc = bdev_io->internal.error.nvme.sc; 5999 *second_sct = SPDK_NVME_SCT_GENERIC; 6000 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6001 } else { 6002 *first_sct = SPDK_NVME_SCT_GENERIC; 6003 *first_sc = SPDK_NVME_SC_SUCCESS; 6004 *second_sct = bdev_io->internal.error.nvme.sct; 6005 *second_sc = bdev_io->internal.error.nvme.sc; 6006 } 6007 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6008 *first_sct = SPDK_NVME_SCT_GENERIC; 6009 *first_sc = SPDK_NVME_SC_SUCCESS; 6010 *second_sct = SPDK_NVME_SCT_GENERIC; 6011 *second_sc = SPDK_NVME_SC_SUCCESS; 6012 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 6013 *first_sct = SPDK_NVME_SCT_GENERIC; 6014 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6015 *second_sct = SPDK_NVME_SCT_GENERIC; 6016 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6017 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 6018 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 6019 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 6020 *second_sct = SPDK_NVME_SCT_GENERIC; 6021 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6022 } else { 6023 *first_sct = SPDK_NVME_SCT_GENERIC; 6024 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6025 *second_sct = SPDK_NVME_SCT_GENERIC; 6026 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6027 } 6028 6029 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6030 } 6031 6032 struct spdk_thread * 6033 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 6034 { 6035 return 
spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 6036 } 6037 6038 struct spdk_io_channel * 6039 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 6040 { 6041 return bdev_io->internal.ch->channel; 6042 } 6043 6044 static int 6045 bdev_register(struct spdk_bdev *bdev) 6046 { 6047 char *bdev_name; 6048 char uuid[SPDK_UUID_STRING_LEN]; 6049 int ret; 6050 6051 assert(bdev->module != NULL); 6052 6053 if (!bdev->name) { 6054 SPDK_ERRLOG("Bdev name is NULL\n"); 6055 return -EINVAL; 6056 } 6057 6058 if (!strlen(bdev->name)) { 6059 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 6060 return -EINVAL; 6061 } 6062 6063 /* Users often register their own I/O devices using the bdev name. In 6064 * order to avoid conflicts, prepend bdev_. */ 6065 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 6066 if (!bdev_name) { 6067 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 6068 return -ENOMEM; 6069 } 6070 6071 bdev->internal.status = SPDK_BDEV_STATUS_READY; 6072 bdev->internal.measured_queue_depth = UINT64_MAX; 6073 bdev->internal.claim_module = NULL; 6074 bdev->internal.qd_poller = NULL; 6075 bdev->internal.qos = NULL; 6076 6077 TAILQ_INIT(&bdev->internal.open_descs); 6078 TAILQ_INIT(&bdev->internal.locked_ranges); 6079 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 6080 TAILQ_INIT(&bdev->aliases); 6081 6082 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 6083 if (ret != 0) { 6084 free(bdev_name); 6085 return ret; 6086 } 6087 6088 /* If the user didn't specify a uuid, generate one. */ 6089 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 6090 spdk_uuid_generate(&bdev->uuid); 6091 } 6092 6093 /* Add the UUID alias only if it's different than the name */ 6094 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6095 if (strcmp(bdev->name, uuid) != 0) { 6096 ret = spdk_bdev_alias_add(bdev, uuid); 6097 if (ret != 0) { 6098 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 6099 bdev_name_del(&bdev->internal.bdev_name); 6100 free(bdev_name); 6101 return ret; 6102 } 6103 } 6104 6105 if (spdk_bdev_get_buf_align(bdev) > 1) { 6106 if (bdev->split_on_optimal_io_boundary) { 6107 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 6108 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 6109 } else { 6110 bdev->split_on_optimal_io_boundary = true; 6111 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 6112 } 6113 } 6114 6115 /* If the user didn't specify a write unit size, set it to one. 
*/ 6116 if (bdev->write_unit_size == 0) { 6117 bdev->write_unit_size = 1; 6118 } 6119 6120 /* Set ACWU value to 1 if bdev module did not set it (does not support it natively) */ 6121 if (bdev->acwu == 0) { 6122 bdev->acwu = 1; 6123 } 6124 6125 if (bdev->phys_blocklen == 0) { 6126 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 6127 } 6128 6129 bdev->internal.reset_in_progress = NULL; 6130 6131 spdk_io_device_register(__bdev_to_io_dev(bdev), 6132 bdev_channel_create, bdev_channel_destroy, 6133 sizeof(struct spdk_bdev_channel), 6134 bdev_name); 6135 6136 free(bdev_name); 6137 6138 pthread_mutex_init(&bdev->internal.mutex, NULL); 6139 6140 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 6141 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 6142 6143 return 0; 6144 } 6145 6146 static void 6147 bdev_destroy_cb(void *io_device) 6148 { 6149 int rc; 6150 struct spdk_bdev *bdev; 6151 spdk_bdev_unregister_cb cb_fn; 6152 void *cb_arg; 6153 6154 bdev = __bdev_from_io_dev(io_device); 6155 cb_fn = bdev->internal.unregister_cb; 6156 cb_arg = bdev->internal.unregister_ctx; 6157 6158 pthread_mutex_destroy(&bdev->internal.mutex); 6159 free(bdev->internal.qos); 6160 6161 rc = bdev->fn_table->destruct(bdev->ctxt); 6162 if (rc < 0) { 6163 SPDK_ERRLOG("destruct failed\n"); 6164 } 6165 if (rc <= 0 && cb_fn != NULL) { 6166 cb_fn(cb_arg, rc); 6167 } 6168 } 6169 6170 static void 6171 bdev_register_finished(void *arg) 6172 { 6173 struct spdk_bdev *bdev = arg; 6174 6175 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 6176 } 6177 6178 int 6179 spdk_bdev_register(struct spdk_bdev *bdev) 6180 { 6181 int rc = bdev_register(bdev); 6182 6183 if (rc == 0) { 6184 /* Examine configuration before initializing I/O */ 6185 bdev_examine(bdev); 6186 6187 spdk_bdev_wait_for_examine(bdev_register_finished, bdev); 6188 } 6189 6190 return rc; 6191 } 6192 6193 void 6194 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 6195 { 6196 if (bdev->internal.unregister_cb != NULL) { 6197 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 6198 } 6199 } 6200 6201 static void 6202 _remove_notify(void *arg) 6203 { 6204 struct spdk_bdev_desc *desc = arg; 6205 6206 pthread_mutex_lock(&desc->mutex); 6207 desc->refs--; 6208 6209 if (!desc->closed) { 6210 pthread_mutex_unlock(&desc->mutex); 6211 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 6212 return; 6213 } else if (0 == desc->refs) { 6214 /* This descriptor was closed after this remove_notify message was sent. 6215 * spdk_bdev_close() could not free the descriptor since this message was 6216 * in flight, so we free it now using bdev_desc_free(). 6217 */ 6218 pthread_mutex_unlock(&desc->mutex); 6219 bdev_desc_free(desc); 6220 return; 6221 } 6222 pthread_mutex_unlock(&desc->mutex); 6223 } 6224 6225 /* Must be called while holding g_bdev_mgr.mutex and bdev->internal.mutex. 6226 * returns: 0 - bdev removed and ready to be destructed. 6227 * -EBUSY - bdev can't be destructed yet. */ 6228 static int 6229 bdev_unregister_unsafe(struct spdk_bdev *bdev) 6230 { 6231 struct spdk_bdev_desc *desc, *tmp; 6232 int rc = 0; 6233 char uuid[SPDK_UUID_STRING_LEN]; 6234 6235 /* Notify each descriptor about hotremoval */ 6236 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 6237 rc = -EBUSY; 6238 pthread_mutex_lock(&desc->mutex); 6239 /* 6240 * Defer invocation of the event_cb to a separate message that will 6241 * run later on its thread. 
This ensures this context unwinds and 6242 * we don't recursively unregister this bdev again if the event_cb 6243 * immediately closes its descriptor. 6244 */ 6245 desc->refs++; 6246 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 6247 pthread_mutex_unlock(&desc->mutex); 6248 } 6249 6250 /* If there are no descriptors, proceed removing the bdev */ 6251 if (rc == 0) { 6252 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 6253 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 6254 6255 /* Delete the name and the UUID alias */ 6256 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6257 bdev_name_del_unsafe(&bdev->internal.bdev_name); 6258 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 6259 6260 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 6261 6262 if (bdev->internal.reset_in_progress != NULL) { 6263 /* If reset is in progress, let the completion callback for reset 6264 * unregister the bdev. 6265 */ 6266 rc = -EBUSY; 6267 } 6268 } 6269 6270 return rc; 6271 } 6272 6273 static void 6274 bdev_unregister_abort_channel(struct spdk_io_channel_iter *i) 6275 { 6276 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 6277 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 6278 6279 bdev_channel_abort_queued_ios(bdev_ch); 6280 spdk_for_each_channel_continue(i, 0); 6281 } 6282 6283 static void 6284 bdev_unregister(struct spdk_io_channel_iter *i, int status) 6285 { 6286 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 6287 int rc; 6288 6289 pthread_mutex_lock(&g_bdev_mgr.mutex); 6290 pthread_mutex_lock(&bdev->internal.mutex); 6291 /* 6292 * Set the status to REMOVING after completing to abort channels. Otherwise, 6293 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 6294 * spdk_for_each_channel() is executed and spdk_io_device_unregister() may fail. 6295 */ 6296 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 6297 rc = bdev_unregister_unsafe(bdev); 6298 pthread_mutex_unlock(&bdev->internal.mutex); 6299 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6300 6301 if (rc == 0) { 6302 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6303 } 6304 } 6305 6306 void 6307 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6308 { 6309 struct spdk_thread *thread; 6310 6311 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 6312 6313 thread = spdk_get_thread(); 6314 if (!thread) { 6315 /* The user called this from a non-SPDK thread. 
*/ 6316 if (cb_fn != NULL) { 6317 cb_fn(cb_arg, -ENOTSUP); 6318 } 6319 return; 6320 } 6321 6322 pthread_mutex_lock(&g_bdev_mgr.mutex); 6323 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 6324 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6325 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6326 if (cb_fn) { 6327 cb_fn(cb_arg, -EBUSY); 6328 } 6329 return; 6330 } 6331 6332 pthread_mutex_lock(&bdev->internal.mutex); 6333 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 6334 bdev->internal.unregister_cb = cb_fn; 6335 bdev->internal.unregister_ctx = cb_arg; 6336 pthread_mutex_unlock(&bdev->internal.mutex); 6337 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6338 6339 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6340 bdev_unregister_abort_channel, 6341 bdev, 6342 bdev_unregister); 6343 } 6344 6345 static void 6346 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 6347 { 6348 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 6349 } 6350 6351 int 6352 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 6353 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6354 { 6355 struct spdk_bdev_desc *desc; 6356 struct spdk_bdev *bdev; 6357 int rc; 6358 6359 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 6360 if (rc != 0) { 6361 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 6362 return rc; 6363 } 6364 6365 bdev = spdk_bdev_desc_get_bdev(desc); 6366 6367 if (bdev->module != module) { 6368 spdk_bdev_close(desc); 6369 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 6370 bdev_name); 6371 return -ENODEV; 6372 } 6373 6374 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 6375 6376 spdk_bdev_close(desc); 6377 6378 return 0; 6379 } 6380 6381 static int 6382 bdev_start_qos(struct spdk_bdev *bdev) 6383 { 6384 struct set_qos_limit_ctx *ctx; 6385 6386 /* Enable QoS */ 6387 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 6388 ctx = calloc(1, sizeof(*ctx)); 6389 if (ctx == NULL) { 6390 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 6391 return -ENOMEM; 6392 } 6393 ctx->bdev = bdev; 6394 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6395 bdev_enable_qos_msg, ctx, 6396 bdev_enable_qos_done); 6397 } 6398 6399 return 0; 6400 } 6401 6402 static int 6403 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 6404 { 6405 struct spdk_thread *thread; 6406 int rc = 0; 6407 6408 thread = spdk_get_thread(); 6409 if (!thread) { 6410 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 6411 return -ENOTSUP; 6412 } 6413 6414 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6415 spdk_get_thread()); 6416 6417 desc->bdev = bdev; 6418 desc->thread = thread; 6419 desc->write = write; 6420 6421 pthread_mutex_lock(&bdev->internal.mutex); 6422 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 6423 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6424 pthread_mutex_unlock(&bdev->internal.mutex); 6425 return -ENODEV; 6426 } 6427 6428 if (write && bdev->internal.claim_module) { 6429 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 6430 bdev->name, bdev->internal.claim_module->name); 6431 pthread_mutex_unlock(&bdev->internal.mutex); 6432 return -EPERM; 6433 } 6434 6435 rc = bdev_start_qos(bdev); 6436 if (rc != 0) { 6437 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 6438 pthread_mutex_unlock(&bdev->internal.mutex); 6439 return rc; 6440 } 6441 6442 
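/* Callers reach this point through spdk_bdev_open_ext(). A minimal sketch of that
 * public path (the callback, descriptor variable, and bdev name are hypothetical,
 * not part of this file):
 *
 *   static void
 *   my_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
 *   {
 *           if (type == SPDK_BDEV_EVENT_REMOVE) {
 *                   spdk_bdev_close(my_desc);
 *           }
 *   }
 *
 *   rc = spdk_bdev_open_ext("Malloc0", true, my_event_cb, NULL, &my_desc);
 *
 * Note that spdk_bdev_close() must be called on the same thread that opened the
 * descriptor.
 */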
TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 6443 6444 pthread_mutex_unlock(&bdev->internal.mutex); 6445 6446 return 0; 6447 } 6448 6449 static int 6450 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 6451 struct spdk_bdev_desc **_desc) 6452 { 6453 struct spdk_bdev_desc *desc; 6454 unsigned int event_id; 6455 6456 desc = calloc(1, sizeof(*desc)); 6457 if (desc == NULL) { 6458 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 6459 return -ENOMEM; 6460 } 6461 6462 TAILQ_INIT(&desc->pending_media_events); 6463 TAILQ_INIT(&desc->free_media_events); 6464 6465 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 6466 desc->callback.event_fn = event_cb; 6467 desc->callback.ctx = event_ctx; 6468 pthread_mutex_init(&desc->mutex, NULL); 6469 6470 if (bdev->media_events) { 6471 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 6472 sizeof(*desc->media_events_buffer)); 6473 if (desc->media_events_buffer == NULL) { 6474 SPDK_ERRLOG("Failed to initialize media event pool\n"); 6475 bdev_desc_free(desc); 6476 return -ENOMEM; 6477 } 6478 6479 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 6480 TAILQ_INSERT_TAIL(&desc->free_media_events, 6481 &desc->media_events_buffer[event_id], tailq); 6482 } 6483 } 6484 6485 *_desc = desc; 6486 6487 return 0; 6488 } 6489 6490 int 6491 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 6492 void *event_ctx, struct spdk_bdev_desc **_desc) 6493 { 6494 struct spdk_bdev_desc *desc; 6495 struct spdk_bdev *bdev; 6496 int rc; 6497 6498 if (event_cb == NULL) { 6499 SPDK_ERRLOG("Missing event callback function\n"); 6500 return -EINVAL; 6501 } 6502 6503 pthread_mutex_lock(&g_bdev_mgr.mutex); 6504 6505 bdev = bdev_get_by_name(bdev_name); 6506 6507 if (bdev == NULL) { 6508 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 6509 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6510 return -ENODEV; 6511 } 6512 6513 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 6514 if (rc != 0) { 6515 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6516 return rc; 6517 } 6518 6519 rc = bdev_open(bdev, write, desc); 6520 if (rc != 0) { 6521 bdev_desc_free(desc); 6522 desc = NULL; 6523 } 6524 6525 *_desc = desc; 6526 6527 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6528 6529 return rc; 6530 } 6531 6532 static void 6533 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 6534 { 6535 int rc; 6536 6537 pthread_mutex_lock(&bdev->internal.mutex); 6538 pthread_mutex_lock(&desc->mutex); 6539 6540 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 6541 6542 desc->closed = true; 6543 6544 if (0 == desc->refs) { 6545 pthread_mutex_unlock(&desc->mutex); 6546 bdev_desc_free(desc); 6547 } else { 6548 pthread_mutex_unlock(&desc->mutex); 6549 } 6550 6551 /* If no more descriptors, kill QoS channel */ 6552 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6553 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 6554 bdev->name, spdk_get_thread()); 6555 6556 if (bdev_qos_destroy(bdev)) { 6557 /* There isn't anything we can do to recover here. Just let the 6558 * old QoS poller keep running. The QoS handling won't change 6559 * cores when the user allocates a new channel, but it won't break. */ 6560 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 6561 } 6562 } 6563 6564 spdk_bdev_set_qd_sampling_period(bdev, 0); 6565 6566 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6567 rc = bdev_unregister_unsafe(bdev); 6568 pthread_mutex_unlock(&bdev->internal.mutex); 6569 6570 if (rc == 0) { 6571 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6572 } 6573 } else { 6574 pthread_mutex_unlock(&bdev->internal.mutex); 6575 } 6576 } 6577 6578 void 6579 spdk_bdev_close(struct spdk_bdev_desc *desc) 6580 { 6581 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6582 6583 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6584 spdk_get_thread()); 6585 6586 assert(desc->thread == spdk_get_thread()); 6587 6588 spdk_poller_unregister(&desc->io_timeout_poller); 6589 6590 pthread_mutex_lock(&g_bdev_mgr.mutex); 6591 6592 bdev_close(bdev, desc); 6593 6594 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6595 } 6596 6597 int 6598 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 6599 struct spdk_bdev_module *module) 6600 { 6601 if (bdev->internal.claim_module != NULL) { 6602 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 6603 bdev->internal.claim_module->name); 6604 return -EPERM; 6605 } 6606 6607 if (desc && !desc->write) { 6608 desc->write = true; 6609 } 6610 6611 bdev->internal.claim_module = module; 6612 return 0; 6613 } 6614 6615 void 6616 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 6617 { 6618 assert(bdev->internal.claim_module != NULL); 6619 bdev->internal.claim_module = NULL; 6620 } 6621 6622 struct spdk_bdev * 6623 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 6624 { 6625 assert(desc != NULL); 6626 return desc->bdev; 6627 } 6628 6629 int 6630 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 6631 { 6632 struct spdk_bdev *bdev, *tmp; 6633 struct spdk_bdev_desc *desc; 6634 int rc = 0; 6635 6636 assert(fn != NULL); 6637 6638 pthread_mutex_lock(&g_bdev_mgr.mutex); 6639 bdev = spdk_bdev_first(); 6640 while (bdev != NULL) { 6641 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 6642 if (rc != 0) { 6643 break; 6644 } 6645 rc = bdev_open(bdev, false, desc); 6646 if (rc != 0) { 6647 bdev_desc_free(desc); 6648 break; 6649 } 6650 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6651 6652 rc = fn(ctx, bdev); 6653 6654 pthread_mutex_lock(&g_bdev_mgr.mutex); 6655 tmp = spdk_bdev_next(bdev); 6656 bdev_close(bdev, desc); 6657 if (rc != 0) { 6658 break; 6659 } 6660 bdev = tmp; 6661 } 6662 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6663 6664 return rc; 6665 } 6666 6667 int 6668 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 6669 { 6670 struct spdk_bdev *bdev, *tmp; 6671 struct spdk_bdev_desc *desc; 6672 int rc = 0; 6673 6674 assert(fn != NULL); 6675 6676 pthread_mutex_lock(&g_bdev_mgr.mutex); 6677 bdev = spdk_bdev_first_leaf(); 6678 while (bdev != NULL) { 6679 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 6680 if (rc != 0) { 6681 break; 6682 } 6683 rc = bdev_open(bdev, false, desc); 6684 if (rc != 0) { 6685 bdev_desc_free(desc); 6686 break; 6687 } 6688 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6689 6690 rc = fn(ctx, bdev); 6691 6692 pthread_mutex_lock(&g_bdev_mgr.mutex); 6693 tmp = spdk_bdev_next_leaf(bdev); 6694 bdev_close(bdev, desc); 6695 if (rc != 0) { 6696 break; 6697 } 6698 bdev = tmp; 6699 } 6700 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6701 6702 return rc; 6703 } 6704 6705 void 6706 
spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 6707 { 6708 struct iovec *iovs; 6709 int iovcnt; 6710 6711 if (bdev_io == NULL) { 6712 return; 6713 } 6714 6715 switch (bdev_io->type) { 6716 case SPDK_BDEV_IO_TYPE_READ: 6717 case SPDK_BDEV_IO_TYPE_WRITE: 6718 case SPDK_BDEV_IO_TYPE_ZCOPY: 6719 iovs = bdev_io->u.bdev.iovs; 6720 iovcnt = bdev_io->u.bdev.iovcnt; 6721 break; 6722 default: 6723 iovs = NULL; 6724 iovcnt = 0; 6725 break; 6726 } 6727 6728 if (iovp) { 6729 *iovp = iovs; 6730 } 6731 if (iovcntp) { 6732 *iovcntp = iovcnt; 6733 } 6734 } 6735 6736 void * 6737 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 6738 { 6739 if (bdev_io == NULL) { 6740 return NULL; 6741 } 6742 6743 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 6744 return NULL; 6745 } 6746 6747 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 6748 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 6749 return bdev_io->u.bdev.md_buf; 6750 } 6751 6752 return NULL; 6753 } 6754 6755 void * 6756 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 6757 { 6758 if (bdev_io == NULL) { 6759 assert(false); 6760 return NULL; 6761 } 6762 6763 return bdev_io->internal.caller_ctx; 6764 } 6765 6766 void 6767 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 6768 { 6769 6770 if (spdk_bdev_module_list_find(bdev_module->name)) { 6771 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 6772 assert(false); 6773 } 6774 6775 /* 6776 * Modules with examine callbacks must be initialized first, so they are 6777 * ready to handle examine callbacks from later modules that will 6778 * register physical bdevs. 6779 */ 6780 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 6781 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 6782 } else { 6783 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 6784 } 6785 } 6786 6787 struct spdk_bdev_module * 6788 spdk_bdev_module_list_find(const char *name) 6789 { 6790 struct spdk_bdev_module *bdev_module; 6791 6792 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 6793 if (strcmp(name, bdev_module->name) == 0) { 6794 break; 6795 } 6796 } 6797 6798 return bdev_module; 6799 } 6800 6801 static void 6802 bdev_write_zero_buffer_next(void *_bdev_io) 6803 { 6804 struct spdk_bdev_io *bdev_io = _bdev_io; 6805 uint64_t num_bytes, num_blocks; 6806 void *md_buf = NULL; 6807 int rc; 6808 6809 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 6810 bdev_io->u.bdev.split_remaining_num_blocks, 6811 ZERO_BUFFER_SIZE); 6812 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 6813 6814 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 6815 md_buf = (char *)g_bdev_mgr.zero_buffer + 6816 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 6817 } 6818 6819 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 6820 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6821 g_bdev_mgr.zero_buffer, md_buf, 6822 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 6823 bdev_write_zero_buffer_done, bdev_io); 6824 if (rc == 0) { 6825 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 6826 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 6827 } else if (rc == -ENOMEM) { 6828 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 6829 } else { 6830 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6831 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6832 } 6833 } 6834 6835 static void 6836 
bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6837 { 6838 struct spdk_bdev_io *parent_io = cb_arg; 6839 6840 spdk_bdev_free_io(bdev_io); 6841 6842 if (!success) { 6843 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6844 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6845 return; 6846 } 6847 6848 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 6849 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6850 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 6851 return; 6852 } 6853 6854 bdev_write_zero_buffer_next(parent_io); 6855 } 6856 6857 static void 6858 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 6859 { 6860 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6861 ctx->bdev->internal.qos_mod_in_progress = false; 6862 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6863 6864 if (ctx->cb_fn) { 6865 ctx->cb_fn(ctx->cb_arg, status); 6866 } 6867 free(ctx); 6868 } 6869 6870 static void 6871 bdev_disable_qos_done(void *cb_arg) 6872 { 6873 struct set_qos_limit_ctx *ctx = cb_arg; 6874 struct spdk_bdev *bdev = ctx->bdev; 6875 struct spdk_bdev_io *bdev_io; 6876 struct spdk_bdev_qos *qos; 6877 6878 pthread_mutex_lock(&bdev->internal.mutex); 6879 qos = bdev->internal.qos; 6880 bdev->internal.qos = NULL; 6881 pthread_mutex_unlock(&bdev->internal.mutex); 6882 6883 while (!TAILQ_EMPTY(&qos->queued)) { 6884 /* Send queued I/O back to their original thread for resubmission. */ 6885 bdev_io = TAILQ_FIRST(&qos->queued); 6886 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 6887 6888 if (bdev_io->internal.io_submit_ch) { 6889 /* 6890 * Channel was changed when sending it to the QoS thread - change it back 6891 * before sending it back to the original thread. 
6892 */ 6893 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 6894 bdev_io->internal.io_submit_ch = NULL; 6895 } 6896 6897 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6898 _bdev_io_submit, bdev_io); 6899 } 6900 6901 if (qos->thread != NULL) { 6902 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 6903 spdk_poller_unregister(&qos->poller); 6904 } 6905 6906 free(qos); 6907 6908 bdev_set_qos_limit_done(ctx, 0); 6909 } 6910 6911 static void 6912 bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 6913 { 6914 void *io_device = spdk_io_channel_iter_get_io_device(i); 6915 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6916 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6917 struct spdk_thread *thread; 6918 6919 pthread_mutex_lock(&bdev->internal.mutex); 6920 thread = bdev->internal.qos->thread; 6921 pthread_mutex_unlock(&bdev->internal.mutex); 6922 6923 if (thread != NULL) { 6924 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 6925 } else { 6926 bdev_disable_qos_done(ctx); 6927 } 6928 } 6929 6930 static void 6931 bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 6932 { 6933 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6934 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6935 6936 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 6937 6938 spdk_for_each_channel_continue(i, 0); 6939 } 6940 6941 static void 6942 bdev_update_qos_rate_limit_msg(void *cb_arg) 6943 { 6944 struct set_qos_limit_ctx *ctx = cb_arg; 6945 struct spdk_bdev *bdev = ctx->bdev; 6946 6947 pthread_mutex_lock(&bdev->internal.mutex); 6948 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 6949 pthread_mutex_unlock(&bdev->internal.mutex); 6950 6951 bdev_set_qos_limit_done(ctx, 0); 6952 } 6953 6954 static void 6955 bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 6956 { 6957 void *io_device = spdk_io_channel_iter_get_io_device(i); 6958 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6959 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6960 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6961 6962 pthread_mutex_lock(&bdev->internal.mutex); 6963 bdev_enable_qos(bdev, bdev_ch); 6964 pthread_mutex_unlock(&bdev->internal.mutex); 6965 spdk_for_each_channel_continue(i, 0); 6966 } 6967 6968 static void 6969 bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 6970 { 6971 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6972 6973 bdev_set_qos_limit_done(ctx, status); 6974 } 6975 6976 static void 6977 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 6978 { 6979 int i; 6980 6981 assert(bdev->internal.qos != NULL); 6982 6983 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6984 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6985 bdev->internal.qos->rate_limits[i].limit = limits[i]; 6986 6987 if (limits[i] == 0) { 6988 bdev->internal.qos->rate_limits[i].limit = 6989 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 6990 } 6991 } 6992 } 6993 } 6994 6995 void 6996 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 6997 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 6998 { 6999 struct set_qos_limit_ctx *ctx; 7000 uint32_t limit_set_complement; 7001 uint64_t min_limit_per_sec; 7002 int i; 7003 bool disable_rate_limit = true; 7004 7005 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7006 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7007 continue; 7008 } 7009 7010 if (limits[i] > 0) { 7011 
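/* Any positive limit for any type keeps QoS enabled; a limit of 0 (or
 * SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) clears that particular limit. A caller-side
 * sketch that caps read/write IOPS only, assuming the usual
 * SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT index from spdk/bdev.h and a hypothetical
 * completion callback (byte-type limits are passed in MB/s and converted below):
 *
 *   uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
 *   int i;
 *
 *   for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
 *           limits[i] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
 *   }
 *   limits[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT] = 10000;
 *   spdk_bdev_set_qos_rate_limits(bdev, limits, my_qos_done_cb, NULL);
 */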
disable_rate_limit = false; 7012 } 7013 7014 if (bdev_qos_is_iops_rate_limit(i) == true) { 7015 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 7016 } else { 7017 /* Change from megabyte to byte rate limit */ 7018 limits[i] = limits[i] * 1024 * 1024; 7019 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 7020 } 7021 7022 limit_set_complement = limits[i] % min_limit_per_sec; 7023 if (limit_set_complement) { 7024 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 7025 limits[i], min_limit_per_sec); 7026 limits[i] += min_limit_per_sec - limit_set_complement; 7027 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 7028 } 7029 } 7030 7031 ctx = calloc(1, sizeof(*ctx)); 7032 if (ctx == NULL) { 7033 cb_fn(cb_arg, -ENOMEM); 7034 return; 7035 } 7036 7037 ctx->cb_fn = cb_fn; 7038 ctx->cb_arg = cb_arg; 7039 ctx->bdev = bdev; 7040 7041 pthread_mutex_lock(&bdev->internal.mutex); 7042 if (bdev->internal.qos_mod_in_progress) { 7043 pthread_mutex_unlock(&bdev->internal.mutex); 7044 free(ctx); 7045 cb_fn(cb_arg, -EAGAIN); 7046 return; 7047 } 7048 bdev->internal.qos_mod_in_progress = true; 7049 7050 if (disable_rate_limit == true && bdev->internal.qos) { 7051 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7052 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 7053 (bdev->internal.qos->rate_limits[i].limit > 0 && 7054 bdev->internal.qos->rate_limits[i].limit != 7055 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 7056 disable_rate_limit = false; 7057 break; 7058 } 7059 } 7060 } 7061 7062 if (disable_rate_limit == false) { 7063 if (bdev->internal.qos == NULL) { 7064 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 7065 if (!bdev->internal.qos) { 7066 pthread_mutex_unlock(&bdev->internal.mutex); 7067 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 7068 bdev_set_qos_limit_done(ctx, -ENOMEM); 7069 return; 7070 } 7071 } 7072 7073 if (bdev->internal.qos->thread == NULL) { 7074 /* Enabling */ 7075 bdev_set_qos_rate_limits(bdev, limits); 7076 7077 spdk_for_each_channel(__bdev_to_io_dev(bdev), 7078 bdev_enable_qos_msg, ctx, 7079 bdev_enable_qos_done); 7080 } else { 7081 /* Updating */ 7082 bdev_set_qos_rate_limits(bdev, limits); 7083 7084 spdk_thread_send_msg(bdev->internal.qos->thread, 7085 bdev_update_qos_rate_limit_msg, ctx); 7086 } 7087 } else { 7088 if (bdev->internal.qos != NULL) { 7089 bdev_set_qos_rate_limits(bdev, limits); 7090 7091 /* Disabling */ 7092 spdk_for_each_channel(__bdev_to_io_dev(bdev), 7093 bdev_disable_qos_msg, ctx, 7094 bdev_disable_qos_msg_done); 7095 } else { 7096 pthread_mutex_unlock(&bdev->internal.mutex); 7097 bdev_set_qos_limit_done(ctx, 0); 7098 return; 7099 } 7100 } 7101 7102 pthread_mutex_unlock(&bdev->internal.mutex); 7103 } 7104 7105 struct spdk_bdev_histogram_ctx { 7106 spdk_bdev_histogram_status_cb cb_fn; 7107 void *cb_arg; 7108 struct spdk_bdev *bdev; 7109 int status; 7110 }; 7111 7112 static void 7113 bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status) 7114 { 7115 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7116 7117 pthread_mutex_lock(&ctx->bdev->internal.mutex); 7118 ctx->bdev->internal.histogram_in_progress = false; 7119 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 7120 ctx->cb_fn(ctx->cb_arg, ctx->status); 7121 free(ctx); 7122 } 7123 7124 static void 7125 bdev_histogram_disable_channel(struct spdk_io_channel_iter *i) 7126 { 7127 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7128 struct spdk_bdev_channel *ch = 
spdk_io_channel_get_ctx(_ch); 7129 7130 if (ch->histogram != NULL) { 7131 spdk_histogram_data_free(ch->histogram); 7132 ch->histogram = NULL; 7133 } 7134 spdk_for_each_channel_continue(i, 0); 7135 } 7136 7137 static void 7138 bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status) 7139 { 7140 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7141 7142 if (status != 0) { 7143 ctx->status = status; 7144 ctx->bdev->internal.histogram_enabled = false; 7145 spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx, 7146 bdev_histogram_disable_channel_cb); 7147 } else { 7148 pthread_mutex_lock(&ctx->bdev->internal.mutex); 7149 ctx->bdev->internal.histogram_in_progress = false; 7150 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 7151 ctx->cb_fn(ctx->cb_arg, ctx->status); 7152 free(ctx); 7153 } 7154 } 7155 7156 static void 7157 bdev_histogram_enable_channel(struct spdk_io_channel_iter *i) 7158 { 7159 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7160 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7161 int status = 0; 7162 7163 if (ch->histogram == NULL) { 7164 ch->histogram = spdk_histogram_data_alloc(); 7165 if (ch->histogram == NULL) { 7166 status = -ENOMEM; 7167 } 7168 } 7169 7170 spdk_for_each_channel_continue(i, status); 7171 } 7172 7173 void 7174 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 7175 void *cb_arg, bool enable) 7176 { 7177 struct spdk_bdev_histogram_ctx *ctx; 7178 7179 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 7180 if (ctx == NULL) { 7181 cb_fn(cb_arg, -ENOMEM); 7182 return; 7183 } 7184 7185 ctx->bdev = bdev; 7186 ctx->status = 0; 7187 ctx->cb_fn = cb_fn; 7188 ctx->cb_arg = cb_arg; 7189 7190 pthread_mutex_lock(&bdev->internal.mutex); 7191 if (bdev->internal.histogram_in_progress) { 7192 pthread_mutex_unlock(&bdev->internal.mutex); 7193 free(ctx); 7194 cb_fn(cb_arg, -EAGAIN); 7195 return; 7196 } 7197 7198 bdev->internal.histogram_in_progress = true; 7199 pthread_mutex_unlock(&bdev->internal.mutex); 7200 7201 bdev->internal.histogram_enabled = enable; 7202 7203 if (enable) { 7204 /* Allocate histogram for each channel */ 7205 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx, 7206 bdev_histogram_enable_channel_cb); 7207 } else { 7208 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx, 7209 bdev_histogram_disable_channel_cb); 7210 } 7211 } 7212 7213 struct spdk_bdev_histogram_data_ctx { 7214 spdk_bdev_histogram_data_cb cb_fn; 7215 void *cb_arg; 7216 struct spdk_bdev *bdev; 7217 /** merged histogram data from all channels */ 7218 struct spdk_histogram_data *histogram; 7219 }; 7220 7221 static void 7222 bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status) 7223 { 7224 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7225 7226 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 7227 free(ctx); 7228 } 7229 7230 static void 7231 bdev_histogram_get_channel(struct spdk_io_channel_iter *i) 7232 { 7233 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7234 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7235 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7236 int status = 0; 7237 7238 if (ch->histogram == NULL) { 7239 status = -EFAULT; 7240 } else { 7241 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 7242 } 7243 7244 spdk_for_each_channel_continue(i, 
status); 7245 } 7246 7247 void 7248 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 7249 spdk_bdev_histogram_data_cb cb_fn, 7250 void *cb_arg) 7251 { 7252 struct spdk_bdev_histogram_data_ctx *ctx; 7253 7254 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 7255 if (ctx == NULL) { 7256 cb_fn(cb_arg, -ENOMEM, NULL); 7257 return; 7258 } 7259 7260 ctx->bdev = bdev; 7261 ctx->cb_fn = cb_fn; 7262 ctx->cb_arg = cb_arg; 7263 7264 ctx->histogram = histogram; 7265 7266 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx, 7267 bdev_histogram_get_channel_cb); 7268 } 7269 7270 size_t 7271 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 7272 size_t max_events) 7273 { 7274 struct media_event_entry *entry; 7275 size_t num_events = 0; 7276 7277 for (; num_events < max_events; ++num_events) { 7278 entry = TAILQ_FIRST(&desc->pending_media_events); 7279 if (entry == NULL) { 7280 break; 7281 } 7282 7283 events[num_events] = entry->event; 7284 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 7285 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 7286 } 7287 7288 return num_events; 7289 } 7290 7291 int 7292 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 7293 size_t num_events) 7294 { 7295 struct spdk_bdev_desc *desc; 7296 struct media_event_entry *entry; 7297 size_t event_id; 7298 int rc = 0; 7299 7300 assert(bdev->media_events); 7301 7302 pthread_mutex_lock(&bdev->internal.mutex); 7303 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 7304 if (desc->write) { 7305 break; 7306 } 7307 } 7308 7309 if (desc == NULL || desc->media_events_buffer == NULL) { 7310 rc = -ENODEV; 7311 goto out; 7312 } 7313 7314 for (event_id = 0; event_id < num_events; ++event_id) { 7315 entry = TAILQ_FIRST(&desc->free_media_events); 7316 if (entry == NULL) { 7317 break; 7318 } 7319 7320 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 7321 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 7322 entry->event = events[event_id]; 7323 } 7324 7325 rc = event_id; 7326 out: 7327 pthread_mutex_unlock(&bdev->internal.mutex); 7328 return rc; 7329 } 7330 7331 void 7332 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 7333 { 7334 struct spdk_bdev_desc *desc; 7335 7336 pthread_mutex_lock(&bdev->internal.mutex); 7337 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 7338 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 7339 desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, 7340 desc->callback.ctx); 7341 } 7342 } 7343 pthread_mutex_unlock(&bdev->internal.mutex); 7344 } 7345 7346 struct locked_lba_range_ctx { 7347 struct lba_range range; 7348 struct spdk_bdev *bdev; 7349 struct lba_range *current_range; 7350 struct lba_range *owner_range; 7351 struct spdk_poller *poller; 7352 lock_range_cb cb_fn; 7353 void *cb_arg; 7354 }; 7355 7356 static void 7357 bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status) 7358 { 7359 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7360 7361 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 7362 free(ctx); 7363 } 7364 7365 static void 7366 bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i); 7367 7368 static void 7369 bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 7370 { 7371 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7372 struct spdk_bdev *bdev = ctx->bdev; 7373 7374 if (status == -ENOMEM) { 7375 /* One of 
the channels could not allocate a range object. 7376 * So we have to go back and clean up any ranges that were 7377 * allocated successfully before we return error status to 7378 * the caller. We can reuse the unlock function to do that 7379 * clean up. 7380 */ 7381 spdk_for_each_channel(__bdev_to_io_dev(bdev), 7382 bdev_unlock_lba_range_get_channel, ctx, 7383 bdev_lock_error_cleanup_cb); 7384 return; 7385 } 7386 7387 /* All channels have locked this range and no I/O overlapping the range 7388 * are outstanding! Set the owner_ch for the range object for the 7389 * locking channel, so that this channel will know that it is allowed 7390 * to write to this range. 7391 */ 7392 ctx->owner_range->owner_ch = ctx->range.owner_ch; 7393 ctx->cb_fn(ctx->cb_arg, status); 7394 7395 /* Don't free the ctx here. Its range is in the bdev's global list of 7396 * locked ranges still, and will be removed and freed when this range 7397 * is later unlocked. 7398 */ 7399 } 7400 7401 static int 7402 bdev_lock_lba_range_check_io(void *_i) 7403 { 7404 struct spdk_io_channel_iter *i = _i; 7405 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7406 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7407 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7408 struct lba_range *range = ctx->current_range; 7409 struct spdk_bdev_io *bdev_io; 7410 7411 spdk_poller_unregister(&ctx->poller); 7412 7413 /* The range is now in the locked_ranges, so no new IO can be submitted to this 7414 * range. But we need to wait until any outstanding IO overlapping with this range 7415 * are completed. 7416 */ 7417 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 7418 if (bdev_io_range_is_locked(bdev_io, range)) { 7419 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 7420 return SPDK_POLLER_BUSY; 7421 } 7422 } 7423 7424 spdk_for_each_channel_continue(i, 0); 7425 return SPDK_POLLER_BUSY; 7426 } 7427 7428 static void 7429 bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i) 7430 { 7431 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7432 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7433 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7434 struct lba_range *range; 7435 7436 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 7437 if (range->length == ctx->range.length && 7438 range->offset == ctx->range.offset && 7439 range->locked_ctx == ctx->range.locked_ctx) { 7440 /* This range already exists on this channel, so don't add 7441 * it again. This can happen when a new channel is created 7442 * while the for_each_channel operation is in progress. 7443 * Do not check for outstanding I/O in that case, since the 7444 * range was locked before any I/O could be submitted to the 7445 * new channel. 7446 */ 7447 spdk_for_each_channel_continue(i, 0); 7448 return; 7449 } 7450 } 7451 7452 range = calloc(1, sizeof(*range)); 7453 if (range == NULL) { 7454 spdk_for_each_channel_continue(i, -ENOMEM); 7455 return; 7456 } 7457 7458 range->length = ctx->range.length; 7459 range->offset = ctx->range.offset; 7460 range->locked_ctx = ctx->range.locked_ctx; 7461 ctx->current_range = range; 7462 if (ctx->range.owner_ch == ch) { 7463 /* This is the range object for the channel that will hold 7464 * the lock. Store it in the ctx object so that we can easily 7465 * set its owner_ch after the lock is finally acquired. 
7466 */ 7467 ctx->owner_range = range; 7468 } 7469 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 7470 bdev_lock_lba_range_check_io(i); 7471 } 7472 7473 static void 7474 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 7475 { 7476 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 7477 7478 /* We will add a copy of this range to each channel now. */ 7479 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx, 7480 bdev_lock_lba_range_cb); 7481 } 7482 7483 static bool 7484 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 7485 { 7486 struct lba_range *r; 7487 7488 TAILQ_FOREACH(r, tailq, tailq) { 7489 if (bdev_lba_range_overlapped(range, r)) { 7490 return true; 7491 } 7492 } 7493 return false; 7494 } 7495 7496 static int 7497 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 7498 uint64_t offset, uint64_t length, 7499 lock_range_cb cb_fn, void *cb_arg) 7500 { 7501 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7502 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7503 struct locked_lba_range_ctx *ctx; 7504 7505 if (cb_arg == NULL) { 7506 SPDK_ERRLOG("cb_arg must not be NULL\n"); 7507 return -EINVAL; 7508 } 7509 7510 ctx = calloc(1, sizeof(*ctx)); 7511 if (ctx == NULL) { 7512 return -ENOMEM; 7513 } 7514 7515 ctx->range.offset = offset; 7516 ctx->range.length = length; 7517 ctx->range.owner_ch = ch; 7518 ctx->range.locked_ctx = cb_arg; 7519 ctx->bdev = bdev; 7520 ctx->cb_fn = cb_fn; 7521 ctx->cb_arg = cb_arg; 7522 7523 pthread_mutex_lock(&bdev->internal.mutex); 7524 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 7525 /* There is an active lock overlapping with this range. 7526 * Put it on the pending list until this range no 7527 * longer overlaps with another. 7528 */ 7529 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 7530 } else { 7531 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 7532 bdev_lock_lba_range_ctx(bdev, ctx); 7533 } 7534 pthread_mutex_unlock(&bdev->internal.mutex); 7535 return 0; 7536 } 7537 7538 static void 7539 bdev_lock_lba_range_ctx_msg(void *_ctx) 7540 { 7541 struct locked_lba_range_ctx *ctx = _ctx; 7542 7543 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 7544 } 7545 7546 static void 7547 bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 7548 { 7549 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7550 struct locked_lba_range_ctx *pending_ctx; 7551 struct spdk_bdev_channel *ch = ctx->range.owner_ch; 7552 struct spdk_bdev *bdev = ch->bdev; 7553 struct lba_range *range, *tmp; 7554 7555 pthread_mutex_lock(&bdev->internal.mutex); 7556 /* Check if there are any pending locked ranges that overlap with this range 7557 * that was just unlocked. If there are, check that it doesn't overlap with any 7558 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 7559 * the lock process. 
	 */
	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
		if (bdev_lba_range_overlapped(range, &ctx->range) &&
		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
			spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel),
					     bdev_lock_lba_range_ctx_msg, pending_ctx);
		}
	}
	pthread_mutex_unlock(&bdev->internal.mutex);

	ctx->cb_fn(ctx->cb_arg, status);
	free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	TAILQ_HEAD(, spdk_bdev_io) io_locked;
	struct spdk_bdev_io *bdev_io;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (ctx->range.offset == range->offset &&
		    ctx->range.length == range->length &&
		    ctx->range.locked_ctx == range->locked_ctx) {
			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
			free(range);
			break;
		}
	}

	/* Note: we should almost always be able to assert that the range specified
	 * was found.  But there are some very rare corner cases where a new channel
	 * gets created simultaneously with a range unlock, where this function
	 * would execute on that new channel and wouldn't have the range.
	 * We also use this to clean up range allocations when a later allocation
	 * fails in the locking path.
	 * So we can't actually assert() here.
	 */

	/* Swap the locked IO into a temporary list, and then try to submit them again.
	 * We could hyper-optimize this to only resubmit locked I/O that overlap
	 * with the range that was just unlocked, but this isn't a performance path so
	 * we go for simplicity here.
	 */
	TAILQ_INIT(&io_locked);
	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
	while (!TAILQ_EMPTY(&io_locked)) {
		bdev_io = TAILQ_FIRST(&io_locked);
		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
		bdev_io_submit(bdev_io);
	}

	spdk_for_each_channel_continue(i, 0);
}

static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx;
	struct lba_range *range;
	bool range_found = false;

	/* Let's make sure the specified channel actually has a lock on
	 * the specified range.  Note that the range must match exactly.
	 */
	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
			range_found = true;
			break;
		}
	}

	if (!range_found) {
		return -EINVAL;
	}

	pthread_mutex_lock(&bdev->internal.mutex);
	/* We confirmed that this channel has locked the specified range.
	 * To start the unlock process, we find the range in the bdev's locked_ranges
	 * and remove it.  This ensures new channels don't inherit the locked range.
	 * Then we will send a message to each channel (including the one specified
	 * here) to remove the range from its per-channel list.
	 */
	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->locked_ctx == cb_arg) {
			break;
		}
	}
	if (range == NULL) {
		assert(false);
		pthread_mutex_unlock(&bdev->internal.mutex);
		return -EINVAL;
	}
	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
	pthread_mutex_unlock(&bdev->internal.mutex);

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx,
			      bdev_unlock_lba_range_cb);
	return 0;
}

int
spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
			     int array_size)
{
	if (!bdev) {
		return -EINVAL;
	}

	if (bdev->fn_table->get_memory_domains) {
		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
	}

	return 0;
}

SPDK_LOG_REGISTER_COMPONENT(bdev)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_IO_START", TRACE_BDEV_IO_START,
			OWNER_BDEV, OBJECT_BDEV_IO, 1,
			{
				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
		{
			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
			OWNER_BDEV, OBJECT_BDEV_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
			OWNER_BDEV, OBJECT_NONE, 1,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
		{
			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
			OWNER_BDEV, OBJECT_NONE, 0,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
	};

	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
}
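
/* Illustrative sketch, compiled out with #if 0: how a caller inside this file
 * might pair bdev_lock_lba_range() and bdev_unlock_lba_range() to serialize a
 * read-modify-write against all other channels.  The names rmw_ctx, rmw_start,
 * rmw_range_locked_cb and rmw_range_unlocked_cb are hypothetical.  Note that
 * cb_arg doubles as the lock's locked_ctx key, so the same non-NULL pointer
 * must be passed to the matching unlock call, and the unlock must be issued
 * from the channel that owns the lock.
 */
#if 0
struct rmw_ctx {
	struct spdk_bdev_desc	*desc;
	struct spdk_io_channel	*ch;
	uint64_t		offset;
	uint64_t		length;
};

static void
rmw_range_unlocked_cb(void *cb_arg, int status)
{
	/* The range is unlocked and any I/O queued against it has been resubmitted. */
	free(cb_arg);
}

static void
rmw_range_locked_cb(void *cb_arg, int status)
{
	struct rmw_ctx *rmw = cb_arg;

	if (status != 0) {
		/* The lock could not be taken on every channel (e.g. -ENOMEM). */
		free(rmw);
		return;
	}

	/* Only rmw->ch may now write to [offset, offset + length).  Perform the
	 * read-modify-write here, then release the lock from the same channel
	 * with the same cb_arg that was used to take it.
	 */
	bdev_unlock_lba_range(rmw->desc, rmw->ch, rmw->offset, rmw->length,
			      rmw_range_unlocked_cb, rmw);
}

static int
rmw_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
	  uint64_t offset, uint64_t length)
{
	struct rmw_ctx *rmw;

	rmw = calloc(1, sizeof(*rmw));
	if (rmw == NULL) {
		return -ENOMEM;
	}
	rmw->desc = desc;
	rmw->ch = ch;
	rmw->offset = offset;
	rmw->length = length;

	/* If an overlapping lock is already held, the request is queued on the
	 * bdev's pending list and the callback fires once the lock is granted.
	 */
	return bdev_lock_lba_range(desc, ch, offset, length, rmw_range_locked_cb, rmw);
}
#endif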
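
/* Illustrative sketch, compiled out with #if 0: querying the memory domains a
 * bdev can consume via spdk_bdev_get_memory_domains().  print_bdev_memory_domains
 * is a hypothetical helper; it assumes the module's get_memory_domains() callback
 * follows the usual convention of returning the total domain count even when
 * array_size is too small, so a first call with NULL/0 can be used to size the
 * array.
 */
#if 0
static int
print_bdev_memory_domains(struct spdk_bdev *bdev)
{
	struct spdk_memory_domain **domains;
	int num_domains, count, i;

	/* First call: ask only for the number of domains the bdev reports. */
	num_domains = spdk_bdev_get_memory_domains(bdev, NULL, 0);
	if (num_domains <= 0) {
		/* Negative is an error; zero means the bdev reports no domains. */
		return num_domains;
	}

	domains = calloc(num_domains, sizeof(*domains));
	if (domains == NULL) {
		return -ENOMEM;
	}

	/* Second call: fill the array. */
	count = spdk_bdev_get_memory_domains(bdev, domains, num_domains);
	if (count < 0) {
		free(domains);
		return count;
	}

	for (i = 0; i < spdk_min(count, num_domains); i++) {
		SPDK_NOTICELOG("bdev %s uses memory domain %p\n",
			       spdk_bdev_get_name(bdev), (void *)domains[i]);
	}

	free(domains);
	return 0;
}
#endif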