/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *   Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define NOMEM_THRESHOLD_COUNT			8
#define ZERO_BUFFER_SIZE			0x100000

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

#define SPDK_BDEV_POOL_ALIGNMENT 512

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
	"rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
};

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	pthread_mutex_t mutex;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.mutex = PTHREAD_MUTEX_INITIALIZER,
};

typedef void (*lock_range_cb)(void *ctx, int status);

struct lba_range {
	uint64_t	offset;
	uint64_t	length;
	void		*locked_ctx;
	struct spdk_bdev_channel *owner_ch;
	TAILQ_ENTRY(lba_range) tailq;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.small_buf_pool_size = BUF_SMALL_POOL_SIZE,
	.large_buf_pool_size = BUF_LARGE_POOL_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in the current timeslice (e.g., 1ms).
	 * For remaining bytes, this is allowed to go negative if an I/O is submitted
	 * while some bytes remain but the I/O is larger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one entry per limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;
	uint32_t	bdev_io_cache_size;

	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here, which makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t		nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t		nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel	*shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t		ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t		io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t		io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a
	 * locked LBA range.
	 */
288 */ 289 bdev_io_tailq_t io_locked; 290 291 uint32_t flags; 292 293 struct spdk_histogram_data *histogram; 294 295 #ifdef SPDK_CONFIG_VTUNE 296 uint64_t start_tsc; 297 uint64_t interval_tsc; 298 __itt_string_handle *handle; 299 struct spdk_bdev_io_stat prev_stat; 300 #endif 301 302 bdev_io_tailq_t queued_resets; 303 304 lba_range_tailq_t locked_ranges; 305 }; 306 307 struct media_event_entry { 308 struct spdk_bdev_media_event event; 309 TAILQ_ENTRY(media_event_entry) tailq; 310 }; 311 312 #define MEDIA_EVENT_POOL_SIZE 64 313 314 struct spdk_bdev_desc { 315 struct spdk_bdev *bdev; 316 struct spdk_thread *thread; 317 struct { 318 spdk_bdev_event_cb_t event_fn; 319 void *ctx; 320 } callback; 321 bool closed; 322 bool write; 323 pthread_mutex_t mutex; 324 uint32_t refs; 325 TAILQ_HEAD(, media_event_entry) pending_media_events; 326 TAILQ_HEAD(, media_event_entry) free_media_events; 327 struct media_event_entry *media_events_buffer; 328 TAILQ_ENTRY(spdk_bdev_desc) link; 329 330 uint64_t timeout_in_sec; 331 spdk_bdev_io_timeout_cb cb_fn; 332 void *cb_arg; 333 struct spdk_poller *io_timeout_poller; 334 }; 335 336 struct spdk_bdev_iostat_ctx { 337 struct spdk_bdev_io_stat *stat; 338 spdk_bdev_get_device_stat_cb cb; 339 void *cb_arg; 340 }; 341 342 struct set_qos_limit_ctx { 343 void (*cb_fn)(void *cb_arg, int status); 344 void *cb_arg; 345 struct spdk_bdev *bdev; 346 }; 347 348 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 349 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 350 351 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 352 static void bdev_write_zero_buffer_next(void *_bdev_io); 353 354 static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i); 355 static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status); 356 357 static int 358 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 359 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 360 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 361 struct spdk_bdev_ext_io_opts *opts); 362 static int 363 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 364 struct iovec *iov, int iovcnt, void *md_buf, 365 uint64_t offset_blocks, uint64_t num_blocks, 366 spdk_bdev_io_completion_cb cb, void *cb_arg, 367 struct spdk_bdev_ext_io_opts *opts); 368 369 static int 370 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 371 uint64_t offset, uint64_t length, 372 lock_range_cb cb_fn, void *cb_arg); 373 374 static int 375 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 376 uint64_t offset, uint64_t length, 377 lock_range_cb cb_fn, void *cb_arg); 378 379 static inline void bdev_io_complete(void *ctx); 380 381 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 382 static bool bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort); 383 384 void 385 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 386 { 387 if (!opts) { 388 SPDK_ERRLOG("opts should not be NULL\n"); 389 return; 390 } 391 392 if (!opts_size) { 393 SPDK_ERRLOG("opts_size should not be zero value\n"); 394 return; 395 } 396 397 opts->opts_size = opts_size; 398 399 #define SET_FIELD(field) \ 400 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 401 opts->field = g_bdev_opts.field; \ 402 } \ 403 404 
	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	/* Do not remove this statement. When adding a new field, update this assert
	 * and remember to add a SET_FIELD statement for the new field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

	if (opts->small_buf_pool_size < BUF_SMALL_POOL_SIZE) {
		SPDK_ERRLOG("small_buf_pool_size must be at least %" PRIu32 "\n", BUF_SMALL_POOL_SIZE);
		return -1;
	}

	if (opts->large_buf_pool_size < BUF_LARGE_POOL_SIZE) {
		SPDK_ERRLOG("large_buf_pool_size must be at least %" PRIu32 "\n", BUF_LARGE_POOL_SIZE);
		return -1;
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}

static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	pthread_mutex_lock(&g_bdev_mgr.mutex);
	bdev = bdev_get_by_name(bdev_name);
	pthread_mutex_unlock(&g_bdev_mgr.mutex);

	return bdev;
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller		*poller;
	spdk_bdev_wait_for_examine_cb	cb_fn;
	void				*cb_arg;
};

static bool
bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

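/*
 * Illustrative usage sketch (not part of this file): callers register a callback
 * that fires once no module has an examine or init action still in progress, e.g.
 *
 *	static void examine_done(void *ctx) { ... }
 *
 *	if (spdk_bdev_wait_for_examine(examine_done, ctx) != 0) {
 *		// the only failure mode is -ENOMEM for the context allocation
 *	}
 */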
int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	uint32_t action;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config && bdev_ok_to_examine(bdev)) {
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n",
					    module->name);
			}
		}
	}

	if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) {
		if (bdev->internal.claim_module->examine_disk) {
			bdev->internal.claim_module->internal.action_in_progress++;
			bdev->internal.claim_module->examine_disk(bdev);
		}
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_disk && bdev_ok_to_examine(bdev)) {
			module->internal.action_in_progress++;
			module->examine_disk(bdev);
		}
	}
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

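	/* If the bdev is already registered, examine it now; otherwise it will be
	 * examined when it is registered, since its name is now on the allowlist. */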
	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_module == NULL) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static void
_copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt)
{
	int i;
	size_t len;

	for (i = 0; i < iovcnt; i++) {
		len = spdk_min(iovs[i].iov_len, buf_len);
		memcpy(buf, iovs[i].iov_base, len);
		buf += len;
		buf_len -= len;
	}
}

static void
_copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len)
{
	int i;
	size_t len;

	for (i = 0; i < iovcnt; i++) {
		len = spdk_min(iovs[i].iov_len, buf_len);
		memcpy(iovs[i].iov_base, buf, len);
		buf += len;
		buf_len -= len;
	}
}

static void
_bdev_io_set_bounce_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	/* save original iovec */
	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;
	/* if this is write path, copy data from original buffer to bounce buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt);
	}
}

static void
_bdev_io_set_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	/* save original md_buf */
	bdev_io->internal.orig_md_buf = bdev_io->u.bdev.md_buf;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		memcpy(md_buf, bdev_io->internal.orig_md_buf, len);
	}
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, void *buf, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.buf = buf;
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t md_len, alignment;
	void *aligned_buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, buf, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_set_bounce_buf(bdev_io, aligned_buf, len);
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	if (spdk_bdev_is_md_separate(bdev)) {
		aligned_buf = (char *)aligned_buf + len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)aligned_buf & (alignment - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_set_bounce_md_buf(bdev_io, aligned_buf, md_len);
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, aligned_buf, md_len);
		}
	}
	bdev_io_get_buf_complete(bdev_io, buf, true);
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_mempool *pool;
	struct spdk_bdev_io *tmp;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *ch;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ?
		 bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
	alignment = spdk_bdev_get_buf_align(bdev);
	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
	    SPDK_BDEV_POOL_ALIGNMENT) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &ch->need_buf_large;
	}

	if (STAILQ_EMPTY(stailq)) {
		spdk_mempool_put(pool, buf);
	} else {
		tmp = STAILQ_FIRST(stailq);
		STAILQ_REMOVE_HEAD(stailq, internal.buf_link);
		_bdev_io_set_buf(tmp, buf, tmp->internal.buf_len);
	}
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.buf != NULL);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
	bdev_io->internal.buf = NULL;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static void
_bdev_io_unset_bounce_buf(struct spdk_bdev_io *bdev_io)
{
	if (spdk_likely(bdev_io->internal.orig_iovcnt == 0)) {
		assert(bdev_io->internal.orig_md_buf == NULL);
		return;
	}

	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
	    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
				  bdev_io->internal.orig_iovcnt,
				  bdev_io->internal.bounce_iov.iov_base,
				  bdev_io->internal.bounce_iov.iov_len);
	}
	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
	/* disable bouncing buffer for this io */
	bdev_io->internal.orig_iovcnt = 0;
	bdev_io->internal.orig_iovs = NULL;

	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.orig_md_buf != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
		    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
			memcpy(bdev_io->internal.orig_md_buf, bdev_io->u.bdev.md_buf,
			       bdev_io->u.bdev.num_blocks * spdk_bdev_get_md_size(bdev_io->bdev));
		}

		bdev_io->u.bdev.md_buf = bdev_io->internal.orig_md_buf;
		bdev_io->internal.orig_md_buf = NULL;
	}

	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);
}

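/* A data buffer is served from the small pool when the requested length plus
 * alignment padding and any separate metadata fits a small-pool element;
 * otherwise it comes from the large pool. If the chosen pool is empty, the
 * bdev_io is parked on the mgmt channel's need_buf queue until a buffer is
 * returned via _bdev_io_put_buf(). */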
static void
bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_mempool *pool;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	uint64_t alignment, md_len;
	void *buf;

	alignment = spdk_bdev_get_buf_align(bdev);
	md_len = spdk_bdev_is_md_separate(bdev) ?
		 bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) +
	    SPDK_BDEV_POOL_ALIGNMENT) {
		SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n",
			    len + alignment);
		bdev_io_get_buf_complete(bdev_io, NULL, false);
		return;
	}

	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	bdev_io->internal.buf_len = len;

	if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
	    SPDK_BDEV_POOL_ALIGNMENT) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &mgmt_ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &mgmt_ch->need_buf_large;
	}

	buf = spdk_mempool_get(pool);
	if (!buf) {
		STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link);
	} else {
		_bdev_io_set_buf(bdev_io, buf, len);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t alignment;

	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	alignment = spdk_bdev_get_buf_align(bdev);

	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
		/* Buffer already present and aligned */
		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
		return;
	}

	bdev_io_get_buf(bdev_io, len);
}

void
spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(cb != NULL);
	assert(bdev_io->internal.get_aux_buf_cb == NULL);
	bdev_io->internal.get_aux_buf_cb = cb;
	bdev_io_get_buf(bdev_io, len);
}

static int
bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

static void
bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	int i;
	struct spdk_bdev_qos *qos = bdev->internal.qos;
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	if (!qos) {
		return;
	}

	spdk_bdev_get_qos_rate_limits(bdev, limits);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] > 0) {
			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
		}
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

void
spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_module *bdev_module;
	struct spdk_bdev *bdev;

	assert(w != NULL);

	spdk_json_write_array_begin(w);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_options");
	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
	spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
	spdk_json_write_object_end(w);
	spdk_json_write_object_end(w);

	bdev_examine_allowlist_config_json(w);

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->config_json) {
			bdev_module->config_json(w);
		}
	}

	pthread_mutex_lock(&g_bdev_mgr.mutex);

	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
		if (bdev->fn_table->write_config_json) {
			bdev->fn_table->write_config_json(bdev, w);
		}

		bdev_qos_config_json(bdev, w);
	}

	pthread_mutex_unlock(&g_bdev_mgr.mutex);

	/* This has to be last RPC in array to make sure all bdevs finished examine */
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_wait_for_examine");
	spdk_json_write_object_end(w);

	spdk_json_write_array_end(w);
}

static int
bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;
	uint32_t i;

	STAILQ_INIT(&ch->need_buf_small);
	STAILQ_INIT(&ch->need_buf_large);

	STAILQ_INIT(&ch->per_thread_cache);
	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;

	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		assert(bdev_io != NULL);
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;

	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
	}

	if (!TAILQ_EMPTY(&ch->shared_resources)) {
		SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
	}

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static void
bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
1242 */ 1243 if (rc == 0) { 1244 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1245 if (m->init_complete) { 1246 m->init_complete(); 1247 } 1248 } 1249 } 1250 1251 cb_fn(cb_arg, rc); 1252 } 1253 1254 static bool 1255 bdev_module_all_actions_completed(void) 1256 { 1257 struct spdk_bdev_module *m; 1258 1259 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1260 if (m->internal.action_in_progress > 0) { 1261 return false; 1262 } 1263 } 1264 return true; 1265 } 1266 1267 static void 1268 bdev_module_action_complete(void) 1269 { 1270 /* 1271 * Don't finish bdev subsystem initialization if 1272 * module pre-initialization is still in progress, or 1273 * the subsystem been already initialized. 1274 */ 1275 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 1276 return; 1277 } 1278 1279 /* 1280 * Check all bdev modules for inits/examinations in progress. If any 1281 * exist, return immediately since we cannot finish bdev subsystem 1282 * initialization until all are completed. 1283 */ 1284 if (!bdev_module_all_actions_completed()) { 1285 return; 1286 } 1287 1288 /* 1289 * Modules already finished initialization - now that all 1290 * the bdev modules have finished their asynchronous I/O 1291 * processing, the entire bdev layer can be marked as complete. 1292 */ 1293 bdev_init_complete(0); 1294 } 1295 1296 static void 1297 bdev_module_action_done(struct spdk_bdev_module *module) 1298 { 1299 assert(module->internal.action_in_progress > 0); 1300 module->internal.action_in_progress--; 1301 bdev_module_action_complete(); 1302 } 1303 1304 void 1305 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1306 { 1307 bdev_module_action_done(module); 1308 } 1309 1310 void 1311 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1312 { 1313 bdev_module_action_done(module); 1314 } 1315 1316 /** The last initialized bdev module */ 1317 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1318 1319 static void 1320 bdev_init_failed(void *cb_arg) 1321 { 1322 struct spdk_bdev_module *module = cb_arg; 1323 1324 module->internal.action_in_progress--; 1325 bdev_init_complete(-1); 1326 } 1327 1328 static int 1329 bdev_modules_init(void) 1330 { 1331 struct spdk_bdev_module *module; 1332 int rc = 0; 1333 1334 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1335 g_resume_bdev_module = module; 1336 if (module->async_init) { 1337 module->internal.action_in_progress = 1; 1338 } 1339 rc = module->module_init(); 1340 if (rc != 0) { 1341 /* Bump action_in_progress to prevent other modules from completion of modules_init 1342 * Send message to defer application shutdown until resources are cleaned up */ 1343 module->internal.action_in_progress = 1; 1344 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1345 return rc; 1346 } 1347 } 1348 1349 g_resume_bdev_module = NULL; 1350 return 0; 1351 } 1352 1353 void 1354 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1355 { 1356 int cache_size; 1357 int rc = 0; 1358 char mempool_name[32]; 1359 1360 assert(cb_fn != NULL); 1361 1362 g_init_cb_fn = cb_fn; 1363 g_init_cb_arg = cb_arg; 1364 1365 spdk_notify_type_register("bdev_register"); 1366 spdk_notify_type_register("bdev_unregister"); 1367 1368 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1369 1370 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1371 g_bdev_opts.bdev_io_pool_size, 1372 sizeof(struct spdk_bdev_io) + 1373 bdev_module_get_max_ctx_size(), 1374 0, 1375 SPDK_ENV_SOCKET_ID_ANY); 
	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
				  g_bdev_opts.bdev_io_pool_size,
				  sizeof(struct spdk_bdev_io) +
				  bdev_module_get_max_ctx_size(),
				  0,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		bdev_init_complete(-1);
		return;
	}

	/**
	 * Ensure no more than half of the total buffers end up in local caches, by
	 * using spdk_env_get_core_count() to determine how many local caches we need
	 * to account for.
	 */
	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());

	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
				    g_bdev_opts.small_buf_pool_size,
				    SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
				    SPDK_BDEV_POOL_ALIGNMENT,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_small_pool) {
		SPDK_ERRLOG("create rbuf small pool failed\n");
		bdev_init_complete(-1);
		return;
	}

	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());

	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
				    g_bdev_opts.large_buf_pool_size,
				    SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) +
				    SPDK_BDEV_POOL_ALIGNMENT,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_large_pool) {
		SPDK_ERRLOG("create rbuf large pool failed\n");
		bdev_init_complete(-1);
		return;
	}

	g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
					      NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
	if (!g_bdev_mgr.zero_buffer) {
		SPDK_ERRLOG("create bdev zero buffer failed\n");
		bdev_init_complete(-1);
		return;
	}

#ifdef SPDK_CONFIG_VTUNE
	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif

	spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
				bdev_mgmt_channel_destroy,
				sizeof(struct spdk_bdev_mgmt_channel),
				"bdev_mgr");

	rc = bdev_modules_init();
	g_bdev_mgr.module_init_complete = true;
	if (rc != 0) {
		SPDK_ERRLOG("bdev modules init failed\n");
		return;
	}

	bdev_module_action_complete();
}

static void
bdev_mgr_unregister_cb(void *io_device)
{
	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;

	if (g_bdev_mgr.bdev_io_pool) {
		if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
			SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
				    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
				    g_bdev_opts.bdev_io_pool_size);
		}

		spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
	}

	if (g_bdev_mgr.buf_small_pool) {
		if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != g_bdev_opts.small_buf_pool_size) {
			SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
				    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
				    g_bdev_opts.small_buf_pool_size);
			assert(false);
		}

		spdk_mempool_free(g_bdev_mgr.buf_small_pool);
	}

	if (g_bdev_mgr.buf_large_pool) {
		if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != g_bdev_opts.large_buf_pool_size) {
			SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
				    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
				    g_bdev_opts.large_buf_pool_size);
			assert(false);
		}

		spdk_mempool_free(g_bdev_mgr.buf_large_pool);
	}

	spdk_free(g_bdev_mgr.zero_buffer);

	bdev_examine_allowlist_free();

	cb_fn(g_fini_cb_arg);
	g_fini_cb_fn = NULL;
	g_fini_cb_arg = NULL;
	g_bdev_mgr.init_complete = false;
	g_bdev_mgr.module_init_complete = false;
}

static void
bdev_module_fini_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	/* FIXME: Handling initialization failures is broken now,
	 * so we won't even try cleaning up after successfully
	 * initialized modules. If module_init_complete is false,
	 * just call spdk_bdev_mgr_unregister_cb.
	 */
	if (!g_bdev_mgr.module_init_complete) {
		bdev_mgr_unregister_cb(NULL);
		return;
	}

	/* Start iterating from the last touched module */
	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
	} else {
		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
					 internal.tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling module_fini()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_fini_done() and re-enter
			 * this function to continue iterating. */
			g_resume_bdev_module = bdev_module;
		}

		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}

		if (bdev_module->async_fini) {
			return;
		}

		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
					 internal.tailq);
	}

	g_resume_bdev_module = NULL;
	spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
}

void
spdk_bdev_module_fini_done(void)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL);
	} else {
		bdev_module_fini_iter(NULL);
	}
}

/* Deprecated */
void
spdk_bdev_module_finish_done(void)
{
	SPDK_NOTICELOG("spdk_bdev_module_finish_done() is deprecated, please use spdk_bdev_module_fini_done().\n");
	spdk_bdev_module_fini_done();
}

static void
bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
{
	struct spdk_bdev *bdev = cb_arg;

	if (bdeverrno && bdev) {
		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
			     bdev->name);

		/*
		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
		 * bdev; try to continue by manually removing this bdev from the list and continue
		 * with the next bdev in the list.
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
		/*
		 * The bdev module finish needs to be deferred, as we might be in the middle of
		 * some context (like bdev part free) that will use this bdev (or private bdev
		 * driver ctx data) after returning.
		 */
		spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
		return;
	}

	/*
	 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
	 * base bdevs.
	 *
	 * Also, walk the list in the reverse order.
	 */
1595 */ 1596 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1597 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1598 if (bdev->internal.claim_module != NULL) { 1599 SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n", 1600 bdev->name, bdev->internal.claim_module->name); 1601 continue; 1602 } 1603 1604 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 1605 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1606 return; 1607 } 1608 1609 /* 1610 * If any bdev fails to unclaim underlying bdev properly, we may face the 1611 * case of bdev list consisting of claimed bdevs only (if claims are managed 1612 * correctly, this would mean there's a loop in the claims graph which is 1613 * clearly impossible). Warn and unregister last bdev on the list then. 1614 */ 1615 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1616 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1617 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1618 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1619 return; 1620 } 1621 } 1622 1623 static void 1624 bdev_module_fini_start_iter(void *arg) 1625 { 1626 struct spdk_bdev_module *bdev_module; 1627 1628 if (!g_resume_bdev_module) { 1629 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1630 } else { 1631 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 1632 } 1633 1634 while (bdev_module) { 1635 if (bdev_module->async_fini_start) { 1636 /* Save our place so we can resume later. We must 1637 * save the variable here, before calling fini_start() 1638 * below, because in some cases the module may immediately 1639 * call spdk_bdev_module_fini_start_done() and re-enter 1640 * this function to continue iterating. */ 1641 g_resume_bdev_module = bdev_module; 1642 } 1643 1644 if (bdev_module->fini_start) { 1645 bdev_module->fini_start(); 1646 } 1647 1648 if (bdev_module->async_fini_start) { 1649 return; 1650 } 1651 1652 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1653 } 1654 1655 g_resume_bdev_module = NULL; 1656 1657 bdev_finish_unregister_bdevs_iter(NULL, 0); 1658 } 1659 1660 void 1661 spdk_bdev_module_fini_start_done(void) 1662 { 1663 if (spdk_get_thread() != g_fini_thread) { 1664 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1665 } else { 1666 bdev_module_fini_start_iter(NULL); 1667 } 1668 } 1669 1670 void 1671 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1672 { 1673 assert(cb_fn != NULL); 1674 1675 g_fini_thread = spdk_get_thread(); 1676 1677 g_fini_cb_fn = cb_fn; 1678 g_fini_cb_arg = cb_arg; 1679 1680 bdev_module_fini_start_iter(NULL); 1681 } 1682 1683 struct spdk_bdev_io * 1684 bdev_channel_get_io(struct spdk_bdev_channel *channel) 1685 { 1686 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 1687 struct spdk_bdev_io *bdev_io; 1688 1689 if (ch->per_thread_cache_count > 0) { 1690 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1691 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1692 ch->per_thread_cache_count--; 1693 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 1694 /* 1695 * Don't try to look for bdev_ios in the global pool if there are 1696 * waiters on bdev_ios - we don't want this caller to jump the line. 
1697 */ 1698 bdev_io = NULL; 1699 } else { 1700 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1701 } 1702 1703 return bdev_io; 1704 } 1705 1706 void 1707 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 1708 { 1709 struct spdk_bdev_mgmt_channel *ch; 1710 1711 assert(bdev_io != NULL); 1712 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 1713 1714 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1715 1716 if (bdev_io->internal.buf != NULL) { 1717 bdev_io_put_buf(bdev_io); 1718 } 1719 1720 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 1721 ch->per_thread_cache_count++; 1722 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1723 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 1724 struct spdk_bdev_io_wait_entry *entry; 1725 1726 entry = TAILQ_FIRST(&ch->io_wait_queue); 1727 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 1728 entry->cb_fn(entry->cb_arg); 1729 } 1730 } else { 1731 /* We should never have a full cache with entries on the io wait queue. */ 1732 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 1733 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1734 } 1735 } 1736 1737 static bool 1738 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 1739 { 1740 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 1741 1742 switch (limit) { 1743 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1744 return true; 1745 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1746 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1747 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1748 return false; 1749 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 1750 default: 1751 return false; 1752 } 1753 } 1754 1755 static bool 1756 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 1757 { 1758 switch (bdev_io->type) { 1759 case SPDK_BDEV_IO_TYPE_NVME_IO: 1760 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1761 case SPDK_BDEV_IO_TYPE_READ: 1762 case SPDK_BDEV_IO_TYPE_WRITE: 1763 return true; 1764 case SPDK_BDEV_IO_TYPE_ZCOPY: 1765 if (bdev_io->u.bdev.zcopy.start) { 1766 return true; 1767 } else { 1768 return false; 1769 } 1770 default: 1771 return false; 1772 } 1773 } 1774 1775 static bool 1776 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 1777 { 1778 switch (bdev_io->type) { 1779 case SPDK_BDEV_IO_TYPE_NVME_IO: 1780 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1781 /* Bit 1 (0x2) set for read operation */ 1782 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 1783 return true; 1784 } else { 1785 return false; 1786 } 1787 case SPDK_BDEV_IO_TYPE_READ: 1788 return true; 1789 case SPDK_BDEV_IO_TYPE_ZCOPY: 1790 /* Populate to read from disk */ 1791 if (bdev_io->u.bdev.zcopy.populate) { 1792 return true; 1793 } else { 1794 return false; 1795 } 1796 default: 1797 return false; 1798 } 1799 } 1800 1801 static uint64_t 1802 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 1803 { 1804 struct spdk_bdev *bdev = bdev_io->bdev; 1805 1806 switch (bdev_io->type) { 1807 case SPDK_BDEV_IO_TYPE_NVME_IO: 1808 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1809 return bdev_io->u.nvme_passthru.nbytes; 1810 case SPDK_BDEV_IO_TYPE_READ: 1811 case SPDK_BDEV_IO_TYPE_WRITE: 1812 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 1813 case SPDK_BDEV_IO_TYPE_ZCOPY: 1814 /* Track the data in the start phase only */ 1815 if (bdev_io->u.bdev.zcopy.start) { 1816 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 1817 } else { 1818 return 0; 1819 } 1820 default: 1821 return 0; 1822 } 1823 } 1824 1825 static bool 1826 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 
static bool
bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) {
		return true;
	} else {
		return false;
	}
}

static bool
bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (bdev_is_read_io(io) == false) {
		return false;
	}

	return bdev_qos_rw_queue_io(limit, io);
}

static bool
bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (bdev_is_read_io(io) == true) {
		return false;
	}

	return bdev_qos_rw_queue_io(limit, io);
}

static void
bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	limit->remaining_this_timeslice--;
}

static void
bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io);
}

static void
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (bdev_is_read_io(io) == false) {
		return;
	}

	return bdev_qos_rw_bps_update_quota(limit, io);
}

static void
bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (bdev_is_read_io(io) == true) {
		return;
	}

	return bdev_qos_rw_bps_update_quota(limit, io);
}

static void
bdev_qos_set_ops(struct spdk_bdev_qos *qos)
{
	int i;

	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
			qos->rate_limits[i].queue_io = NULL;
			qos->rate_limits[i].update_quota = NULL;
			continue;
		}

		switch (i) {
		case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
			qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
			qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota;
			break;
		case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
			qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
			qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota;
			break;
		case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
			qos->rate_limits[i].queue_io = bdev_qos_r_queue_io;
			qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota;
			break;
		case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
			qos->rate_limits[i].queue_io = bdev_qos_w_queue_io;
			qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota;
			break;
		default:
			break;
		}
	}
}

static void
_bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch,
			    struct spdk_bdev_io *bdev_io,
			    enum spdk_bdev_io_status status)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	bdev_io->internal.in_submit_request = true;
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
	spdk_bdev_io_complete(bdev_io, status);
	bdev_io->internal.in_submit_request = false;
}

static inline void
bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_io_channel *ch = bdev_ch->channel;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
		struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch;
shared_resource->mgmt_ch; 1945 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 1946 1947 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 1948 bdev_abort_buf_io(&mgmt_channel->need_buf_small, bio_to_abort) || 1949 bdev_abort_buf_io(&mgmt_channel->need_buf_large, bio_to_abort)) { 1950 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 1951 SPDK_BDEV_IO_STATUS_SUCCESS); 1952 return; 1953 } 1954 } 1955 1956 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 1957 bdev_ch->io_outstanding++; 1958 shared_resource->io_outstanding++; 1959 bdev_io->internal.in_submit_request = true; 1960 bdev->fn_table->submit_request(ch, bdev_io); 1961 bdev_io->internal.in_submit_request = false; 1962 } else { 1963 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 1964 } 1965 } 1966 1967 static int 1968 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 1969 { 1970 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 1971 int i, submitted_ios = 0; 1972 1973 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 1974 if (bdev_qos_io_to_limit(bdev_io) == true) { 1975 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1976 if (!qos->rate_limits[i].queue_io) { 1977 continue; 1978 } 1979 1980 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 1981 bdev_io) == true) { 1982 return submitted_ios; 1983 } 1984 } 1985 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1986 if (!qos->rate_limits[i].update_quota) { 1987 continue; 1988 } 1989 1990 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 1991 } 1992 } 1993 1994 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 1995 bdev_io_do_submit(ch, bdev_io); 1996 submitted_ios++; 1997 } 1998 1999 return submitted_ios; 2000 } 2001 2002 static void 2003 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2004 { 2005 int rc; 2006 2007 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2008 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2009 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2010 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2011 &bdev_io->internal.waitq_entry); 2012 if (rc != 0) { 2013 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2014 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2015 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2016 } 2017 } 2018 2019 static bool 2020 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2021 { 2022 uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary; 2023 uint32_t max_size = bdev_io->bdev->max_segment_size; 2024 int max_segs = bdev_io->bdev->max_num_segments; 2025 2026 io_boundary = bdev_io->bdev->split_on_optimal_io_boundary ? io_boundary : 0; 2027 2028 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2029 return false; 2030 } 2031 2032 if (io_boundary) { 2033 uint64_t start_stripe, end_stripe; 2034 2035 start_stripe = bdev_io->u.bdev.offset_blocks; 2036 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2037 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
*/ 2038 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2039 start_stripe >>= spdk_u32log2(io_boundary); 2040 end_stripe >>= spdk_u32log2(io_boundary); 2041 } else { 2042 start_stripe /= io_boundary; 2043 end_stripe /= io_boundary; 2044 } 2045 2046 if (start_stripe != end_stripe) { 2047 return true; 2048 } 2049 } 2050 2051 if (max_segs) { 2052 if (bdev_io->u.bdev.iovcnt > max_segs) { 2053 return true; 2054 } 2055 } 2056 2057 if (max_size) { 2058 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2059 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2060 return true; 2061 } 2062 } 2063 } 2064 2065 return false; 2066 } 2067 2068 static bool 2069 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2070 { 2071 uint32_t num_unmap_segments; 2072 2073 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2074 return false; 2075 } 2076 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2077 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2078 return true; 2079 } 2080 2081 return false; 2082 } 2083 2084 static bool 2085 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2086 { 2087 if (!bdev_io->bdev->max_write_zeroes) { 2088 return false; 2089 } 2090 2091 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2092 return true; 2093 } 2094 2095 return false; 2096 } 2097 2098 static bool 2099 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2100 { 2101 switch (bdev_io->type) { 2102 case SPDK_BDEV_IO_TYPE_READ: 2103 case SPDK_BDEV_IO_TYPE_WRITE: 2104 return bdev_rw_should_split(bdev_io); 2105 case SPDK_BDEV_IO_TYPE_UNMAP: 2106 return bdev_unmap_should_split(bdev_io); 2107 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2108 return bdev_write_zeroes_should_split(bdev_io); 2109 default: 2110 return false; 2111 } 2112 } 2113 2114 static uint32_t 2115 _to_next_boundary(uint64_t offset, uint32_t boundary) 2116 { 2117 return (boundary - (offset % boundary)); 2118 } 2119 2120 static void 2121 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2122 2123 static void 2124 _bdev_rw_split(void *_bdev_io); 2125 2126 static void 2127 bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2128 2129 static void 2130 _bdev_unmap_split(void *_bdev_io) 2131 { 2132 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2133 } 2134 2135 static void 2136 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2137 2138 static void 2139 _bdev_write_zeroes_split(void *_bdev_io) 2140 { 2141 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2142 } 2143 2144 static int 2145 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2146 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2147 { 2148 int rc; 2149 uint64_t current_offset, current_remaining; 2150 spdk_bdev_io_wait_cb io_wait_fn; 2151 2152 current_offset = *offset; 2153 current_remaining = *remaining; 2154 2155 bdev_io->u.bdev.split_outstanding++; 2156 2157 io_wait_fn = _bdev_rw_split; 2158 switch (bdev_io->type) { 2159 case SPDK_BDEV_IO_TYPE_READ: 2160 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2161 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2162 iov, iovcnt, md_buf, current_offset, 2163 num_blocks, 2164 bdev_io_split_done, bdev_io, 2165 bdev_io->internal.ext_opts); 2166 break; 2167 case SPDK_BDEV_IO_TYPE_WRITE: 2168 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2169 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2170 iov, iovcnt, md_buf, current_offset, 2171 
num_blocks, 2172 bdev_io_split_done, bdev_io, 2173 bdev_io->internal.ext_opts); 2174 break; 2175 case SPDK_BDEV_IO_TYPE_UNMAP: 2176 io_wait_fn = _bdev_unmap_split; 2177 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2178 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2179 current_offset, num_blocks, 2180 bdev_io_split_done, bdev_io); 2181 break; 2182 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2183 io_wait_fn = _bdev_write_zeroes_split; 2184 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2185 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2186 current_offset, num_blocks, 2187 bdev_io_split_done, bdev_io); 2188 break; 2189 default: 2190 assert(false); 2191 rc = -EINVAL; 2192 break; 2193 } 2194 2195 if (rc == 0) { 2196 current_offset += num_blocks; 2197 current_remaining -= num_blocks; 2198 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2199 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2200 *offset = current_offset; 2201 *remaining = current_remaining; 2202 } else { 2203 bdev_io->u.bdev.split_outstanding--; 2204 if (rc == -ENOMEM) { 2205 if (bdev_io->u.bdev.split_outstanding == 0) { 2206 /* No I/O is outstanding. Hence we should wait here. */ 2207 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2208 } 2209 } else { 2210 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2211 if (bdev_io->u.bdev.split_outstanding == 0) { 2212 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2213 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2214 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2215 } 2216 } 2217 } 2218 2219 return rc; 2220 } 2221 2222 static void 2223 _bdev_rw_split(void *_bdev_io) 2224 { 2225 struct iovec *parent_iov, *iov; 2226 struct spdk_bdev_io *bdev_io = _bdev_io; 2227 struct spdk_bdev *bdev = bdev_io->bdev; 2228 uint64_t parent_offset, current_offset, remaining; 2229 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2230 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2231 uint32_t iovcnt, iov_len, child_iovsize; 2232 uint32_t blocklen = bdev->blocklen; 2233 uint32_t io_boundary = bdev->optimal_io_boundary; 2234 uint32_t max_segment_size = bdev->max_segment_size; 2235 uint32_t max_child_iovcnt = bdev->max_num_segments; 2236 void *md_buf = NULL; 2237 int rc; 2238 2239 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2240 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, BDEV_IO_NUM_CHILD_IOV) : 2241 BDEV_IO_NUM_CHILD_IOV; 2242 io_boundary = bdev->split_on_optimal_io_boundary ? 
io_boundary : UINT32_MAX; 2243 2244 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2245 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2246 parent_offset = bdev_io->u.bdev.offset_blocks; 2247 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2248 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2249 2250 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2251 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2252 if (parent_iov_offset < parent_iov->iov_len) { 2253 break; 2254 } 2255 parent_iov_offset -= parent_iov->iov_len; 2256 } 2257 2258 child_iovcnt = 0; 2259 while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 2260 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2261 to_next_boundary = spdk_min(remaining, to_next_boundary); 2262 to_next_boundary_bytes = to_next_boundary * blocklen; 2263 2264 iov = &bdev_io->child_iov[child_iovcnt]; 2265 iovcnt = 0; 2266 2267 if (bdev_io->u.bdev.md_buf) { 2268 md_buf = (char *)bdev_io->u.bdev.md_buf + 2269 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2270 } 2271 2272 child_iovsize = spdk_min(BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2273 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2274 iovcnt < child_iovsize) { 2275 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2276 iov_len = parent_iov->iov_len - parent_iov_offset; 2277 2278 iov_len = spdk_min(iov_len, max_segment_size); 2279 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2280 to_next_boundary_bytes -= iov_len; 2281 2282 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2283 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2284 2285 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2286 parent_iov_offset += iov_len; 2287 } else { 2288 parent_iovpos++; 2289 parent_iov_offset = 0; 2290 } 2291 child_iovcnt++; 2292 iovcnt++; 2293 } 2294 2295 if (to_next_boundary_bytes > 0) { 2296 /* We had to stop this child I/O early because we ran out of 2297 * child_iov space or were limited by max_num_segments. 2298 * Ensure the iovs to be aligned with block size and 2299 * then adjust to_next_boundary before starting the 2300 * child I/O. 2301 */ 2302 assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV || 2303 iovcnt == child_iovsize); 2304 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2305 if (to_last_block_bytes != 0) { 2306 uint32_t child_iovpos = child_iovcnt - 1; 2307 /* don't decrease child_iovcnt when it equals to BDEV_IO_NUM_CHILD_IOV 2308 * so the loop will naturally end 2309 */ 2310 2311 to_last_block_bytes = blocklen - to_last_block_bytes; 2312 to_next_boundary_bytes += to_last_block_bytes; 2313 while (to_last_block_bytes > 0 && iovcnt > 0) { 2314 iov_len = spdk_min(to_last_block_bytes, 2315 bdev_io->child_iov[child_iovpos].iov_len); 2316 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2317 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2318 child_iovpos--; 2319 if (--iovcnt == 0) { 2320 /* If the child IO is less than a block size just return. 2321 * If the first child IO of any split round is less than 2322 * a block size, an error exit. 
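 * (If no other child I/O of this round is outstanding, no forward progress can be
 * made, so the parent I/O is completed with SPDK_BDEV_IO_STATUS_FAILED below.)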
2323 */ 2324 if (bdev_io->u.bdev.split_outstanding == 0) { 2325 SPDK_ERRLOG("The first child io was less than a block size\n"); 2326 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2327 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2328 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2329 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2330 } 2331 2332 return; 2333 } 2334 } 2335 2336 to_last_block_bytes -= iov_len; 2337 2338 if (parent_iov_offset == 0) { 2339 parent_iovpos--; 2340 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2341 } 2342 parent_iov_offset -= iov_len; 2343 } 2344 2345 assert(to_last_block_bytes == 0); 2346 } 2347 to_next_boundary -= to_next_boundary_bytes / blocklen; 2348 } 2349 2350 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 2351 &current_offset, &remaining); 2352 if (spdk_unlikely(rc)) { 2353 return; 2354 } 2355 } 2356 } 2357 2358 static void 2359 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 2360 { 2361 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 2362 uint32_t num_children_reqs = 0; 2363 int rc; 2364 2365 offset = bdev_io->u.bdev.split_current_offset_blocks; 2366 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2367 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 2368 2369 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2370 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 2371 2372 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 2373 &offset, &remaining); 2374 if (spdk_likely(rc == 0)) { 2375 num_children_reqs++; 2376 } else { 2377 return; 2378 } 2379 } 2380 } 2381 2382 static void 2383 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 2384 { 2385 uint64_t offset, write_zeroes_blocks, remaining; 2386 uint32_t num_children_reqs = 0; 2387 int rc; 2388 2389 offset = bdev_io->u.bdev.split_current_offset_blocks; 2390 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2391 2392 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2393 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 2394 2395 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 2396 &offset, &remaining); 2397 if (spdk_likely(rc == 0)) { 2398 num_children_reqs++; 2399 } else { 2400 return; 2401 } 2402 } 2403 } 2404 2405 static void 2406 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2407 { 2408 struct spdk_bdev_io *parent_io = cb_arg; 2409 2410 spdk_bdev_free_io(bdev_io); 2411 2412 if (!success) { 2413 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2414 /* If any child I/O failed, stop further splitting process. */ 2415 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2416 parent_io->u.bdev.split_remaining_num_blocks = 0; 2417 } 2418 parent_io->u.bdev.split_outstanding--; 2419 if (parent_io->u.bdev.split_outstanding != 0) { 2420 return; 2421 } 2422 2423 /* 2424 * Parent I/O finishes when all blocks are consumed.
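 * If any child I/O failed, split_remaining_num_blocks was forced to zero above, so the
 * parent is completed here as well, carrying the SPDK_BDEV_IO_STATUS_FAILED status.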
2425 */ 2426 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2427 assert(parent_io->internal.cb != bdev_io_split_done); 2428 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, parent_io->internal.caller_ctx); 2429 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2430 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2431 parent_io->internal.caller_ctx); 2432 return; 2433 } 2434 2435 /* 2436 * Continue with the splitting process. This function will complete the parent I/O if the 2437 * splitting is done. 2438 */ 2439 switch (parent_io->type) { 2440 case SPDK_BDEV_IO_TYPE_READ: 2441 case SPDK_BDEV_IO_TYPE_WRITE: 2442 _bdev_rw_split(parent_io); 2443 break; 2444 case SPDK_BDEV_IO_TYPE_UNMAP: 2445 bdev_unmap_split(parent_io); 2446 break; 2447 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2448 bdev_write_zeroes_split(parent_io); 2449 break; 2450 default: 2451 assert(false); 2452 break; 2453 } 2454 } 2455 2456 static void 2457 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success); 2458 2459 static void 2460 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2461 { 2462 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2463 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2464 bdev_io->u.bdev.split_outstanding = 0; 2465 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2466 2467 switch (bdev_io->type) { 2468 case SPDK_BDEV_IO_TYPE_READ: 2469 case SPDK_BDEV_IO_TYPE_WRITE: 2470 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2471 _bdev_rw_split(bdev_io); 2472 } else { 2473 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2474 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2475 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2476 } 2477 break; 2478 case SPDK_BDEV_IO_TYPE_UNMAP: 2479 bdev_unmap_split(bdev_io); 2480 break; 2481 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2482 bdev_write_zeroes_split(bdev_io); 2483 break; 2484 default: 2485 assert(false); 2486 break; 2487 } 2488 } 2489 2490 static void 2491 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2492 { 2493 if (!success) { 2494 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2495 return; 2496 } 2497 2498 _bdev_rw_split(bdev_io); 2499 } 2500 2501 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 2502 * be inlined, at least on some compilers.
2503 */ 2504 static inline void 2505 _bdev_io_submit(void *ctx) 2506 { 2507 struct spdk_bdev_io *bdev_io = ctx; 2508 struct spdk_bdev *bdev = bdev_io->bdev; 2509 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2510 uint64_t tsc; 2511 2512 tsc = spdk_get_ticks(); 2513 bdev_io->internal.submit_tsc = tsc; 2514 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, bdev_io->type, 2515 bdev_io->internal.caller_ctx); 2516 2517 if (spdk_likely(bdev_ch->flags == 0)) { 2518 bdev_io_do_submit(bdev_ch, bdev_io); 2519 return; 2520 } 2521 2522 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2523 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2524 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2525 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2526 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2527 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2528 } else { 2529 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2530 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2531 } 2532 } else { 2533 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2534 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2535 } 2536 } 2537 2538 bool 2539 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2540 2541 bool 2542 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2543 { 2544 if (range1->length == 0 || range2->length == 0) { 2545 return false; 2546 } 2547 2548 if (range1->offset + range1->length <= range2->offset) { 2549 return false; 2550 } 2551 2552 if (range2->offset + range2->length <= range1->offset) { 2553 return false; 2554 } 2555 2556 return true; 2557 } 2558 2559 static bool 2560 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2561 { 2562 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2563 struct lba_range r; 2564 2565 switch (bdev_io->type) { 2566 case SPDK_BDEV_IO_TYPE_NVME_IO: 2567 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2568 /* Don't try to decode the NVMe command - just assume worst-case and that 2569 * it overlaps a locked range. 2570 */ 2571 return true; 2572 case SPDK_BDEV_IO_TYPE_WRITE: 2573 case SPDK_BDEV_IO_TYPE_UNMAP: 2574 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2575 case SPDK_BDEV_IO_TYPE_ZCOPY: 2576 r.offset = bdev_io->u.bdev.offset_blocks; 2577 r.length = bdev_io->u.bdev.num_blocks; 2578 if (!bdev_lba_range_overlapped(range, &r)) { 2579 /* This I/O doesn't overlap the specified LBA range. */ 2580 return false; 2581 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2582 /* This I/O overlaps, but the I/O is on the same channel that locked this 2583 * range, and the caller_ctx is the same as the locked_ctx. This means 2584 * that this I/O is associated with the lock, and is allowed to execute. 
2585 */ 2586 return false; 2587 } else { 2588 return true; 2589 } 2590 default: 2591 return false; 2592 } 2593 } 2594 2595 void 2596 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2597 { 2598 struct spdk_bdev *bdev = bdev_io->bdev; 2599 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2600 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2601 2602 assert(thread != NULL); 2603 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2604 2605 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2606 struct lba_range *range; 2607 2608 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2609 if (bdev_io_range_is_locked(bdev_io, range)) { 2610 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2611 return; 2612 } 2613 } 2614 } 2615 2616 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2617 2618 if (bdev_io_should_split(bdev_io)) { 2619 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2620 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2621 (uintptr_t)bdev_io, bdev_io->type, bdev_io->internal.caller_ctx); 2622 bdev_io_split(NULL, bdev_io); 2623 return; 2624 } 2625 2626 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2627 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2628 _bdev_io_submit(bdev_io); 2629 } else { 2630 bdev_io->internal.io_submit_ch = ch; 2631 bdev_io->internal.ch = bdev->internal.qos->ch; 2632 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2633 } 2634 } else { 2635 _bdev_io_submit(bdev_io); 2636 } 2637 } 2638 2639 static void 2640 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 2641 { 2642 struct spdk_bdev *bdev = bdev_io->bdev; 2643 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2644 struct spdk_io_channel *ch = bdev_ch->channel; 2645 2646 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2647 2648 bdev_io->internal.in_submit_request = true; 2649 bdev->fn_table->submit_request(ch, bdev_io); 2650 bdev_io->internal.in_submit_request = false; 2651 } 2652 2653 void 2654 bdev_io_init(struct spdk_bdev_io *bdev_io, 2655 struct spdk_bdev *bdev, void *cb_arg, 2656 spdk_bdev_io_completion_cb cb) 2657 { 2658 bdev_io->bdev = bdev; 2659 bdev_io->internal.caller_ctx = cb_arg; 2660 bdev_io->internal.cb = cb; 2661 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 2662 bdev_io->internal.in_submit_request = false; 2663 bdev_io->internal.buf = NULL; 2664 bdev_io->internal.io_submit_ch = NULL; 2665 bdev_io->internal.orig_iovs = NULL; 2666 bdev_io->internal.orig_iovcnt = 0; 2667 bdev_io->internal.orig_md_buf = NULL; 2668 bdev_io->internal.error.nvme.cdw0 = 0; 2669 bdev_io->num_retries = 0; 2670 bdev_io->internal.get_buf_cb = NULL; 2671 bdev_io->internal.get_aux_buf_cb = NULL; 2672 bdev_io->internal.ext_opts = NULL; 2673 } 2674 2675 static bool 2676 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2677 { 2678 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 2679 } 2680 2681 bool 2682 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2683 { 2684 bool supported; 2685 2686 supported = bdev_io_type_supported(bdev, io_type); 2687 2688 if (!supported) { 2689 switch (io_type) { 2690 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2691 /* The bdev layer will emulate write zeroes as long as write is supported. 
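 * (When emulated, the request is carried out as ordinary writes of an internal
 * zero buffer, at most ZERO_BUFFER_SIZE bytes per child write.)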
*/ 2692 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 2693 break; 2694 default: 2695 break; 2696 } 2697 } 2698 2699 return supported; 2700 } 2701 2702 int 2703 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2704 { 2705 if (bdev->fn_table->dump_info_json) { 2706 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 2707 } 2708 2709 return 0; 2710 } 2711 2712 static void 2713 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 2714 { 2715 uint32_t max_per_timeslice = 0; 2716 int i; 2717 2718 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2719 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2720 qos->rate_limits[i].max_per_timeslice = 0; 2721 continue; 2722 } 2723 2724 max_per_timeslice = qos->rate_limits[i].limit * 2725 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 2726 2727 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 2728 qos->rate_limits[i].min_per_timeslice); 2729 2730 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 2731 } 2732 2733 bdev_qos_set_ops(qos); 2734 } 2735 2736 static int 2737 bdev_channel_poll_qos(void *arg) 2738 { 2739 struct spdk_bdev_qos *qos = arg; 2740 uint64_t now = spdk_get_ticks(); 2741 int i; 2742 2743 if (now < (qos->last_timeslice + qos->timeslice_size)) { 2744 /* We received our callback earlier than expected - return 2745 * immediately and wait to do accounting until at least one 2746 * timeslice has actually expired. This should never happen 2747 * with a well-behaved timer implementation. 2748 */ 2749 return SPDK_POLLER_IDLE; 2750 } 2751 2752 /* Reset for next round of rate limiting */ 2753 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2754 /* We may have allowed the IOs or bytes to slightly overrun in the last 2755 * timeslice. remaining_this_timeslice is signed, so if it's negative 2756 * here, we'll account for the overrun so that the next timeslice will 2757 * be appropriately reduced. 2758 */ 2759 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 2760 qos->rate_limits[i].remaining_this_timeslice = 0; 2761 } 2762 } 2763 2764 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 2765 qos->last_timeslice += qos->timeslice_size; 2766 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2767 qos->rate_limits[i].remaining_this_timeslice += 2768 qos->rate_limits[i].max_per_timeslice; 2769 } 2770 } 2771 2772 return bdev_qos_io_submit(qos->ch, qos); 2773 } 2774 2775 static void 2776 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 2777 { 2778 struct spdk_bdev_shared_resource *shared_resource; 2779 struct lba_range *range; 2780 2781 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 2782 range = TAILQ_FIRST(&ch->locked_ranges); 2783 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 2784 free(range); 2785 } 2786 2787 spdk_put_io_channel(ch->channel); 2788 2789 shared_resource = ch->shared_resource; 2790 2791 assert(TAILQ_EMPTY(&ch->io_locked)); 2792 assert(TAILQ_EMPTY(&ch->io_submitted)); 2793 assert(ch->io_outstanding == 0); 2794 assert(shared_resource->ref > 0); 2795 shared_resource->ref--; 2796 if (shared_resource->ref == 0) { 2797 assert(shared_resource->io_outstanding == 0); 2798 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 2799 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 2800 free(shared_resource); 2801 } 2802 } 2803 2804 /* Caller must hold bdev->internal.mutex. 
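 *
 * A rough sketch of the per-timeslice quota math done by
 * bdev_qos_update_max_quota_per_timeslice() above, assuming a hypothetical
 * rw_ios_per_sec limit of 10000:
 *
 *   max_per_timeslice = limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC
 *                     = 10000 * 1000 / 1000000
 *                     = 10 I/Os per 1 ms timeslice,
 *
 * which is then clamped up to min_per_timeslice (SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE
 * for IOPS limits, SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE for bandwidth limits).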
*/ 2805 static void 2806 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 2807 { 2808 struct spdk_bdev_qos *qos = bdev->internal.qos; 2809 int i; 2810 2811 /* Rate limiting on this bdev enabled */ 2812 if (qos) { 2813 if (qos->ch == NULL) { 2814 struct spdk_io_channel *io_ch; 2815 2816 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 2817 bdev->name, spdk_get_thread()); 2818 2819 /* No qos channel has been selected, so set one up */ 2820 2821 /* Take another reference to ch */ 2822 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 2823 assert(io_ch != NULL); 2824 qos->ch = ch; 2825 2826 qos->thread = spdk_io_channel_get_thread(io_ch); 2827 2828 TAILQ_INIT(&qos->queued); 2829 2830 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2831 if (bdev_qos_is_iops_rate_limit(i) == true) { 2832 qos->rate_limits[i].min_per_timeslice = 2833 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 2834 } else { 2835 qos->rate_limits[i].min_per_timeslice = 2836 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 2837 } 2838 2839 if (qos->rate_limits[i].limit == 0) { 2840 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 2841 } 2842 } 2843 bdev_qos_update_max_quota_per_timeslice(qos); 2844 qos->timeslice_size = 2845 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 2846 qos->last_timeslice = spdk_get_ticks(); 2847 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 2848 qos, 2849 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 2850 } 2851 2852 ch->flags |= BDEV_CH_QOS_ENABLED; 2853 } 2854 } 2855 2856 struct poll_timeout_ctx { 2857 struct spdk_bdev_desc *desc; 2858 uint64_t timeout_in_sec; 2859 spdk_bdev_io_timeout_cb cb_fn; 2860 void *cb_arg; 2861 }; 2862 2863 static void 2864 bdev_desc_free(struct spdk_bdev_desc *desc) 2865 { 2866 pthread_mutex_destroy(&desc->mutex); 2867 free(desc->media_events_buffer); 2868 free(desc); 2869 } 2870 2871 static void 2872 bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status) 2873 { 2874 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 2875 struct spdk_bdev_desc *desc = ctx->desc; 2876 2877 free(ctx); 2878 2879 pthread_mutex_lock(&desc->mutex); 2880 desc->refs--; 2881 if (desc->closed == true && desc->refs == 0) { 2882 pthread_mutex_unlock(&desc->mutex); 2883 bdev_desc_free(desc); 2884 return; 2885 } 2886 pthread_mutex_unlock(&desc->mutex); 2887 } 2888 2889 static void 2890 bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i) 2891 { 2892 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 2893 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 2894 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 2895 struct spdk_bdev_desc *desc = ctx->desc; 2896 struct spdk_bdev_io *bdev_io; 2897 uint64_t now; 2898 2899 pthread_mutex_lock(&desc->mutex); 2900 if (desc->closed == true) { 2901 pthread_mutex_unlock(&desc->mutex); 2902 spdk_for_each_channel_continue(i, -1); 2903 return; 2904 } 2905 pthread_mutex_unlock(&desc->mutex); 2906 2907 now = spdk_get_ticks(); 2908 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 2909 /* Exclude any I/O that are generated via splitting. */ 2910 if (bdev_io->internal.cb == bdev_io_split_done) { 2911 continue; 2912 } 2913 2914 /* Once we find an I/O that has not timed out, we can immediately 2915 * exit the loop. 
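 * (bdev_io's are appended to ch->io_submitted in submission order, so every I/O
 * after this one was submitted no earlier and cannot have exceeded the timeout
 * either.)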
2916 */ 2917 if (now < (bdev_io->internal.submit_tsc + 2918 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 2919 goto end; 2920 } 2921 2922 if (bdev_io->internal.desc == desc) { 2923 ctx->cb_fn(ctx->cb_arg, bdev_io); 2924 } 2925 } 2926 2927 end: 2928 spdk_for_each_channel_continue(i, 0); 2929 } 2930 2931 static int 2932 bdev_poll_timeout_io(void *arg) 2933 { 2934 struct spdk_bdev_desc *desc = arg; 2935 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 2936 struct poll_timeout_ctx *ctx; 2937 2938 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 2939 if (!ctx) { 2940 SPDK_ERRLOG("failed to allocate memory\n"); 2941 return SPDK_POLLER_BUSY; 2942 } 2943 ctx->desc = desc; 2944 ctx->cb_arg = desc->cb_arg; 2945 ctx->cb_fn = desc->cb_fn; 2946 ctx->timeout_in_sec = desc->timeout_in_sec; 2947 2948 /* Take a ref on the descriptor in case it gets closed while we are checking 2949 * all of the channels. 2950 */ 2951 pthread_mutex_lock(&desc->mutex); 2952 desc->refs++; 2953 pthread_mutex_unlock(&desc->mutex); 2954 2955 spdk_for_each_channel(__bdev_to_io_dev(bdev), 2956 bdev_channel_poll_timeout_io, 2957 ctx, 2958 bdev_channel_poll_timeout_io_done); 2959 2960 return SPDK_POLLER_BUSY; 2961 } 2962 2963 int 2964 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 2965 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 2966 { 2967 assert(desc->thread == spdk_get_thread()); 2968 2969 spdk_poller_unregister(&desc->io_timeout_poller); 2970 2971 if (timeout_in_sec) { 2972 assert(cb_fn != NULL); 2973 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 2974 desc, 2975 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 2976 1000); 2977 if (desc->io_timeout_poller == NULL) { 2978 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 2979 return -1; 2980 } 2981 } 2982 2983 desc->cb_fn = cb_fn; 2984 desc->cb_arg = cb_arg; 2985 desc->timeout_in_sec = timeout_in_sec; 2986 2987 return 0; 2988 } 2989 2990 static int 2991 bdev_channel_create(void *io_device, void *ctx_buf) 2992 { 2993 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 2994 struct spdk_bdev_channel *ch = ctx_buf; 2995 struct spdk_io_channel *mgmt_io_ch; 2996 struct spdk_bdev_mgmt_channel *mgmt_ch; 2997 struct spdk_bdev_shared_resource *shared_resource; 2998 struct lba_range *range; 2999 3000 ch->bdev = bdev; 3001 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3002 if (!ch->channel) { 3003 return -1; 3004 } 3005 3006 assert(ch->histogram == NULL); 3007 if (bdev->internal.histogram_enabled) { 3008 ch->histogram = spdk_histogram_data_alloc(); 3009 if (ch->histogram == NULL) { 3010 SPDK_ERRLOG("Could not allocate histogram\n"); 3011 } 3012 } 3013 3014 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3015 if (!mgmt_io_ch) { 3016 spdk_put_io_channel(ch->channel); 3017 return -1; 3018 } 3019 3020 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 3021 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3022 if (shared_resource->shared_ch == ch->channel) { 3023 spdk_put_io_channel(mgmt_io_ch); 3024 shared_resource->ref++; 3025 break; 3026 } 3027 } 3028 3029 if (shared_resource == NULL) { 3030 shared_resource = calloc(1, sizeof(*shared_resource)); 3031 if (shared_resource == NULL) { 3032 spdk_put_io_channel(ch->channel); 3033 spdk_put_io_channel(mgmt_io_ch); 3034 return -1; 3035 } 3036 3037 shared_resource->mgmt_ch = mgmt_ch; 3038 shared_resource->io_outstanding = 0; 3039 TAILQ_INIT(&shared_resource->nomem_io); 3040 shared_resource->nomem_threshold = 0; 3041 shared_resource->shared_ch = 
ch->channel; 3042 shared_resource->ref = 1; 3043 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3044 } 3045 3046 memset(&ch->stat, 0, sizeof(ch->stat)); 3047 ch->stat.ticks_rate = spdk_get_ticks_hz(); 3048 ch->io_outstanding = 0; 3049 TAILQ_INIT(&ch->queued_resets); 3050 TAILQ_INIT(&ch->locked_ranges); 3051 ch->flags = 0; 3052 ch->shared_resource = shared_resource; 3053 3054 TAILQ_INIT(&ch->io_submitted); 3055 TAILQ_INIT(&ch->io_locked); 3056 3057 #ifdef SPDK_CONFIG_VTUNE 3058 { 3059 char *name; 3060 __itt_init_ittlib(NULL, 0); 3061 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3062 if (!name) { 3063 bdev_channel_destroy_resource(ch); 3064 return -1; 3065 } 3066 ch->handle = __itt_string_handle_create(name); 3067 free(name); 3068 ch->start_tsc = spdk_get_ticks(); 3069 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3070 memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); 3071 } 3072 #endif 3073 3074 pthread_mutex_lock(&bdev->internal.mutex); 3075 bdev_enable_qos(bdev, ch); 3076 3077 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3078 struct lba_range *new_range; 3079 3080 new_range = calloc(1, sizeof(*new_range)); 3081 if (new_range == NULL) { 3082 pthread_mutex_unlock(&bdev->internal.mutex); 3083 bdev_channel_destroy_resource(ch); 3084 return -1; 3085 } 3086 new_range->length = range->length; 3087 new_range->offset = range->offset; 3088 new_range->locked_ctx = range->locked_ctx; 3089 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3090 } 3091 3092 pthread_mutex_unlock(&bdev->internal.mutex); 3093 3094 return 0; 3095 } 3096 3097 /* 3098 * Abort I/O that are waiting on a data buffer. These types of I/O are 3099 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 3100 */ 3101 static void 3102 bdev_abort_all_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 3103 { 3104 bdev_io_stailq_t tmp; 3105 struct spdk_bdev_io *bdev_io; 3106 3107 STAILQ_INIT(&tmp); 3108 3109 while (!STAILQ_EMPTY(queue)) { 3110 bdev_io = STAILQ_FIRST(queue); 3111 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 3112 if (bdev_io->internal.ch == ch) { 3113 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3114 } else { 3115 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 3116 } 3117 } 3118 3119 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 3120 } 3121 3122 /* 3123 * Abort I/O that are queued waiting for submission. These types of I/O are 3124 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3125 */ 3126 static void 3127 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3128 { 3129 struct spdk_bdev_io *bdev_io, *tmp; 3130 3131 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3132 if (bdev_io->internal.ch == ch) { 3133 TAILQ_REMOVE(queue, bdev_io, internal.link); 3134 /* 3135 * spdk_bdev_io_complete() assumes that the completed I/O had 3136 * been submitted to the bdev module. Since in this case it 3137 * hadn't, bump io_outstanding to account for the decrement 3138 * that spdk_bdev_io_complete() will do. 
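 * Resets never touch io_outstanding when they are submitted (see
 * bdev_io_submit_reset()), which is why the increment below is skipped for
 * SPDK_BDEV_IO_TYPE_RESET.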
3139 */ 3140 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3141 ch->io_outstanding++; 3142 ch->shared_resource->io_outstanding++; 3143 } 3144 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3145 } 3146 } 3147 } 3148 3149 static bool 3150 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3151 { 3152 struct spdk_bdev_io *bdev_io; 3153 3154 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3155 if (bdev_io == bio_to_abort) { 3156 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3157 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3158 return true; 3159 } 3160 } 3161 3162 return false; 3163 } 3164 3165 static bool 3166 bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3167 { 3168 struct spdk_bdev_io *bdev_io; 3169 3170 STAILQ_FOREACH(bdev_io, queue, internal.buf_link) { 3171 if (bdev_io == bio_to_abort) { 3172 STAILQ_REMOVE(queue, bio_to_abort, spdk_bdev_io, internal.buf_link); 3173 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3174 return true; 3175 } 3176 } 3177 3178 return false; 3179 } 3180 3181 static void 3182 bdev_qos_channel_destroy(void *cb_arg) 3183 { 3184 struct spdk_bdev_qos *qos = cb_arg; 3185 3186 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3187 spdk_poller_unregister(&qos->poller); 3188 3189 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3190 3191 free(qos); 3192 } 3193 3194 static int 3195 bdev_qos_destroy(struct spdk_bdev *bdev) 3196 { 3197 int i; 3198 3199 /* 3200 * Cleanly shutting down the QoS poller is tricky, because 3201 * during the asynchronous operation the user could open 3202 * a new descriptor and create a new channel, spawning 3203 * a new QoS poller. 3204 * 3205 * The strategy is to create a new QoS structure here and swap it 3206 * in. The shutdown path then continues to refer to the old one 3207 * until it completes and then releases it. 3208 */ 3209 struct spdk_bdev_qos *new_qos, *old_qos; 3210 3211 old_qos = bdev->internal.qos; 3212 3213 new_qos = calloc(1, sizeof(*new_qos)); 3214 if (!new_qos) { 3215 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3216 return -ENOMEM; 3217 } 3218 3219 /* Copy the old QoS data into the newly allocated structure */ 3220 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3221 3222 /* Zero out the key parts of the QoS structure */ 3223 new_qos->ch = NULL; 3224 new_qos->thread = NULL; 3225 new_qos->poller = NULL; 3226 TAILQ_INIT(&new_qos->queued); 3227 /* 3228 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3229 * It will be used later for the new QoS structure. 3230 */ 3231 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3232 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3233 new_qos->rate_limits[i].min_per_timeslice = 0; 3234 new_qos->rate_limits[i].max_per_timeslice = 0; 3235 } 3236 3237 bdev->internal.qos = new_qos; 3238 3239 if (old_qos->thread == NULL) { 3240 free(old_qos); 3241 } else { 3242 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3243 } 3244 3245 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3246 * been destroyed yet. The destruction path will end up waiting for the final 3247 * channel to be put before it releases resources. 
*/ 3248 3249 return 0; 3250 } 3251 3252 static void 3253 bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3254 { 3255 total->bytes_read += add->bytes_read; 3256 total->num_read_ops += add->num_read_ops; 3257 total->bytes_written += add->bytes_written; 3258 total->num_write_ops += add->num_write_ops; 3259 total->bytes_unmapped += add->bytes_unmapped; 3260 total->num_unmap_ops += add->num_unmap_ops; 3261 total->read_latency_ticks += add->read_latency_ticks; 3262 total->write_latency_ticks += add->write_latency_ticks; 3263 total->unmap_latency_ticks += add->unmap_latency_ticks; 3264 } 3265 3266 static void 3267 bdev_channel_destroy(void *io_device, void *ctx_buf) 3268 { 3269 struct spdk_bdev_channel *ch = ctx_buf; 3270 struct spdk_bdev_mgmt_channel *mgmt_ch; 3271 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3272 3273 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3274 spdk_get_thread()); 3275 3276 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3277 pthread_mutex_lock(&ch->bdev->internal.mutex); 3278 bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); 3279 pthread_mutex_unlock(&ch->bdev->internal.mutex); 3280 3281 mgmt_ch = shared_resource->mgmt_ch; 3282 3283 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3284 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3285 bdev_abort_all_buf_io(&mgmt_ch->need_buf_small, ch); 3286 bdev_abort_all_buf_io(&mgmt_ch->need_buf_large, ch); 3287 3288 if (ch->histogram) { 3289 spdk_histogram_data_free(ch->histogram); 3290 } 3291 3292 bdev_channel_destroy_resource(ch); 3293 } 3294 3295 /* 3296 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 3297 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
3298 */ 3299 static int 3300 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3301 { 3302 struct spdk_bdev_name *tmp; 3303 3304 bdev_name->name = strdup(name); 3305 if (bdev_name->name == NULL) { 3306 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3307 return -ENOMEM; 3308 } 3309 3310 bdev_name->bdev = bdev; 3311 3312 pthread_mutex_lock(&g_bdev_mgr.mutex); 3313 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3314 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3315 3316 if (tmp != NULL) { 3317 SPDK_ERRLOG("Bdev name %s already exists\n", name); 3318 free(bdev_name->name); 3319 return -EEXIST; 3320 } 3321 3322 return 0; 3323 } 3324 3325 static void 3326 bdev_name_del(struct spdk_bdev_name *bdev_name) 3327 { 3328 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3329 free(bdev_name->name); 3330 } 3331 3332 int 3333 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 3334 { 3335 struct spdk_bdev_alias *tmp; 3336 int ret; 3337 3338 if (alias == NULL) { 3339 SPDK_ERRLOG("Empty alias passed\n"); 3340 return -EINVAL; 3341 } 3342 3343 tmp = calloc(1, sizeof(*tmp)); 3344 if (tmp == NULL) { 3345 SPDK_ERRLOG("Unable to allocate alias\n"); 3346 return -ENOMEM; 3347 } 3348 3349 ret = bdev_name_add(&tmp->alias, bdev, alias); 3350 if (ret != 0) { 3351 free(tmp); 3352 return ret; 3353 } 3354 3355 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 3356 3357 return 0; 3358 } 3359 3360 int 3361 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 3362 { 3363 struct spdk_bdev_alias *tmp; 3364 3365 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 3366 if (strcmp(alias, tmp->alias.name) == 0) { 3367 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 3368 pthread_mutex_lock(&g_bdev_mgr.mutex); 3369 bdev_name_del(&tmp->alias); 3370 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3371 free(tmp); 3372 return 0; 3373 } 3374 } 3375 3376 SPDK_INFOLOG(bdev, "Alias %s does not exists\n", alias); 3377 3378 return -ENOENT; 3379 } 3380 3381 void 3382 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 3383 { 3384 struct spdk_bdev_alias *p, *tmp; 3385 3386 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 3387 TAILQ_REMOVE(&bdev->aliases, p, tailq); 3388 pthread_mutex_lock(&g_bdev_mgr.mutex); 3389 bdev_name_del(&p->alias); 3390 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3391 free(p); 3392 } 3393 } 3394 3395 struct spdk_io_channel * 3396 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 3397 { 3398 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 3399 } 3400 3401 void * 3402 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 3403 { 3404 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3405 void *ctx = NULL; 3406 3407 if (bdev->fn_table->get_module_ctx) { 3408 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 3409 } 3410 3411 return ctx; 3412 } 3413 3414 const char * 3415 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 3416 { 3417 return bdev->module->name; 3418 } 3419 3420 const char * 3421 spdk_bdev_get_name(const struct spdk_bdev *bdev) 3422 { 3423 return bdev->name; 3424 } 3425 3426 const char * 3427 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 3428 { 3429 return bdev->product_name; 3430 } 3431 3432 const struct spdk_bdev_aliases_list * 3433 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 3434 { 3435 return &bdev->aliases; 3436 } 3437 3438 uint32_t 3439 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 3440 { 3441 return bdev->blocklen; 3442 } 3443 3444 uint32_t 3445 
spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 3446 { 3447 return bdev->write_unit_size; 3448 } 3449 3450 uint64_t 3451 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 3452 { 3453 return bdev->blockcnt; 3454 } 3455 3456 const char * 3457 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 3458 { 3459 return qos_rpc_type[type]; 3460 } 3461 3462 void 3463 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 3464 { 3465 int i; 3466 3467 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 3468 3469 pthread_mutex_lock(&bdev->internal.mutex); 3470 if (bdev->internal.qos) { 3471 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3472 if (bdev->internal.qos->rate_limits[i].limit != 3473 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3474 limits[i] = bdev->internal.qos->rate_limits[i].limit; 3475 if (bdev_qos_is_iops_rate_limit(i) == false) { 3476 /* Change from Byte to Megabyte which is user visible. */ 3477 limits[i] = limits[i] / 1024 / 1024; 3478 } 3479 } 3480 } 3481 } 3482 pthread_mutex_unlock(&bdev->internal.mutex); 3483 } 3484 3485 size_t 3486 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 3487 { 3488 return 1 << bdev->required_alignment; 3489 } 3490 3491 uint32_t 3492 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 3493 { 3494 return bdev->optimal_io_boundary; 3495 } 3496 3497 bool 3498 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 3499 { 3500 return bdev->write_cache; 3501 } 3502 3503 const struct spdk_uuid * 3504 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 3505 { 3506 return &bdev->uuid; 3507 } 3508 3509 uint16_t 3510 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 3511 { 3512 return bdev->acwu; 3513 } 3514 3515 uint32_t 3516 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 3517 { 3518 return bdev->md_len; 3519 } 3520 3521 bool 3522 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 3523 { 3524 return (bdev->md_len != 0) && bdev->md_interleave; 3525 } 3526 3527 bool 3528 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 3529 { 3530 return (bdev->md_len != 0) && !bdev->md_interleave; 3531 } 3532 3533 bool 3534 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 3535 { 3536 return bdev->zoned; 3537 } 3538 3539 uint32_t 3540 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 3541 { 3542 if (spdk_bdev_is_md_interleaved(bdev)) { 3543 return bdev->blocklen - bdev->md_len; 3544 } else { 3545 return bdev->blocklen; 3546 } 3547 } 3548 3549 uint32_t 3550 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 3551 { 3552 return bdev->phys_blocklen; 3553 } 3554 3555 static uint32_t 3556 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 3557 { 3558 if (!spdk_bdev_is_md_interleaved(bdev)) { 3559 return bdev->blocklen + bdev->md_len; 3560 } else { 3561 return bdev->blocklen; 3562 } 3563 } 3564 3565 enum spdk_dif_type spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 3566 { 3567 if (bdev->md_len != 0) { 3568 return bdev->dif_type; 3569 } else { 3570 return SPDK_DIF_DISABLE; 3571 } 3572 } 3573 3574 bool 3575 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 3576 { 3577 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 3578 return bdev->dif_is_head_of_md; 3579 } else { 3580 return false; 3581 } 3582 } 3583 3584 bool 3585 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 3586 enum spdk_dif_check_type check_type) 3587 { 3588 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 3589 return false; 3590 } 3591 3592 switch 
(check_type) { 3593 case SPDK_DIF_CHECK_TYPE_REFTAG: 3594 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 3595 case SPDK_DIF_CHECK_TYPE_APPTAG: 3596 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 3597 case SPDK_DIF_CHECK_TYPE_GUARD: 3598 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 3599 default: 3600 return false; 3601 } 3602 } 3603 3604 uint64_t 3605 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 3606 { 3607 return bdev->internal.measured_queue_depth; 3608 } 3609 3610 uint64_t 3611 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 3612 { 3613 return bdev->internal.period; 3614 } 3615 3616 uint64_t 3617 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 3618 { 3619 return bdev->internal.weighted_io_time; 3620 } 3621 3622 uint64_t 3623 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 3624 { 3625 return bdev->internal.io_time; 3626 } 3627 3628 static void 3629 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 3630 { 3631 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3632 3633 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 3634 3635 if (bdev->internal.measured_queue_depth) { 3636 bdev->internal.io_time += bdev->internal.period; 3637 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 3638 } 3639 } 3640 3641 static void 3642 _calculate_measured_qd(struct spdk_io_channel_iter *i) 3643 { 3644 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3645 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 3646 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 3647 3648 bdev->internal.temporary_queue_depth += ch->io_outstanding; 3649 spdk_for_each_channel_continue(i, 0); 3650 } 3651 3652 static int 3653 bdev_calculate_measured_queue_depth(void *ctx) 3654 { 3655 struct spdk_bdev *bdev = ctx; 3656 bdev->internal.temporary_queue_depth = 0; 3657 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 3658 _calculate_measured_qd_cpl); 3659 return SPDK_POLLER_BUSY; 3660 } 3661 3662 void 3663 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 3664 { 3665 bdev->internal.period = period; 3666 3667 if (bdev->internal.qd_poller != NULL) { 3668 spdk_poller_unregister(&bdev->internal.qd_poller); 3669 bdev->internal.measured_queue_depth = UINT64_MAX; 3670 } 3671 3672 if (period != 0) { 3673 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, bdev, 3674 period); 3675 } 3676 } 3677 3678 static void 3679 _resize_notify(void *arg) 3680 { 3681 struct spdk_bdev_desc *desc = arg; 3682 3683 pthread_mutex_lock(&desc->mutex); 3684 desc->refs--; 3685 if (!desc->closed) { 3686 pthread_mutex_unlock(&desc->mutex); 3687 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 3688 desc->bdev, 3689 desc->callback.ctx); 3690 return; 3691 } else if (0 == desc->refs) { 3692 /* This descriptor was closed after this resize_notify message was sent. 3693 * spdk_bdev_close() could not free the descriptor since this message was 3694 * in flight, so we free it now using bdev_desc_free(). 
3695 */ 3696 pthread_mutex_unlock(&desc->mutex); 3697 bdev_desc_free(desc); 3698 return; 3699 } 3700 pthread_mutex_unlock(&desc->mutex); 3701 } 3702 3703 int 3704 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 3705 { 3706 struct spdk_bdev_desc *desc; 3707 int ret; 3708 3709 if (size == bdev->blockcnt) { 3710 return 0; 3711 } 3712 3713 pthread_mutex_lock(&bdev->internal.mutex); 3714 3715 /* bdev has open descriptors */ 3716 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 3717 bdev->blockcnt > size) { 3718 ret = -EBUSY; 3719 } else { 3720 bdev->blockcnt = size; 3721 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 3722 pthread_mutex_lock(&desc->mutex); 3723 if (!desc->closed) { 3724 desc->refs++; 3725 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 3726 } 3727 pthread_mutex_unlock(&desc->mutex); 3728 } 3729 ret = 0; 3730 } 3731 3732 pthread_mutex_unlock(&bdev->internal.mutex); 3733 3734 return ret; 3735 } 3736 3737 /* 3738 * Convert I/O offset and length from bytes to blocks. 3739 * 3740 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 3741 */ 3742 static uint64_t 3743 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 3744 uint64_t num_bytes, uint64_t *num_blocks) 3745 { 3746 uint32_t block_size = bdev->blocklen; 3747 uint8_t shift_cnt; 3748 3749 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 3750 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 3751 shift_cnt = spdk_u32log2(block_size); 3752 *offset_blocks = offset_bytes >> shift_cnt; 3753 *num_blocks = num_bytes >> shift_cnt; 3754 return (offset_bytes - (*offset_blocks << shift_cnt)) | 3755 (num_bytes - (*num_blocks << shift_cnt)); 3756 } else { 3757 *offset_blocks = offset_bytes / block_size; 3758 *num_blocks = num_bytes / block_size; 3759 return (offset_bytes % block_size) | (num_bytes % block_size); 3760 } 3761 } 3762 3763 static bool 3764 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 3765 { 3766 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 3767 * has been an overflow and hence the offset has been wrapped around */ 3768 if (offset_blocks + num_blocks < offset_blocks) { 3769 return false; 3770 } 3771 3772 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 3773 if (offset_blocks + num_blocks > bdev->blockcnt) { 3774 return false; 3775 } 3776 3777 return true; 3778 } 3779 3780 static bool 3781 _bdev_io_check_md_buf(const struct iovec *iovs, const void *md_buf) 3782 { 3783 return _is_buf_allocated(iovs) == (md_buf != NULL); 3784 } 3785 3786 static int 3787 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 3788 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3789 spdk_bdev_io_completion_cb cb, void *cb_arg) 3790 { 3791 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3792 struct spdk_bdev_io *bdev_io; 3793 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3794 3795 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3796 return -EINVAL; 3797 } 3798 3799 bdev_io = bdev_channel_get_io(channel); 3800 if (!bdev_io) { 3801 return -ENOMEM; 3802 } 3803 3804 bdev_io->internal.ch = channel; 3805 bdev_io->internal.desc = desc; 3806 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 3807 bdev_io->u.bdev.iovs = &bdev_io->iov; 3808 bdev_io->u.bdev.iovs[0].iov_base = buf; 3809 
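/* The contiguous-buffer read path reuses the single iovec embedded in the
 * spdk_bdev_io (bdev_io->iov) instead of allocating a separate iovec array;
 * its length is filled in with the full transfer size just below.
 */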
bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 3810 bdev_io->u.bdev.iovcnt = 1; 3811 bdev_io->u.bdev.md_buf = md_buf; 3812 bdev_io->u.bdev.num_blocks = num_blocks; 3813 bdev_io->u.bdev.offset_blocks = offset_blocks; 3814 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3815 3816 bdev_io_submit(bdev_io); 3817 return 0; 3818 } 3819 3820 int 3821 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3822 void *buf, uint64_t offset, uint64_t nbytes, 3823 spdk_bdev_io_completion_cb cb, void *cb_arg) 3824 { 3825 uint64_t offset_blocks, num_blocks; 3826 3827 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3828 nbytes, &num_blocks) != 0) { 3829 return -EINVAL; 3830 } 3831 3832 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 3833 } 3834 3835 int 3836 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3837 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 3838 spdk_bdev_io_completion_cb cb, void *cb_arg) 3839 { 3840 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 3841 } 3842 3843 int 3844 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3845 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3846 spdk_bdev_io_completion_cb cb, void *cb_arg) 3847 { 3848 struct iovec iov = { 3849 .iov_base = buf, 3850 }; 3851 3852 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3853 return -EINVAL; 3854 } 3855 3856 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 3857 return -EINVAL; 3858 } 3859 3860 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 3861 cb, cb_arg); 3862 } 3863 3864 int 3865 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3866 struct iovec *iov, int iovcnt, 3867 uint64_t offset, uint64_t nbytes, 3868 spdk_bdev_io_completion_cb cb, void *cb_arg) 3869 { 3870 uint64_t offset_blocks, num_blocks; 3871 3872 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3873 nbytes, &num_blocks) != 0) { 3874 return -EINVAL; 3875 } 3876 3877 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 3878 } 3879 3880 static int 3881 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3882 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 3883 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 3884 struct spdk_bdev_ext_io_opts *opts) 3885 { 3886 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3887 struct spdk_bdev_io *bdev_io; 3888 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3889 3890 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3891 return -EINVAL; 3892 } 3893 3894 bdev_io = bdev_channel_get_io(channel); 3895 if (!bdev_io) { 3896 return -ENOMEM; 3897 } 3898 3899 bdev_io->internal.ch = channel; 3900 bdev_io->internal.desc = desc; 3901 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 3902 bdev_io->u.bdev.iovs = iov; 3903 bdev_io->u.bdev.iovcnt = iovcnt; 3904 bdev_io->u.bdev.md_buf = md_buf; 3905 bdev_io->u.bdev.num_blocks = num_blocks; 3906 bdev_io->u.bdev.offset_blocks = offset_blocks; 3907 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3908 bdev_io->internal.ext_opts = opts; 3909 3910 bdev_io_submit(bdev_io); 3911 return 0; 3912 } 3913 3914 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3915 struct iovec *iov, int iovcnt, 3916 uint64_t 
offset_blocks, uint64_t num_blocks, 3917 spdk_bdev_io_completion_cb cb, void *cb_arg) 3918 { 3919 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 3920 num_blocks, cb, cb_arg, NULL); 3921 } 3922 3923 int 3924 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3925 struct iovec *iov, int iovcnt, void *md_buf, 3926 uint64_t offset_blocks, uint64_t num_blocks, 3927 spdk_bdev_io_completion_cb cb, void *cb_arg) 3928 { 3929 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3930 return -EINVAL; 3931 } 3932 3933 if (!_bdev_io_check_md_buf(iov, md_buf)) { 3934 return -EINVAL; 3935 } 3936 3937 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 3938 num_blocks, cb, cb_arg, NULL); 3939 } 3940 3941 int 3942 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3943 struct iovec *iov, int iovcnt, 3944 uint64_t offset_blocks, uint64_t num_blocks, 3945 spdk_bdev_io_completion_cb cb, void *cb_arg, 3946 struct spdk_bdev_ext_io_opts *opts) 3947 { 3948 void *md = NULL; 3949 3950 if (opts) { 3951 md = opts->metadata; 3952 } 3953 3954 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3955 return -EINVAL; 3956 } 3957 3958 if (md && !_bdev_io_check_md_buf(iov, md)) { 3959 return -EINVAL; 3960 } 3961 3962 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 3963 num_blocks, cb, cb_arg, opts); 3964 } 3965 3966 static int 3967 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3968 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3969 spdk_bdev_io_completion_cb cb, void *cb_arg) 3970 { 3971 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3972 struct spdk_bdev_io *bdev_io; 3973 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3974 3975 if (!desc->write) { 3976 return -EBADF; 3977 } 3978 3979 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3980 return -EINVAL; 3981 } 3982 3983 bdev_io = bdev_channel_get_io(channel); 3984 if (!bdev_io) { 3985 return -ENOMEM; 3986 } 3987 3988 bdev_io->internal.ch = channel; 3989 bdev_io->internal.desc = desc; 3990 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 3991 bdev_io->u.bdev.iovs = &bdev_io->iov; 3992 bdev_io->u.bdev.iovs[0].iov_base = buf; 3993 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 3994 bdev_io->u.bdev.iovcnt = 1; 3995 bdev_io->u.bdev.md_buf = md_buf; 3996 bdev_io->u.bdev.num_blocks = num_blocks; 3997 bdev_io->u.bdev.offset_blocks = offset_blocks; 3998 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3999 4000 bdev_io_submit(bdev_io); 4001 return 0; 4002 } 4003 4004 int 4005 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4006 void *buf, uint64_t offset, uint64_t nbytes, 4007 spdk_bdev_io_completion_cb cb, void *cb_arg) 4008 { 4009 uint64_t offset_blocks, num_blocks; 4010 4011 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4012 nbytes, &num_blocks) != 0) { 4013 return -EINVAL; 4014 } 4015 4016 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4017 } 4018 4019 int 4020 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4021 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4022 spdk_bdev_io_completion_cb cb, void *cb_arg) 4023 { 4024 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4025 cb, cb_arg); 4026 } 4027 4028 int 4029 
spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4030 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4031 spdk_bdev_io_completion_cb cb, void *cb_arg) 4032 { 4033 struct iovec iov = { 4034 .iov_base = buf, 4035 }; 4036 4037 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4038 return -EINVAL; 4039 } 4040 4041 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4042 return -EINVAL; 4043 } 4044 4045 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4046 cb, cb_arg); 4047 } 4048 4049 static int 4050 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4051 struct iovec *iov, int iovcnt, void *md_buf, 4052 uint64_t offset_blocks, uint64_t num_blocks, 4053 spdk_bdev_io_completion_cb cb, void *cb_arg, 4054 struct spdk_bdev_ext_io_opts *opts) 4055 { 4056 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4057 struct spdk_bdev_io *bdev_io; 4058 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4059 4060 if (!desc->write) { 4061 return -EBADF; 4062 } 4063 4064 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4065 return -EINVAL; 4066 } 4067 4068 bdev_io = bdev_channel_get_io(channel); 4069 if (!bdev_io) { 4070 return -ENOMEM; 4071 } 4072 4073 bdev_io->internal.ch = channel; 4074 bdev_io->internal.desc = desc; 4075 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4076 bdev_io->u.bdev.iovs = iov; 4077 bdev_io->u.bdev.iovcnt = iovcnt; 4078 bdev_io->u.bdev.md_buf = md_buf; 4079 bdev_io->u.bdev.num_blocks = num_blocks; 4080 bdev_io->u.bdev.offset_blocks = offset_blocks; 4081 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4082 bdev_io->internal.ext_opts = opts; 4083 4084 bdev_io_submit(bdev_io); 4085 return 0; 4086 } 4087 4088 int 4089 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4090 struct iovec *iov, int iovcnt, 4091 uint64_t offset, uint64_t len, 4092 spdk_bdev_io_completion_cb cb, void *cb_arg) 4093 { 4094 uint64_t offset_blocks, num_blocks; 4095 4096 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4097 len, &num_blocks) != 0) { 4098 return -EINVAL; 4099 } 4100 4101 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4102 } 4103 4104 int 4105 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4106 struct iovec *iov, int iovcnt, 4107 uint64_t offset_blocks, uint64_t num_blocks, 4108 spdk_bdev_io_completion_cb cb, void *cb_arg) 4109 { 4110 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4111 num_blocks, cb, cb_arg, NULL); 4112 } 4113 4114 int 4115 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4116 struct iovec *iov, int iovcnt, void *md_buf, 4117 uint64_t offset_blocks, uint64_t num_blocks, 4118 spdk_bdev_io_completion_cb cb, void *cb_arg) 4119 { 4120 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4121 return -EINVAL; 4122 } 4123 4124 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4125 return -EINVAL; 4126 } 4127 4128 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4129 num_blocks, cb, cb_arg, NULL); 4130 } 4131 4132 int 4133 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4134 struct iovec *iov, int iovcnt, 4135 uint64_t offset_blocks, uint64_t num_blocks, 4136 spdk_bdev_io_completion_cb cb, void *cb_arg, 4137 struct spdk_bdev_ext_io_opts *opts) 4138 { 4139 void *md = 
NULL; 4140 4141 if (opts) { 4142 md = opts->metadata; 4143 } 4144 4145 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4146 return -EINVAL; 4147 } 4148 4149 if (md && !_bdev_io_check_md_buf(iov, md)) { 4150 return -EINVAL; 4151 } 4152 4153 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4154 num_blocks, cb, cb_arg, opts); 4155 } 4156 4157 static void 4158 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4159 { 4160 struct spdk_bdev_io *parent_io = cb_arg; 4161 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 4162 int i, rc = 0; 4163 4164 if (!success) { 4165 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4166 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4167 spdk_bdev_free_io(bdev_io); 4168 return; 4169 } 4170 4171 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 4172 rc = memcmp(read_buf, 4173 parent_io->u.bdev.iovs[i].iov_base, 4174 parent_io->u.bdev.iovs[i].iov_len); 4175 if (rc) { 4176 break; 4177 } 4178 read_buf += parent_io->u.bdev.iovs[i].iov_len; 4179 } 4180 4181 spdk_bdev_free_io(bdev_io); 4182 4183 if (rc == 0) { 4184 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4185 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 4186 } else { 4187 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 4188 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4189 } 4190 } 4191 4192 static void 4193 bdev_compare_do_read(void *_bdev_io) 4194 { 4195 struct spdk_bdev_io *bdev_io = _bdev_io; 4196 int rc; 4197 4198 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 4199 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 4200 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4201 bdev_compare_do_read_done, bdev_io); 4202 4203 if (rc == -ENOMEM) { 4204 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 4205 } else if (rc != 0) { 4206 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4207 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4208 } 4209 } 4210 4211 static int 4212 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4213 struct iovec *iov, int iovcnt, void *md_buf, 4214 uint64_t offset_blocks, uint64_t num_blocks, 4215 spdk_bdev_io_completion_cb cb, void *cb_arg) 4216 { 4217 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4218 struct spdk_bdev_io *bdev_io; 4219 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4220 4221 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4222 return -EINVAL; 4223 } 4224 4225 bdev_io = bdev_channel_get_io(channel); 4226 if (!bdev_io) { 4227 return -ENOMEM; 4228 } 4229 4230 bdev_io->internal.ch = channel; 4231 bdev_io->internal.desc = desc; 4232 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4233 bdev_io->u.bdev.iovs = iov; 4234 bdev_io->u.bdev.iovcnt = iovcnt; 4235 bdev_io->u.bdev.md_buf = md_buf; 4236 bdev_io->u.bdev.num_blocks = num_blocks; 4237 bdev_io->u.bdev.offset_blocks = offset_blocks; 4238 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4239 4240 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4241 bdev_io_submit(bdev_io); 4242 return 0; 4243 } 4244 4245 bdev_compare_do_read(bdev_io); 4246 4247 return 0; 4248 } 4249 4250 int 4251 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4252 struct iovec *iov, int iovcnt, 4253 uint64_t offset_blocks, uint64_t num_blocks, 4254 spdk_bdev_io_completion_cb cb, void *cb_arg) 
4255 { 4256 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4257 num_blocks, cb, cb_arg); 4258 } 4259 4260 int 4261 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4262 struct iovec *iov, int iovcnt, void *md_buf, 4263 uint64_t offset_blocks, uint64_t num_blocks, 4264 spdk_bdev_io_completion_cb cb, void *cb_arg) 4265 { 4266 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4267 return -EINVAL; 4268 } 4269 4270 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4271 return -EINVAL; 4272 } 4273 4274 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4275 num_blocks, cb, cb_arg); 4276 } 4277 4278 static int 4279 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4280 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4281 spdk_bdev_io_completion_cb cb, void *cb_arg) 4282 { 4283 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4284 struct spdk_bdev_io *bdev_io; 4285 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4286 4287 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4288 return -EINVAL; 4289 } 4290 4291 bdev_io = bdev_channel_get_io(channel); 4292 if (!bdev_io) { 4293 return -ENOMEM; 4294 } 4295 4296 bdev_io->internal.ch = channel; 4297 bdev_io->internal.desc = desc; 4298 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4299 bdev_io->u.bdev.iovs = &bdev_io->iov; 4300 bdev_io->u.bdev.iovs[0].iov_base = buf; 4301 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4302 bdev_io->u.bdev.iovcnt = 1; 4303 bdev_io->u.bdev.md_buf = md_buf; 4304 bdev_io->u.bdev.num_blocks = num_blocks; 4305 bdev_io->u.bdev.offset_blocks = offset_blocks; 4306 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4307 4308 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4309 bdev_io_submit(bdev_io); 4310 return 0; 4311 } 4312 4313 bdev_compare_do_read(bdev_io); 4314 4315 return 0; 4316 } 4317 4318 int 4319 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4320 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4321 spdk_bdev_io_completion_cb cb, void *cb_arg) 4322 { 4323 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4324 cb, cb_arg); 4325 } 4326 4327 int 4328 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4329 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4330 spdk_bdev_io_completion_cb cb, void *cb_arg) 4331 { 4332 struct iovec iov = { 4333 .iov_base = buf, 4334 }; 4335 4336 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4337 return -EINVAL; 4338 } 4339 4340 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4341 return -EINVAL; 4342 } 4343 4344 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4345 cb, cb_arg); 4346 } 4347 4348 static void 4349 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 4350 { 4351 struct spdk_bdev_io *bdev_io = ctx; 4352 4353 if (unlock_status) { 4354 SPDK_ERRLOG("LBA range unlock failed\n"); 4355 } 4356 4357 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
true : 4358 false, bdev_io->internal.caller_ctx); 4359 } 4360 4361 static void 4362 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 4363 { 4364 bdev_io->internal.status = status; 4365 4366 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 4367 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4368 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 4369 } 4370 4371 static void 4372 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4373 { 4374 struct spdk_bdev_io *parent_io = cb_arg; 4375 4376 if (!success) { 4377 SPDK_ERRLOG("Compare and write operation failed\n"); 4378 } 4379 4380 spdk_bdev_free_io(bdev_io); 4381 4382 bdev_comparev_and_writev_blocks_unlock(parent_io, 4383 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 4384 } 4385 4386 static void 4387 bdev_compare_and_write_do_write(void *_bdev_io) 4388 { 4389 struct spdk_bdev_io *bdev_io = _bdev_io; 4390 int rc; 4391 4392 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 4393 spdk_io_channel_from_ctx(bdev_io->internal.ch), 4394 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 4395 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4396 bdev_compare_and_write_do_write_done, bdev_io); 4397 4398 4399 if (rc == -ENOMEM) { 4400 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 4401 } else if (rc != 0) { 4402 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 4403 } 4404 } 4405 4406 static void 4407 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4408 { 4409 struct spdk_bdev_io *parent_io = cb_arg; 4410 4411 spdk_bdev_free_io(bdev_io); 4412 4413 if (!success) { 4414 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 4415 return; 4416 } 4417 4418 bdev_compare_and_write_do_write(parent_io); 4419 } 4420 4421 static void 4422 bdev_compare_and_write_do_compare(void *_bdev_io) 4423 { 4424 struct spdk_bdev_io *bdev_io = _bdev_io; 4425 int rc; 4426 4427 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 4428 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 4429 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4430 bdev_compare_and_write_do_compare_done, bdev_io); 4431 4432 if (rc == -ENOMEM) { 4433 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 4434 } else if (rc != 0) { 4435 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 4436 } 4437 } 4438 4439 static void 4440 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 4441 { 4442 struct spdk_bdev_io *bdev_io = ctx; 4443 4444 if (status) { 4445 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 4446 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4447 return; 4448 } 4449 4450 bdev_compare_and_write_do_compare(bdev_io); 4451 } 4452 4453 int 4454 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4455 struct iovec *compare_iov, int compare_iovcnt, 4456 struct iovec *write_iov, int write_iovcnt, 4457 uint64_t offset_blocks, uint64_t num_blocks, 4458 spdk_bdev_io_completion_cb cb, void *cb_arg) 4459 { 4460 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4461 struct spdk_bdev_io *bdev_io; 4462 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4463 4464 if (!desc->write) { 4465 return 
-EBADF; 4466 } 4467 4468 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4469 return -EINVAL; 4470 } 4471 4472 if (num_blocks > bdev->acwu) { 4473 return -EINVAL; 4474 } 4475 4476 bdev_io = bdev_channel_get_io(channel); 4477 if (!bdev_io) { 4478 return -ENOMEM; 4479 } 4480 4481 bdev_io->internal.ch = channel; 4482 bdev_io->internal.desc = desc; 4483 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 4484 bdev_io->u.bdev.iovs = compare_iov; 4485 bdev_io->u.bdev.iovcnt = compare_iovcnt; 4486 bdev_io->u.bdev.fused_iovs = write_iov; 4487 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 4488 bdev_io->u.bdev.md_buf = NULL; 4489 bdev_io->u.bdev.num_blocks = num_blocks; 4490 bdev_io->u.bdev.offset_blocks = offset_blocks; 4491 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4492 4493 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 4494 bdev_io_submit(bdev_io); 4495 return 0; 4496 } 4497 4498 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 4499 bdev_comparev_and_writev_blocks_locked, bdev_io); 4500 } 4501 4502 int 4503 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4504 struct iovec *iov, int iovcnt, 4505 uint64_t offset_blocks, uint64_t num_blocks, 4506 bool populate, 4507 spdk_bdev_io_completion_cb cb, void *cb_arg) 4508 { 4509 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4510 struct spdk_bdev_io *bdev_io; 4511 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4512 4513 if (!desc->write) { 4514 return -EBADF; 4515 } 4516 4517 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4518 return -EINVAL; 4519 } 4520 4521 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 4522 return -ENOTSUP; 4523 } 4524 4525 bdev_io = bdev_channel_get_io(channel); 4526 if (!bdev_io) { 4527 return -ENOMEM; 4528 } 4529 4530 bdev_io->internal.ch = channel; 4531 bdev_io->internal.desc = desc; 4532 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 4533 bdev_io->u.bdev.num_blocks = num_blocks; 4534 bdev_io->u.bdev.offset_blocks = offset_blocks; 4535 bdev_io->u.bdev.iovs = iov; 4536 bdev_io->u.bdev.iovcnt = iovcnt; 4537 bdev_io->u.bdev.md_buf = NULL; 4538 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 4539 bdev_io->u.bdev.zcopy.commit = 0; 4540 bdev_io->u.bdev.zcopy.start = 1; 4541 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4542 4543 bdev_io_submit(bdev_io); 4544 4545 return 0; 4546 } 4547 4548 int 4549 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 4550 spdk_bdev_io_completion_cb cb, void *cb_arg) 4551 { 4552 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 4553 return -EINVAL; 4554 } 4555 4556 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 4557 bdev_io->u.bdev.zcopy.start = 0; 4558 bdev_io->internal.caller_ctx = cb_arg; 4559 bdev_io->internal.cb = cb; 4560 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 4561 4562 bdev_io_submit(bdev_io); 4563 4564 return 0; 4565 } 4566 4567 int 4568 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4569 uint64_t offset, uint64_t len, 4570 spdk_bdev_io_completion_cb cb, void *cb_arg) 4571 { 4572 uint64_t offset_blocks, num_blocks; 4573 4574 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4575 len, &num_blocks) != 0) { 4576 return -EINVAL; 4577 } 4578 4579 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4580 } 4581 4582 int 4583 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4584 uint64_t offset_blocks, uint64_t num_blocks, 4585 spdk_bdev_io_completion_cb cb, void *cb_arg) 4586 { 4587 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4588 struct spdk_bdev_io *bdev_io; 4589 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4590 4591 if (!desc->write) { 4592 return -EBADF; 4593 } 4594 4595 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4596 return -EINVAL; 4597 } 4598 4599 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 4600 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 4601 return -ENOTSUP; 4602 } 4603 4604 bdev_io = bdev_channel_get_io(channel); 4605 4606 if (!bdev_io) { 4607 return -ENOMEM; 4608 } 4609 4610 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 4611 bdev_io->internal.ch = channel; 4612 bdev_io->internal.desc = desc; 4613 bdev_io->u.bdev.offset_blocks = offset_blocks; 4614 bdev_io->u.bdev.num_blocks = num_blocks; 4615 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4616 4617 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 4618 bdev_io_submit(bdev_io); 4619 return 0; 4620 } 4621 4622 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 4623 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 4624 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 4625 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 4626 bdev_write_zero_buffer_next(bdev_io); 4627 4628 return 0; 4629 } 4630 4631 int 4632 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4633 uint64_t offset, uint64_t nbytes, 4634 spdk_bdev_io_completion_cb cb, void *cb_arg) 4635 { 4636 uint64_t offset_blocks, num_blocks; 4637 4638 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4639 nbytes, &num_blocks) != 0) { 4640 return -EINVAL; 4641 } 4642 4643 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4644 } 4645 4646 int 4647 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4648 uint64_t offset_blocks, uint64_t num_blocks, 4649 spdk_bdev_io_completion_cb cb, void *cb_arg) 4650 { 4651 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4652 struct spdk_bdev_io *bdev_io; 4653 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4654 4655 if (!desc->write) { 4656 return -EBADF; 4657 } 4658 4659 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4660 return -EINVAL; 4661 } 4662 4663 if (num_blocks == 0) { 4664 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 4665 return -EINVAL; 4666 } 4667 4668 bdev_io = bdev_channel_get_io(channel); 4669 if (!bdev_io) { 4670 return -ENOMEM; 4671 } 4672 4673 bdev_io->internal.ch = channel; 4674 
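/*
 * Illustrative sketch (not part of this file): zeroing a range with
 * spdk_bdev_write_zeroes_blocks() above. The descriptor/channel arguments,
 * the callback name, and the range are assumed application code; whether the
 * request is served by a native WRITE_ZEROES or emulated with regular writes
 * of the shared zero buffer is decided inside the bdev layer and is
 * transparent to the caller.
 */
static void
example_zero_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
	SPDK_NOTICELOG("write zeroes %s\n", success ? "succeeded" : "failed");
}

static int
example_zero_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
{
	/* Zero 1024 blocks starting at block 0; returns -ENOMEM if no
	 * spdk_bdev_io is currently available (see spdk_bdev_queue_io_wait). */
	return spdk_bdev_write_zeroes_blocks(desc, ch, 0, 1024, example_zero_done, NULL);
}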
bdev_io->internal.desc = desc; 4675 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 4676 4677 bdev_io->u.bdev.iovs = &bdev_io->iov; 4678 bdev_io->u.bdev.iovs[0].iov_base = NULL; 4679 bdev_io->u.bdev.iovs[0].iov_len = 0; 4680 bdev_io->u.bdev.iovcnt = 1; 4681 4682 bdev_io->u.bdev.offset_blocks = offset_blocks; 4683 bdev_io->u.bdev.num_blocks = num_blocks; 4684 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4685 4686 bdev_io_submit(bdev_io); 4687 return 0; 4688 } 4689 4690 int 4691 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4692 uint64_t offset, uint64_t length, 4693 spdk_bdev_io_completion_cb cb, void *cb_arg) 4694 { 4695 uint64_t offset_blocks, num_blocks; 4696 4697 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4698 length, &num_blocks) != 0) { 4699 return -EINVAL; 4700 } 4701 4702 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4703 } 4704 4705 int 4706 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4707 uint64_t offset_blocks, uint64_t num_blocks, 4708 spdk_bdev_io_completion_cb cb, void *cb_arg) 4709 { 4710 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4711 struct spdk_bdev_io *bdev_io; 4712 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4713 4714 if (!desc->write) { 4715 return -EBADF; 4716 } 4717 4718 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4719 return -EINVAL; 4720 } 4721 4722 bdev_io = bdev_channel_get_io(channel); 4723 if (!bdev_io) { 4724 return -ENOMEM; 4725 } 4726 4727 bdev_io->internal.ch = channel; 4728 bdev_io->internal.desc = desc; 4729 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 4730 bdev_io->u.bdev.iovs = NULL; 4731 bdev_io->u.bdev.iovcnt = 0; 4732 bdev_io->u.bdev.offset_blocks = offset_blocks; 4733 bdev_io->u.bdev.num_blocks = num_blocks; 4734 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4735 4736 bdev_io_submit(bdev_io); 4737 return 0; 4738 } 4739 4740 static void 4741 bdev_reset_dev(struct spdk_io_channel_iter *i, int status) 4742 { 4743 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 4744 struct spdk_bdev_io *bdev_io; 4745 4746 bdev_io = TAILQ_FIRST(&ch->queued_resets); 4747 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 4748 bdev_io_submit_reset(bdev_io); 4749 } 4750 4751 static void 4752 bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 4753 { 4754 struct spdk_io_channel *ch; 4755 struct spdk_bdev_channel *channel; 4756 struct spdk_bdev_mgmt_channel *mgmt_channel; 4757 struct spdk_bdev_shared_resource *shared_resource; 4758 bdev_io_tailq_t tmp_queued; 4759 4760 TAILQ_INIT(&tmp_queued); 4761 4762 ch = spdk_io_channel_iter_get_channel(i); 4763 channel = spdk_io_channel_get_ctx(ch); 4764 shared_resource = channel->shared_resource; 4765 mgmt_channel = shared_resource->mgmt_ch; 4766 4767 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 4768 4769 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 4770 /* The QoS object is always valid and readable while 4771 * the channel flag is set, so the lock here should not 4772 * be necessary. We're not in the fast path though, so 4773 * just take it anyway. 
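/*
 * Illustrative sketch (not part of this file): trimming and then flushing a
 * range with the block-based APIs above. The range, the callback, and the
 * decision to ignore intermediate return codes are assumptions of this
 * example; both calls require a descriptor opened for writing and may return
 * -ENOMEM when the channel has no spdk_bdev_io available.
 */
static void
example_trim_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
}

static void
example_trim_and_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
{
	/* Unmap blocks [0, 256); num_blocks == 0 is rejected with -EINVAL. */
	spdk_bdev_unmap_blocks(desc, ch, 0, 256, example_trim_done, NULL);

	/* Flush the same range so any volatile write cache is persisted. */
	spdk_bdev_flush_blocks(desc, ch, 0, 256, example_trim_done, NULL);
}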
*/ 4774 pthread_mutex_lock(&channel->bdev->internal.mutex); 4775 if (channel->bdev->internal.qos->ch == channel) { 4776 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 4777 } 4778 pthread_mutex_unlock(&channel->bdev->internal.mutex); 4779 } 4780 4781 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 4782 bdev_abort_all_buf_io(&mgmt_channel->need_buf_small, channel); 4783 bdev_abort_all_buf_io(&mgmt_channel->need_buf_large, channel); 4784 bdev_abort_all_queued_io(&tmp_queued, channel); 4785 4786 spdk_for_each_channel_continue(i, 0); 4787 } 4788 4789 static void 4790 bdev_start_reset(void *ctx) 4791 { 4792 struct spdk_bdev_channel *ch = ctx; 4793 4794 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel, 4795 ch, bdev_reset_dev); 4796 } 4797 4798 static void 4799 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 4800 { 4801 struct spdk_bdev *bdev = ch->bdev; 4802 4803 assert(!TAILQ_EMPTY(&ch->queued_resets)); 4804 4805 pthread_mutex_lock(&bdev->internal.mutex); 4806 if (bdev->internal.reset_in_progress == NULL) { 4807 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 4808 /* 4809 * Take a channel reference for the target bdev for the life of this 4810 * reset. This guards against the channel getting destroyed while 4811 * spdk_for_each_channel() calls related to this reset IO are in 4812 * progress. We will release the reference when this reset is 4813 * completed. 4814 */ 4815 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4816 bdev_start_reset(ch); 4817 } 4818 pthread_mutex_unlock(&bdev->internal.mutex); 4819 } 4820 4821 int 4822 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4823 spdk_bdev_io_completion_cb cb, void *cb_arg) 4824 { 4825 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4826 struct spdk_bdev_io *bdev_io; 4827 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4828 4829 bdev_io = bdev_channel_get_io(channel); 4830 if (!bdev_io) { 4831 return -ENOMEM; 4832 } 4833 4834 bdev_io->internal.ch = channel; 4835 bdev_io->internal.desc = desc; 4836 bdev_io->internal.submit_tsc = spdk_get_ticks(); 4837 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 4838 bdev_io->u.reset.ch_ref = NULL; 4839 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4840 4841 pthread_mutex_lock(&bdev->internal.mutex); 4842 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 4843 pthread_mutex_unlock(&bdev->internal.mutex); 4844 4845 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 4846 internal.ch_link); 4847 4848 bdev_channel_start_reset(channel); 4849 4850 return 0; 4851 } 4852 4853 void 4854 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 4855 struct spdk_bdev_io_stat *stat) 4856 { 4857 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4858 4859 *stat = channel->stat; 4860 } 4861 4862 static void 4863 bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 4864 { 4865 void *io_device = spdk_io_channel_iter_get_io_device(i); 4866 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 4867 4868 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 4869 bdev_iostat_ctx->cb_arg, 0); 4870 free(bdev_iostat_ctx); 4871 } 4872 4873 static void 4874 bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 4875 { 4876 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 4877 struct 
spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 4878 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4879 4880 bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 4881 spdk_for_each_channel_continue(i, 0); 4882 } 4883 4884 void 4885 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 4886 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 4887 { 4888 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 4889 4890 assert(bdev != NULL); 4891 assert(stat != NULL); 4892 assert(cb != NULL); 4893 4894 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 4895 if (bdev_iostat_ctx == NULL) { 4896 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 4897 cb(bdev, stat, cb_arg, -ENOMEM); 4898 return; 4899 } 4900 4901 bdev_iostat_ctx->stat = stat; 4902 bdev_iostat_ctx->cb = cb; 4903 bdev_iostat_ctx->cb_arg = cb_arg; 4904 4905 /* Start with the statistics from previously deleted channels. */ 4906 pthread_mutex_lock(&bdev->internal.mutex); 4907 bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 4908 pthread_mutex_unlock(&bdev->internal.mutex); 4909 4910 /* Then iterate and add the statistics from each existing channel. */ 4911 spdk_for_each_channel(__bdev_to_io_dev(bdev), 4912 bdev_get_each_channel_stat, 4913 bdev_iostat_ctx, 4914 bdev_get_device_stat_done); 4915 } 4916 4917 int 4918 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4919 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 4920 spdk_bdev_io_completion_cb cb, void *cb_arg) 4921 { 4922 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4923 struct spdk_bdev_io *bdev_io; 4924 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4925 4926 if (!desc->write) { 4927 return -EBADF; 4928 } 4929 4930 bdev_io = bdev_channel_get_io(channel); 4931 if (!bdev_io) { 4932 return -ENOMEM; 4933 } 4934 4935 bdev_io->internal.ch = channel; 4936 bdev_io->internal.desc = desc; 4937 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 4938 bdev_io->u.nvme_passthru.cmd = *cmd; 4939 bdev_io->u.nvme_passthru.buf = buf; 4940 bdev_io->u.nvme_passthru.nbytes = nbytes; 4941 bdev_io->u.nvme_passthru.md_buf = NULL; 4942 bdev_io->u.nvme_passthru.md_len = 0; 4943 4944 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4945 4946 bdev_io_submit(bdev_io); 4947 return 0; 4948 } 4949 4950 int 4951 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4952 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 4953 spdk_bdev_io_completion_cb cb, void *cb_arg) 4954 { 4955 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4956 struct spdk_bdev_io *bdev_io; 4957 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4958 4959 if (!desc->write) { 4960 /* 4961 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 4962 * to easily determine if the command is a read or write, but for now just 4963 * do not allow io_passthru with a read-only descriptor. 
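/*
 * Illustrative sketch (not part of this file): aggregating per-channel I/O
 * statistics with spdk_bdev_get_device_stat() above. The caller owns the
 * spdk_bdev_io_stat storage until the callback fires; the callback name and
 * the way the result is logged are assumptions of this example.
 */
static void
example_stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, void *cb_arg, int rc)
{
	if (rc == 0) {
		SPDK_NOTICELOG("%s: %" PRIu64 " reads, %" PRIu64 " writes\n",
			       spdk_bdev_get_name(bdev), stat->num_read_ops, stat->num_write_ops);
	}
	free(stat);
}

static void
example_dump_stats(struct spdk_bdev *bdev)
{
	struct spdk_bdev_io_stat *stat = calloc(1, sizeof(*stat));

	if (stat == NULL) {
		return;
	}
	/* The completion runs after every existing channel has contributed its
	 * counters on top of the statistics of previously deleted channels. */
	spdk_bdev_get_device_stat(bdev, stat, example_stat_done, NULL);
}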
4964 */ 4965 return -EBADF; 4966 } 4967 4968 bdev_io = bdev_channel_get_io(channel); 4969 if (!bdev_io) { 4970 return -ENOMEM; 4971 } 4972 4973 bdev_io->internal.ch = channel; 4974 bdev_io->internal.desc = desc; 4975 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 4976 bdev_io->u.nvme_passthru.cmd = *cmd; 4977 bdev_io->u.nvme_passthru.buf = buf; 4978 bdev_io->u.nvme_passthru.nbytes = nbytes; 4979 bdev_io->u.nvme_passthru.md_buf = NULL; 4980 bdev_io->u.nvme_passthru.md_len = 0; 4981 4982 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4983 4984 bdev_io_submit(bdev_io); 4985 return 0; 4986 } 4987 4988 int 4989 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4990 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 4991 spdk_bdev_io_completion_cb cb, void *cb_arg) 4992 { 4993 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4994 struct spdk_bdev_io *bdev_io; 4995 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4996 4997 if (!desc->write) { 4998 /* 4999 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5000 * to easily determine if the command is a read or write, but for now just 5001 * do not allow io_passthru with a read-only descriptor. 5002 */ 5003 return -EBADF; 5004 } 5005 5006 bdev_io = bdev_channel_get_io(channel); 5007 if (!bdev_io) { 5008 return -ENOMEM; 5009 } 5010 5011 bdev_io->internal.ch = channel; 5012 bdev_io->internal.desc = desc; 5013 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 5014 bdev_io->u.nvme_passthru.cmd = *cmd; 5015 bdev_io->u.nvme_passthru.buf = buf; 5016 bdev_io->u.nvme_passthru.nbytes = nbytes; 5017 bdev_io->u.nvme_passthru.md_buf = md_buf; 5018 bdev_io->u.nvme_passthru.md_len = md_len; 5019 5020 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5021 5022 bdev_io_submit(bdev_io); 5023 return 0; 5024 } 5025 5026 static void bdev_abort_retry(void *ctx); 5027 static void bdev_abort(struct spdk_bdev_io *parent_io); 5028 5029 static void 5030 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5031 { 5032 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 5033 struct spdk_bdev_io *parent_io = cb_arg; 5034 struct spdk_bdev_io *bio_to_abort, *tmp_io; 5035 5036 bio_to_abort = bdev_io->u.abort.bio_to_abort; 5037 5038 spdk_bdev_free_io(bdev_io); 5039 5040 if (!success) { 5041 /* Check if the target I/O completed in the meantime. */ 5042 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 5043 if (tmp_io == bio_to_abort) { 5044 break; 5045 } 5046 } 5047 5048 /* If the target I/O still exists, set the parent to failed. */ 5049 if (tmp_io != NULL) { 5050 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5051 } 5052 } 5053 5054 parent_io->u.bdev.split_outstanding--; 5055 if (parent_io->u.bdev.split_outstanding == 0) { 5056 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5057 bdev_abort_retry(parent_io); 5058 } else { 5059 bdev_io_complete(parent_io); 5060 } 5061 } 5062 } 5063 5064 static int 5065 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 5066 struct spdk_bdev_io *bio_to_abort, 5067 spdk_bdev_io_completion_cb cb, void *cb_arg) 5068 { 5069 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5070 struct spdk_bdev_io *bdev_io; 5071 5072 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 5073 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 5074 /* TODO: Abort reset or abort request. 
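/*
 * Illustrative sketch (not part of this file): issuing an Identify Controller
 * admin command through the NVMe passthru path above. This only works when
 * the underlying module supports SPDK_BDEV_IO_TYPE_NVME_ADMIN (e.g. the NVMe
 * bdev); the 4096-byte buffer, its spdk_dma_zmalloc() allocation, and the
 * expectation that the completion callback frees it with spdk_dma_free() are
 * choices of this example.
 */
static int
example_identify_ctrlr(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_nvme_cmd cmd = {};
	void *buf = spdk_dma_zmalloc(4096, 0x1000, NULL);

	if (buf == NULL) {
		return -ENOMEM;
	}

	cmd.opc = SPDK_NVME_OPC_IDENTIFY;
	cmd.cdw10 = 1;	/* CNS 1: identify controller */

	/* The descriptor must be opened for writing; read-only descriptors are
	 * rejected with -EBADF by spdk_bdev_nvme_admin_passthru(). */
	return spdk_bdev_nvme_admin_passthru(desc, ch, &cmd, buf, 4096, cb, cb_arg);
}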
 */
		return -ENOTSUP;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (bdev_io == NULL) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) {
		bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort;

		/* Parent abort request is not submitted directly, but to manage its
		 * execution add it to the submitted list here.
		 */
		bdev_io->internal.submit_tsc = spdk_get_ticks();
		TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);

		bdev_abort(bdev_io);

		return 0;
	}

	bdev_io->u.abort.bio_to_abort = bio_to_abort;

	/* Submit the abort request to the underlying bdev module. */
	bdev_io_submit(bdev_io);

	return 0;
}

static uint32_t
_bdev_abort(struct spdk_bdev_io *parent_io)
{
	struct spdk_bdev_desc *desc = parent_io->internal.desc;
	struct spdk_bdev_channel *channel = parent_io->internal.ch;
	void *bio_cb_arg;
	struct spdk_bdev_io *bio_to_abort;
	uint32_t matched_ios;
	int rc;

	bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;

	/* matched_ios is returned and will be kept by the caller.
	 *
	 * This function will be used for two cases: 1) the same cb_arg is used for
	 * multiple I/Os, and 2) a single large I/O is split into smaller ones.
	 * Incrementing split_outstanding directly here may confuse readers,
	 * especially for the 1st case.
	 *
	 * Completion of I/O abort is processed after stack unwinding. Hence this trick
	 * works as expected.
	 */
	matched_ios = 0;
	parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;

	TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
		if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
			continue;
		}

		if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
			/* Any I/O which was submitted after this abort command should be excluded. */
			continue;
		}

		rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
		if (rc != 0) {
			if (rc == -ENOMEM) {
				parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
			} else {
				parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
			}
			break;
		}
		matched_ios++;
	}

	return matched_ios;
}

static void
bdev_abort_retry(void *ctx)
{
	struct spdk_bdev_io *parent_io = ctx;
	uint32_t matched_ios;

	matched_ios = _bdev_abort(parent_io);

	if (matched_ios == 0) {
		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
		} else {
			/* For retry, the case that no target I/O was found is a success
			 * because it means the target I/Os completed in the meantime.
			 */
			bdev_io_complete(parent_io);
		}
		return;
	}

	/* Use split_outstanding to manage the progress of aborting I/Os. */
	parent_io->u.bdev.split_outstanding = matched_ios;
}

static void
bdev_abort(struct spdk_bdev_io *parent_io)
{
	uint32_t matched_ios;

	matched_ios = _bdev_abort(parent_io);

	if (matched_ios == 0) {
		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
		} else {
			/* The case where no target I/O was found is a failure. */
			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
			bdev_io_complete(parent_io);
		}
		return;
	}

	/* Use split_outstanding to manage the progress of aborting I/Os. */
	parent_io->u.bdev.split_outstanding = matched_ios;
}

int
spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		void *bio_cb_arg,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	struct spdk_bdev_io *bdev_io;

	if (bio_cb_arg == NULL) {
		return -EINVAL;
	}

	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
		return -ENOTSUP;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (bdev_io == NULL) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->internal.submit_tsc = spdk_get_ticks();
	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;

	/* Parent abort request is not submitted directly, but to manage its execution,
	 * add it to the submitted list here.
	 */
	TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);

	bdev_abort(bdev_io);

	return 0;
}

int
spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
			struct spdk_bdev_io_wait_entry *entry)
{
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch;

	if (bdev != entry->bdev) {
		SPDK_ERRLOG("bdevs do not match\n");
		return -EINVAL;
	}

	if (mgmt_ch->per_thread_cache_count > 0) {
		SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n");
		return -EINVAL;
	}

	TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link);
	return 0;
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
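/*
 * Illustrative sketch (not part of this file): the retry pattern that
 * spdk_bdev_queue_io_wait() above is intended for. When a submission returns
 * -ENOMEM, the caller parks a wait entry and resubmits once an spdk_bdev_io
 * is returned to this channel. The example_ctx layout and the read parameters
 * are assumptions of this example.
 */
struct example_ctx {
	struct spdk_bdev_desc		*desc;
	struct spdk_io_channel		*ch;
	void				*buf;
	struct spdk_bdev_io_wait_entry	wait_entry;
};

static void example_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);

static void
example_submit_read(void *arg)
{
	struct example_ctx *ctx = arg;
	int rc;

	rc = spdk_bdev_read_blocks(ctx->desc, ctx->ch, ctx->buf, 0, 1, example_read_done, ctx);
	if (rc == -ENOMEM) {
		/* No spdk_bdev_io available right now: wait for one to be freed,
		 * then this function is called again to resubmit. */
		ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(ctx->desc);
		ctx->wait_entry.cb_fn = example_submit_read;
		ctx->wait_entry.cb_arg = ctx;
		spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, ctx->ch, &ctx->wait_entry);
	}
}

static void
example_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
}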
5282 */ 5283 return; 5284 } 5285 5286 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 5287 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 5288 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 5289 bdev_io->internal.ch->io_outstanding++; 5290 shared_resource->io_outstanding++; 5291 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5292 bdev_io->internal.error.nvme.cdw0 = 0; 5293 bdev_io->num_retries++; 5294 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 5295 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5296 break; 5297 } 5298 } 5299 } 5300 5301 static inline void 5302 bdev_io_complete(void *ctx) 5303 { 5304 struct spdk_bdev_io *bdev_io = ctx; 5305 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5306 uint64_t tsc, tsc_diff; 5307 5308 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 5309 /* 5310 * Send the completion to the thread that originally submitted the I/O, 5311 * which may not be the current thread in the case of QoS. 5312 */ 5313 if (bdev_io->internal.io_submit_ch) { 5314 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 5315 bdev_io->internal.io_submit_ch = NULL; 5316 } 5317 5318 /* 5319 * Defer completion to avoid potential infinite recursion if the 5320 * user's completion callback issues a new I/O. 5321 */ 5322 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 5323 bdev_io_complete, bdev_io); 5324 return; 5325 } 5326 5327 tsc = spdk_get_ticks(); 5328 tsc_diff = tsc - bdev_io->internal.submit_tsc; 5329 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 5330 bdev_io->internal.caller_ctx); 5331 5332 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 5333 5334 if (bdev_io->internal.ch->histogram) { 5335 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 5336 } 5337 5338 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5339 switch (bdev_io->type) { 5340 case SPDK_BDEV_IO_TYPE_READ: 5341 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5342 bdev_io->internal.ch->stat.num_read_ops++; 5343 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5344 break; 5345 case SPDK_BDEV_IO_TYPE_WRITE: 5346 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5347 bdev_io->internal.ch->stat.num_write_ops++; 5348 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5349 break; 5350 case SPDK_BDEV_IO_TYPE_UNMAP: 5351 bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5352 bdev_io->internal.ch->stat.num_unmap_ops++; 5353 bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; 5354 break; 5355 case SPDK_BDEV_IO_TYPE_ZCOPY: 5356 /* Track the data in the start phase only */ 5357 if (bdev_io->u.bdev.zcopy.start) { 5358 if (bdev_io->u.bdev.zcopy.populate) { 5359 bdev_io->internal.ch->stat.bytes_read += 5360 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5361 bdev_io->internal.ch->stat.num_read_ops++; 5362 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5363 } else { 5364 bdev_io->internal.ch->stat.bytes_written += 5365 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5366 bdev_io->internal.ch->stat.num_write_ops++; 5367 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5368 } 5369 } 5370 break; 5371 default: 5372 break; 5373 } 5374 } 5375 5376 #ifdef SPDK_CONFIG_VTUNE 5377 uint64_t now_tsc = spdk_get_ticks(); 5378 if 
(now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 5379 uint64_t data[5]; 5380 5381 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 5382 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 5383 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 5384 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 5385 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 5386 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 5387 5388 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 5389 __itt_metadata_u64, 5, data); 5390 5391 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 5392 bdev_io->internal.ch->start_tsc = now_tsc; 5393 } 5394 #endif 5395 5396 assert(bdev_io->internal.cb != NULL); 5397 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 5398 5399 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 5400 bdev_io->internal.caller_ctx); 5401 } 5402 5403 static void 5404 bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 5405 { 5406 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 5407 5408 if (bdev_io->u.reset.ch_ref != NULL) { 5409 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 5410 bdev_io->u.reset.ch_ref = NULL; 5411 } 5412 5413 bdev_io_complete(bdev_io); 5414 } 5415 5416 static void 5417 bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 5418 { 5419 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 5420 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5421 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 5422 struct spdk_bdev_io *queued_reset; 5423 5424 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 5425 while (!TAILQ_EMPTY(&ch->queued_resets)) { 5426 queued_reset = TAILQ_FIRST(&ch->queued_resets); 5427 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 5428 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 5429 } 5430 5431 spdk_for_each_channel_continue(i, 0); 5432 } 5433 5434 void 5435 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 5436 { 5437 struct spdk_bdev *bdev = bdev_io->bdev; 5438 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5439 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 5440 5441 bdev_io->internal.status = status; 5442 5443 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 5444 bool unlock_channels = false; 5445 5446 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 5447 SPDK_ERRLOG("NOMEM returned for reset\n"); 5448 } 5449 pthread_mutex_lock(&bdev->internal.mutex); 5450 if (bdev_io == bdev->internal.reset_in_progress) { 5451 bdev->internal.reset_in_progress = NULL; 5452 unlock_channels = true; 5453 } 5454 pthread_mutex_unlock(&bdev->internal.mutex); 5455 5456 if (unlock_channels) { 5457 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel, 5458 bdev_io, bdev_reset_complete); 5459 return; 5460 } 5461 } else { 5462 _bdev_io_unset_bounce_buf(bdev_io); 5463 5464 assert(bdev_ch->io_outstanding > 0); 5465 assert(shared_resource->io_outstanding > 0); 5466 bdev_ch->io_outstanding--; 5467 shared_resource->io_outstanding--; 5468 5469 if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { 5470 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, 
internal.link); 5471 /* 5472 * Wait for some of the outstanding I/O to complete before we 5473 * retry any of the nomem_io. Normally we will wait for 5474 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 5475 * depth channels we will instead wait for half to complete. 5476 */ 5477 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 5478 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 5479 return; 5480 } 5481 5482 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 5483 bdev_ch_retry_io(bdev_ch); 5484 } 5485 } 5486 5487 bdev_io_complete(bdev_io); 5488 } 5489 5490 void 5491 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 5492 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 5493 { 5494 if (sc == SPDK_SCSI_STATUS_GOOD) { 5495 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5496 } else { 5497 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 5498 bdev_io->internal.error.scsi.sc = sc; 5499 bdev_io->internal.error.scsi.sk = sk; 5500 bdev_io->internal.error.scsi.asc = asc; 5501 bdev_io->internal.error.scsi.ascq = ascq; 5502 } 5503 5504 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5505 } 5506 5507 void 5508 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 5509 int *sc, int *sk, int *asc, int *ascq) 5510 { 5511 assert(sc != NULL); 5512 assert(sk != NULL); 5513 assert(asc != NULL); 5514 assert(ascq != NULL); 5515 5516 switch (bdev_io->internal.status) { 5517 case SPDK_BDEV_IO_STATUS_SUCCESS: 5518 *sc = SPDK_SCSI_STATUS_GOOD; 5519 *sk = SPDK_SCSI_SENSE_NO_SENSE; 5520 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5521 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5522 break; 5523 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 5524 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 5525 break; 5526 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 5527 *sc = bdev_io->internal.error.scsi.sc; 5528 *sk = bdev_io->internal.error.scsi.sk; 5529 *asc = bdev_io->internal.error.scsi.asc; 5530 *ascq = bdev_io->internal.error.scsi.ascq; 5531 break; 5532 default: 5533 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 5534 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 5535 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5536 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5537 break; 5538 } 5539 } 5540 5541 void 5542 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 5543 { 5544 if (aio_result == 0) { 5545 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5546 } else { 5547 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 5548 } 5549 5550 bdev_io->internal.error.aio_result = aio_result; 5551 5552 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5553 } 5554 5555 void 5556 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 5557 { 5558 assert(aio_result != NULL); 5559 5560 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 5561 *aio_result = bdev_io->internal.error.aio_result; 5562 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5563 *aio_result = 0; 5564 } else { 5565 *aio_result = -EIO; 5566 } 5567 } 5568 5569 void 5570 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 5571 { 5572 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 5573 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5574 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 5575 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_ABORTED; 5576 } else { 5577 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 5578 } 5579 5580 bdev_io->internal.error.nvme.cdw0 = cdw0; 5581 bdev_io->internal.error.nvme.sct = sct; 5582 bdev_io->internal.error.nvme.sc = sc; 5583 5584 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5585 } 5586 5587 void 5588 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 5589 { 5590 assert(sct != NULL); 5591 assert(sc != NULL); 5592 assert(cdw0 != NULL); 5593 5594 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 5595 *sct = bdev_io->internal.error.nvme.sct; 5596 *sc = bdev_io->internal.error.nvme.sc; 5597 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5598 *sct = SPDK_NVME_SCT_GENERIC; 5599 *sc = SPDK_NVME_SC_SUCCESS; 5600 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 5601 *sct = SPDK_NVME_SCT_GENERIC; 5602 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 5603 } else { 5604 *sct = SPDK_NVME_SCT_GENERIC; 5605 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5606 } 5607 5608 *cdw0 = bdev_io->internal.error.nvme.cdw0; 5609 } 5610 5611 void 5612 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 5613 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 5614 { 5615 assert(first_sct != NULL); 5616 assert(first_sc != NULL); 5617 assert(second_sct != NULL); 5618 assert(second_sc != NULL); 5619 assert(cdw0 != NULL); 5620 5621 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 5622 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 5623 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 5624 *first_sct = bdev_io->internal.error.nvme.sct; 5625 *first_sc = bdev_io->internal.error.nvme.sc; 5626 *second_sct = SPDK_NVME_SCT_GENERIC; 5627 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5628 } else { 5629 *first_sct = SPDK_NVME_SCT_GENERIC; 5630 *first_sc = SPDK_NVME_SC_SUCCESS; 5631 *second_sct = bdev_io->internal.error.nvme.sct; 5632 *second_sc = bdev_io->internal.error.nvme.sc; 5633 } 5634 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5635 *first_sct = SPDK_NVME_SCT_GENERIC; 5636 *first_sc = SPDK_NVME_SC_SUCCESS; 5637 *second_sct = SPDK_NVME_SCT_GENERIC; 5638 *second_sc = SPDK_NVME_SC_SUCCESS; 5639 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 5640 *first_sct = SPDK_NVME_SCT_GENERIC; 5641 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5642 *second_sct = SPDK_NVME_SCT_GENERIC; 5643 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5644 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 5645 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 5646 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 5647 *second_sct = SPDK_NVME_SCT_GENERIC; 5648 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5649 } else { 5650 *first_sct = SPDK_NVME_SCT_GENERIC; 5651 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5652 *second_sct = SPDK_NVME_SCT_GENERIC; 5653 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5654 } 5655 5656 *cdw0 = bdev_io->internal.error.nvme.cdw0; 5657 } 5658 5659 struct spdk_thread * 5660 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 5661 { 5662 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 5663 } 5664 5665 struct spdk_io_channel * 5666 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 5667 { 5668 return bdev_io->internal.ch->channel; 5669 } 5670 5671 static int 5672 bdev_register(struct 
spdk_bdev *bdev) 5673 { 5674 char *bdev_name; 5675 int ret; 5676 5677 assert(bdev->module != NULL); 5678 5679 if (!bdev->name) { 5680 SPDK_ERRLOG("Bdev name is NULL\n"); 5681 return -EINVAL; 5682 } 5683 5684 if (!strlen(bdev->name)) { 5685 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 5686 return -EINVAL; 5687 } 5688 5689 /* Users often register their own I/O devices using the bdev name. In 5690 * order to avoid conflicts, prepend bdev_. */ 5691 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 5692 if (!bdev_name) { 5693 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 5694 return -ENOMEM; 5695 } 5696 5697 bdev->internal.status = SPDK_BDEV_STATUS_READY; 5698 bdev->internal.measured_queue_depth = UINT64_MAX; 5699 bdev->internal.claim_module = NULL; 5700 bdev->internal.qd_poller = NULL; 5701 bdev->internal.qos = NULL; 5702 5703 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 5704 if (ret != 0) { 5705 free(bdev_name); 5706 return ret; 5707 } 5708 5709 /* If the user didn't specify a uuid, generate one. */ 5710 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 5711 spdk_uuid_generate(&bdev->uuid); 5712 } 5713 5714 if (spdk_bdev_get_buf_align(bdev) > 1) { 5715 if (bdev->split_on_optimal_io_boundary) { 5716 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 5717 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 5718 } else { 5719 bdev->split_on_optimal_io_boundary = true; 5720 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 5721 } 5722 } 5723 5724 /* If the user didn't specify a write unit size, set it to one. */ 5725 if (bdev->write_unit_size == 0) { 5726 bdev->write_unit_size = 1; 5727 } 5728 5729 /* Set ACWU value to 1 if bdev module did not set it (does not support it natively) */ 5730 if (bdev->acwu == 0) { 5731 bdev->acwu = 1; 5732 } 5733 5734 if (bdev->phys_blocklen == 0) { 5735 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 5736 } 5737 5738 TAILQ_INIT(&bdev->internal.open_descs); 5739 TAILQ_INIT(&bdev->internal.locked_ranges); 5740 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 5741 5742 TAILQ_INIT(&bdev->aliases); 5743 5744 bdev->internal.reset_in_progress = NULL; 5745 5746 spdk_io_device_register(__bdev_to_io_dev(bdev), 5747 bdev_channel_create, bdev_channel_destroy, 5748 sizeof(struct spdk_bdev_channel), 5749 bdev_name); 5750 5751 free(bdev_name); 5752 5753 pthread_mutex_init(&bdev->internal.mutex, NULL); 5754 5755 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 5756 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 5757 5758 return 0; 5759 } 5760 5761 static void 5762 bdev_destroy_cb(void *io_device) 5763 { 5764 int rc; 5765 struct spdk_bdev *bdev; 5766 spdk_bdev_unregister_cb cb_fn; 5767 void *cb_arg; 5768 5769 bdev = __bdev_from_io_dev(io_device); 5770 cb_fn = bdev->internal.unregister_cb; 5771 cb_arg = bdev->internal.unregister_ctx; 5772 5773 pthread_mutex_destroy(&bdev->internal.mutex); 5774 free(bdev->internal.qos); 5775 5776 rc = bdev->fn_table->destruct(bdev->ctxt); 5777 if (rc < 0) { 5778 SPDK_ERRLOG("destruct failed\n"); 5779 } 5780 if (rc <= 0 && cb_fn != NULL) { 5781 cb_fn(cb_arg, rc); 5782 } 5783 } 5784 5785 static void 5786 bdev_register_finished(void *arg) 5787 { 5788 struct spdk_bdev *bdev = arg; 5789 5790 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 5791 } 5792 5793 int 5794 spdk_bdev_register(struct spdk_bdev *bdev) 5795 { 5796 int rc = bdev_register(bdev); 5797 5798 if (rc == 0) { 5799 /* Examine configuration 
before initializing I/O */ 5800 bdev_examine(bdev); 5801 5802 spdk_bdev_wait_for_examine(bdev_register_finished, bdev); 5803 } 5804 5805 return rc; 5806 } 5807 5808 void 5809 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 5810 { 5811 if (bdev->internal.unregister_cb != NULL) { 5812 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 5813 } 5814 } 5815 5816 static void 5817 _remove_notify(void *arg) 5818 { 5819 struct spdk_bdev_desc *desc = arg; 5820 5821 pthread_mutex_lock(&desc->mutex); 5822 desc->refs--; 5823 5824 if (!desc->closed) { 5825 pthread_mutex_unlock(&desc->mutex); 5826 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 5827 return; 5828 } else if (0 == desc->refs) { 5829 /* This descriptor was closed after this remove_notify message was sent. 5830 * spdk_bdev_close() could not free the descriptor since this message was 5831 * in flight, so we free it now using bdev_desc_free(). 5832 */ 5833 pthread_mutex_unlock(&desc->mutex); 5834 bdev_desc_free(desc); 5835 return; 5836 } 5837 pthread_mutex_unlock(&desc->mutex); 5838 } 5839 5840 /* Must be called while holding bdev->internal.mutex. 5841 * returns: 0 - bdev removed and ready to be destructed. 5842 * -EBUSY - bdev can't be destructed yet. */ 5843 static int 5844 bdev_unregister_unsafe(struct spdk_bdev *bdev) 5845 { 5846 struct spdk_bdev_desc *desc, *tmp; 5847 int rc = 0; 5848 5849 /* Notify each descriptor about hotremoval */ 5850 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 5851 rc = -EBUSY; 5852 pthread_mutex_lock(&desc->mutex); 5853 /* 5854 * Defer invocation of the event_cb to a separate message that will 5855 * run later on its thread. This ensures this context unwinds and 5856 * we don't recursively unregister this bdev again if the event_cb 5857 * immediately closes its descriptor. 5858 */ 5859 desc->refs++; 5860 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 5861 pthread_mutex_unlock(&desc->mutex); 5862 } 5863 5864 /* If there are no descriptors, proceed removing the bdev */ 5865 if (rc == 0) { 5866 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 5867 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 5868 bdev_name_del(&bdev->internal.bdev_name); 5869 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 5870 } 5871 5872 return rc; 5873 } 5874 5875 void 5876 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 5877 { 5878 struct spdk_thread *thread; 5879 int rc; 5880 5881 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 5882 5883 thread = spdk_get_thread(); 5884 if (!thread) { 5885 /* The user called this from a non-SPDK thread. */ 5886 if (cb_fn != NULL) { 5887 cb_fn(cb_arg, -ENOTSUP); 5888 } 5889 return; 5890 } 5891 5892 pthread_mutex_lock(&g_bdev_mgr.mutex); 5893 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 5894 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5895 if (cb_fn) { 5896 cb_fn(cb_arg, -EBUSY); 5897 } 5898 return; 5899 } 5900 5901 pthread_mutex_lock(&bdev->internal.mutex); 5902 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 5903 bdev->internal.unregister_cb = cb_fn; 5904 bdev->internal.unregister_ctx = cb_arg; 5905 5906 /* Call under lock. 
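	 * Both g_bdev_mgr.mutex and bdev->internal.mutex are held at this point;
	 * bdev_unregister_unsafe() walks the open descriptors and, if none remain,
	 * removes the bdev from the global list.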
*/ 5907 rc = bdev_unregister_unsafe(bdev); 5908 pthread_mutex_unlock(&bdev->internal.mutex); 5909 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5910 5911 if (rc == 0) { 5912 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 5913 } 5914 } 5915 5916 static int 5917 bdev_start_qos(struct spdk_bdev *bdev) 5918 { 5919 struct set_qos_limit_ctx *ctx; 5920 5921 /* Enable QoS */ 5922 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 5923 ctx = calloc(1, sizeof(*ctx)); 5924 if (ctx == NULL) { 5925 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 5926 return -ENOMEM; 5927 } 5928 ctx->bdev = bdev; 5929 spdk_for_each_channel(__bdev_to_io_dev(bdev), 5930 bdev_enable_qos_msg, ctx, 5931 bdev_enable_qos_done); 5932 } 5933 5934 return 0; 5935 } 5936 5937 static int 5938 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 5939 { 5940 struct spdk_thread *thread; 5941 int rc = 0; 5942 5943 thread = spdk_get_thread(); 5944 if (!thread) { 5945 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 5946 return -ENOTSUP; 5947 } 5948 5949 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 5950 spdk_get_thread()); 5951 5952 desc->bdev = bdev; 5953 desc->thread = thread; 5954 desc->write = write; 5955 5956 pthread_mutex_lock(&bdev->internal.mutex); 5957 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 5958 pthread_mutex_unlock(&bdev->internal.mutex); 5959 return -ENODEV; 5960 } 5961 5962 if (write && bdev->internal.claim_module) { 5963 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 5964 bdev->name, bdev->internal.claim_module->name); 5965 pthread_mutex_unlock(&bdev->internal.mutex); 5966 return -EPERM; 5967 } 5968 5969 rc = bdev_start_qos(bdev); 5970 if (rc != 0) { 5971 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 5972 pthread_mutex_unlock(&bdev->internal.mutex); 5973 return rc; 5974 } 5975 5976 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 5977 5978 pthread_mutex_unlock(&bdev->internal.mutex); 5979 5980 return 0; 5981 } 5982 5983 int 5984 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 5985 void *event_ctx, struct spdk_bdev_desc **_desc) 5986 { 5987 struct spdk_bdev_desc *desc; 5988 struct spdk_bdev *bdev; 5989 unsigned int event_id; 5990 int rc; 5991 5992 if (event_cb == NULL) { 5993 SPDK_ERRLOG("Missing event callback function\n"); 5994 return -EINVAL; 5995 } 5996 5997 pthread_mutex_lock(&g_bdev_mgr.mutex); 5998 5999 bdev = bdev_get_by_name(bdev_name); 6000 6001 if (bdev == NULL) { 6002 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 6003 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6004 return -ENODEV; 6005 } 6006 6007 desc = calloc(1, sizeof(*desc)); 6008 if (desc == NULL) { 6009 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 6010 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6011 return -ENOMEM; 6012 } 6013 6014 TAILQ_INIT(&desc->pending_media_events); 6015 TAILQ_INIT(&desc->free_media_events); 6016 6017 desc->callback.event_fn = event_cb; 6018 desc->callback.ctx = event_ctx; 6019 pthread_mutex_init(&desc->mutex, NULL); 6020 6021 if (bdev->media_events) { 6022 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 6023 sizeof(*desc->media_events_buffer)); 6024 if (desc->media_events_buffer == NULL) { 6025 SPDK_ERRLOG("Failed to initialize media event pool\n"); 6026 bdev_desc_free(desc); 6027 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6028 return -ENOMEM; 6029 } 6030 6031 for 
(event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 6032 TAILQ_INSERT_TAIL(&desc->free_media_events, 6033 &desc->media_events_buffer[event_id], tailq); 6034 } 6035 } 6036 6037 rc = bdev_open(bdev, write, desc); 6038 if (rc != 0) { 6039 bdev_desc_free(desc); 6040 desc = NULL; 6041 } 6042 6043 *_desc = desc; 6044 6045 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6046 6047 return rc; 6048 } 6049 6050 void 6051 spdk_bdev_close(struct spdk_bdev_desc *desc) 6052 { 6053 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6054 int rc; 6055 6056 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6057 spdk_get_thread()); 6058 6059 assert(desc->thread == spdk_get_thread()); 6060 6061 spdk_poller_unregister(&desc->io_timeout_poller); 6062 6063 pthread_mutex_lock(&g_bdev_mgr.mutex); 6064 pthread_mutex_lock(&bdev->internal.mutex); 6065 pthread_mutex_lock(&desc->mutex); 6066 6067 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 6068 6069 desc->closed = true; 6070 6071 if (0 == desc->refs) { 6072 pthread_mutex_unlock(&desc->mutex); 6073 bdev_desc_free(desc); 6074 } else { 6075 pthread_mutex_unlock(&desc->mutex); 6076 } 6077 6078 /* If no more descriptors, kill QoS channel */ 6079 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6080 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 6081 bdev->name, spdk_get_thread()); 6082 6083 if (bdev_qos_destroy(bdev)) { 6084 /* There isn't anything we can do to recover here. Just let the 6085 * old QoS poller keep running. The QoS handling won't change 6086 * cores when the user allocates a new channel, but it won't break. */ 6087 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 6088 } 6089 } 6090 6091 spdk_bdev_set_qd_sampling_period(bdev, 0); 6092 6093 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6094 rc = bdev_unregister_unsafe(bdev); 6095 pthread_mutex_unlock(&bdev->internal.mutex); 6096 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6097 6098 if (rc == 0) { 6099 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6100 } 6101 } else { 6102 pthread_mutex_unlock(&bdev->internal.mutex); 6103 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6104 } 6105 } 6106 6107 int 6108 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 6109 struct spdk_bdev_module *module) 6110 { 6111 if (bdev->internal.claim_module != NULL) { 6112 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 6113 bdev->internal.claim_module->name); 6114 return -EPERM; 6115 } 6116 6117 if (desc && !desc->write) { 6118 desc->write = true; 6119 } 6120 6121 bdev->internal.claim_module = module; 6122 return 0; 6123 } 6124 6125 void 6126 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 6127 { 6128 assert(bdev->internal.claim_module != NULL); 6129 bdev->internal.claim_module = NULL; 6130 } 6131 6132 struct spdk_bdev * 6133 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 6134 { 6135 assert(desc != NULL); 6136 return desc->bdev; 6137 } 6138 6139 void 6140 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 6141 { 6142 struct iovec *iovs; 6143 int iovcnt; 6144 6145 if (bdev_io == NULL) { 6146 return; 6147 } 6148 6149 switch (bdev_io->type) { 6150 case SPDK_BDEV_IO_TYPE_READ: 6151 case SPDK_BDEV_IO_TYPE_WRITE: 6152 case SPDK_BDEV_IO_TYPE_ZCOPY: 6153 iovs = bdev_io->u.bdev.iovs; 6154 iovcnt = 
bdev_io->u.bdev.iovcnt; 6155 break; 6156 default: 6157 iovs = NULL; 6158 iovcnt = 0; 6159 break; 6160 } 6161 6162 if (iovp) { 6163 *iovp = iovs; 6164 } 6165 if (iovcntp) { 6166 *iovcntp = iovcnt; 6167 } 6168 } 6169 6170 void * 6171 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 6172 { 6173 if (bdev_io == NULL) { 6174 return NULL; 6175 } 6176 6177 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 6178 return NULL; 6179 } 6180 6181 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 6182 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 6183 return bdev_io->u.bdev.md_buf; 6184 } 6185 6186 return NULL; 6187 } 6188 6189 void * 6190 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 6191 { 6192 if (bdev_io == NULL) { 6193 assert(false); 6194 return NULL; 6195 } 6196 6197 return bdev_io->internal.caller_ctx; 6198 } 6199 6200 void 6201 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 6202 { 6203 6204 if (spdk_bdev_module_list_find(bdev_module->name)) { 6205 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 6206 assert(false); 6207 } 6208 6209 /* 6210 * Modules with examine callbacks must be initialized first, so they are 6211 * ready to handle examine callbacks from later modules that will 6212 * register physical bdevs. 6213 */ 6214 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 6215 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 6216 } else { 6217 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 6218 } 6219 } 6220 6221 struct spdk_bdev_module * 6222 spdk_bdev_module_list_find(const char *name) 6223 { 6224 struct spdk_bdev_module *bdev_module; 6225 6226 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 6227 if (strcmp(name, bdev_module->name) == 0) { 6228 break; 6229 } 6230 } 6231 6232 return bdev_module; 6233 } 6234 6235 static void 6236 bdev_write_zero_buffer_next(void *_bdev_io) 6237 { 6238 struct spdk_bdev_io *bdev_io = _bdev_io; 6239 uint64_t num_bytes, num_blocks; 6240 void *md_buf = NULL; 6241 int rc; 6242 6243 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 6244 bdev_io->u.bdev.split_remaining_num_blocks, 6245 ZERO_BUFFER_SIZE); 6246 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 6247 6248 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 6249 md_buf = (char *)g_bdev_mgr.zero_buffer + 6250 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 6251 } 6252 6253 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 6254 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6255 g_bdev_mgr.zero_buffer, md_buf, 6256 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 6257 bdev_write_zero_buffer_done, bdev_io); 6258 if (rc == 0) { 6259 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 6260 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 6261 } else if (rc == -ENOMEM) { 6262 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 6263 } else { 6264 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6265 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6266 } 6267 } 6268 6269 static void 6270 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6271 { 6272 struct spdk_bdev_io *parent_io = cb_arg; 6273 6274 spdk_bdev_free_io(bdev_io); 6275 6276 if (!success) { 6277 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6278 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6279 return; 6280 } 6281 6282 if 
(parent_io->u.bdev.split_remaining_num_blocks == 0) { 6283 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6284 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 6285 return; 6286 } 6287 6288 bdev_write_zero_buffer_next(parent_io); 6289 } 6290 6291 static void 6292 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 6293 { 6294 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6295 ctx->bdev->internal.qos_mod_in_progress = false; 6296 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6297 6298 if (ctx->cb_fn) { 6299 ctx->cb_fn(ctx->cb_arg, status); 6300 } 6301 free(ctx); 6302 } 6303 6304 static void 6305 bdev_disable_qos_done(void *cb_arg) 6306 { 6307 struct set_qos_limit_ctx *ctx = cb_arg; 6308 struct spdk_bdev *bdev = ctx->bdev; 6309 struct spdk_bdev_io *bdev_io; 6310 struct spdk_bdev_qos *qos; 6311 6312 pthread_mutex_lock(&bdev->internal.mutex); 6313 qos = bdev->internal.qos; 6314 bdev->internal.qos = NULL; 6315 pthread_mutex_unlock(&bdev->internal.mutex); 6316 6317 while (!TAILQ_EMPTY(&qos->queued)) { 6318 /* Send queued I/O back to their original thread for resubmission. */ 6319 bdev_io = TAILQ_FIRST(&qos->queued); 6320 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 6321 6322 if (bdev_io->internal.io_submit_ch) { 6323 /* 6324 * Channel was changed when sending it to the QoS thread - change it back 6325 * before sending it back to the original thread. 6326 */ 6327 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 6328 bdev_io->internal.io_submit_ch = NULL; 6329 } 6330 6331 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6332 _bdev_io_submit, bdev_io); 6333 } 6334 6335 if (qos->thread != NULL) { 6336 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 6337 spdk_poller_unregister(&qos->poller); 6338 } 6339 6340 free(qos); 6341 6342 bdev_set_qos_limit_done(ctx, 0); 6343 } 6344 6345 static void 6346 bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 6347 { 6348 void *io_device = spdk_io_channel_iter_get_io_device(i); 6349 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6350 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6351 struct spdk_thread *thread; 6352 6353 pthread_mutex_lock(&bdev->internal.mutex); 6354 thread = bdev->internal.qos->thread; 6355 pthread_mutex_unlock(&bdev->internal.mutex); 6356 6357 if (thread != NULL) { 6358 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 6359 } else { 6360 bdev_disable_qos_done(ctx); 6361 } 6362 } 6363 6364 static void 6365 bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 6366 { 6367 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6368 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6369 6370 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 6371 6372 spdk_for_each_channel_continue(i, 0); 6373 } 6374 6375 static void 6376 bdev_update_qos_rate_limit_msg(void *cb_arg) 6377 { 6378 struct set_qos_limit_ctx *ctx = cb_arg; 6379 struct spdk_bdev *bdev = ctx->bdev; 6380 6381 pthread_mutex_lock(&bdev->internal.mutex); 6382 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 6383 pthread_mutex_unlock(&bdev->internal.mutex); 6384 6385 bdev_set_qos_limit_done(ctx, 0); 6386 } 6387 6388 static void 6389 bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 6390 { 6391 void *io_device = spdk_io_channel_iter_get_io_device(i); 6392 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6393 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6394 struct spdk_bdev_channel *bdev_ch = 
spdk_io_channel_get_ctx(ch); 6395 6396 pthread_mutex_lock(&bdev->internal.mutex); 6397 bdev_enable_qos(bdev, bdev_ch); 6398 pthread_mutex_unlock(&bdev->internal.mutex); 6399 spdk_for_each_channel_continue(i, 0); 6400 } 6401 6402 static void 6403 bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 6404 { 6405 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6406 6407 bdev_set_qos_limit_done(ctx, status); 6408 } 6409 6410 static void 6411 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 6412 { 6413 int i; 6414 6415 assert(bdev->internal.qos != NULL); 6416 6417 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6418 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6419 bdev->internal.qos->rate_limits[i].limit = limits[i]; 6420 6421 if (limits[i] == 0) { 6422 bdev->internal.qos->rate_limits[i].limit = 6423 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 6424 } 6425 } 6426 } 6427 } 6428 6429 void 6430 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 6431 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 6432 { 6433 struct set_qos_limit_ctx *ctx; 6434 uint32_t limit_set_complement; 6435 uint64_t min_limit_per_sec; 6436 int i; 6437 bool disable_rate_limit = true; 6438 6439 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6440 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6441 continue; 6442 } 6443 6444 if (limits[i] > 0) { 6445 disable_rate_limit = false; 6446 } 6447 6448 if (bdev_qos_is_iops_rate_limit(i) == true) { 6449 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 6450 } else { 6451 /* Change from megabyte to byte rate limit */ 6452 limits[i] = limits[i] * 1024 * 1024; 6453 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 6454 } 6455 6456 limit_set_complement = limits[i] % min_limit_per_sec; 6457 if (limit_set_complement) { 6458 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 6459 limits[i], min_limit_per_sec); 6460 limits[i] += min_limit_per_sec - limit_set_complement; 6461 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 6462 } 6463 } 6464 6465 ctx = calloc(1, sizeof(*ctx)); 6466 if (ctx == NULL) { 6467 cb_fn(cb_arg, -ENOMEM); 6468 return; 6469 } 6470 6471 ctx->cb_fn = cb_fn; 6472 ctx->cb_arg = cb_arg; 6473 ctx->bdev = bdev; 6474 6475 pthread_mutex_lock(&bdev->internal.mutex); 6476 if (bdev->internal.qos_mod_in_progress) { 6477 pthread_mutex_unlock(&bdev->internal.mutex); 6478 free(ctx); 6479 cb_fn(cb_arg, -EAGAIN); 6480 return; 6481 } 6482 bdev->internal.qos_mod_in_progress = true; 6483 6484 if (disable_rate_limit == true && bdev->internal.qos) { 6485 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6486 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 6487 (bdev->internal.qos->rate_limits[i].limit > 0 && 6488 bdev->internal.qos->rate_limits[i].limit != 6489 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 6490 disable_rate_limit = false; 6491 break; 6492 } 6493 } 6494 } 6495 6496 if (disable_rate_limit == false) { 6497 if (bdev->internal.qos == NULL) { 6498 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 6499 if (!bdev->internal.qos) { 6500 pthread_mutex_unlock(&bdev->internal.mutex); 6501 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 6502 bdev_set_qos_limit_done(ctx, -ENOMEM); 6503 return; 6504 } 6505 } 6506 6507 if (bdev->internal.qos->thread == NULL) { 6508 /* Enabling */ 6509 bdev_set_qos_rate_limits(bdev, limits); 6510 6511 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6512 bdev_enable_qos_msg, ctx, 6513 
bdev_enable_qos_done); 6514 } else { 6515 /* Updating */ 6516 bdev_set_qos_rate_limits(bdev, limits); 6517 6518 spdk_thread_send_msg(bdev->internal.qos->thread, 6519 bdev_update_qos_rate_limit_msg, ctx); 6520 } 6521 } else { 6522 if (bdev->internal.qos != NULL) { 6523 bdev_set_qos_rate_limits(bdev, limits); 6524 6525 /* Disabling */ 6526 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6527 bdev_disable_qos_msg, ctx, 6528 bdev_disable_qos_msg_done); 6529 } else { 6530 pthread_mutex_unlock(&bdev->internal.mutex); 6531 bdev_set_qos_limit_done(ctx, 0); 6532 return; 6533 } 6534 } 6535 6536 pthread_mutex_unlock(&bdev->internal.mutex); 6537 } 6538 6539 struct spdk_bdev_histogram_ctx { 6540 spdk_bdev_histogram_status_cb cb_fn; 6541 void *cb_arg; 6542 struct spdk_bdev *bdev; 6543 int status; 6544 }; 6545 6546 static void 6547 bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status) 6548 { 6549 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6550 6551 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6552 ctx->bdev->internal.histogram_in_progress = false; 6553 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6554 ctx->cb_fn(ctx->cb_arg, ctx->status); 6555 free(ctx); 6556 } 6557 6558 static void 6559 bdev_histogram_disable_channel(struct spdk_io_channel_iter *i) 6560 { 6561 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6562 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6563 6564 if (ch->histogram != NULL) { 6565 spdk_histogram_data_free(ch->histogram); 6566 ch->histogram = NULL; 6567 } 6568 spdk_for_each_channel_continue(i, 0); 6569 } 6570 6571 static void 6572 bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status) 6573 { 6574 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6575 6576 if (status != 0) { 6577 ctx->status = status; 6578 ctx->bdev->internal.histogram_enabled = false; 6579 spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx, 6580 bdev_histogram_disable_channel_cb); 6581 } else { 6582 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6583 ctx->bdev->internal.histogram_in_progress = false; 6584 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6585 ctx->cb_fn(ctx->cb_arg, ctx->status); 6586 free(ctx); 6587 } 6588 } 6589 6590 static void 6591 bdev_histogram_enable_channel(struct spdk_io_channel_iter *i) 6592 { 6593 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6594 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6595 int status = 0; 6596 6597 if (ch->histogram == NULL) { 6598 ch->histogram = spdk_histogram_data_alloc(); 6599 if (ch->histogram == NULL) { 6600 status = -ENOMEM; 6601 } 6602 } 6603 6604 spdk_for_each_channel_continue(i, status); 6605 } 6606 6607 void 6608 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 6609 void *cb_arg, bool enable) 6610 { 6611 struct spdk_bdev_histogram_ctx *ctx; 6612 6613 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 6614 if (ctx == NULL) { 6615 cb_fn(cb_arg, -ENOMEM); 6616 return; 6617 } 6618 6619 ctx->bdev = bdev; 6620 ctx->status = 0; 6621 ctx->cb_fn = cb_fn; 6622 ctx->cb_arg = cb_arg; 6623 6624 pthread_mutex_lock(&bdev->internal.mutex); 6625 if (bdev->internal.histogram_in_progress) { 6626 pthread_mutex_unlock(&bdev->internal.mutex); 6627 free(ctx); 6628 cb_fn(cb_arg, -EAGAIN); 6629 return; 6630 } 6631 6632 bdev->internal.histogram_in_progress = true; 6633 pthread_mutex_unlock(&bdev->internal.mutex); 6634 6635 
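	/* Record the requested state on the bdev itself before fanning the
	 * per-channel histogram allocation (or teardown) out to each existing
	 * channel below.
	 */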
bdev->internal.histogram_enabled = enable; 6636 6637 if (enable) { 6638 /* Allocate histogram for each channel */ 6639 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx, 6640 bdev_histogram_enable_channel_cb); 6641 } else { 6642 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx, 6643 bdev_histogram_disable_channel_cb); 6644 } 6645 } 6646 6647 struct spdk_bdev_histogram_data_ctx { 6648 spdk_bdev_histogram_data_cb cb_fn; 6649 void *cb_arg; 6650 struct spdk_bdev *bdev; 6651 /** merged histogram data from all channels */ 6652 struct spdk_histogram_data *histogram; 6653 }; 6654 6655 static void 6656 bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status) 6657 { 6658 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6659 6660 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 6661 free(ctx); 6662 } 6663 6664 static void 6665 bdev_histogram_get_channel(struct spdk_io_channel_iter *i) 6666 { 6667 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6668 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6669 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6670 int status = 0; 6671 6672 if (ch->histogram == NULL) { 6673 status = -EFAULT; 6674 } else { 6675 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 6676 } 6677 6678 spdk_for_each_channel_continue(i, status); 6679 } 6680 6681 void 6682 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 6683 spdk_bdev_histogram_data_cb cb_fn, 6684 void *cb_arg) 6685 { 6686 struct spdk_bdev_histogram_data_ctx *ctx; 6687 6688 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 6689 if (ctx == NULL) { 6690 cb_fn(cb_arg, -ENOMEM, NULL); 6691 return; 6692 } 6693 6694 ctx->bdev = bdev; 6695 ctx->cb_fn = cb_fn; 6696 ctx->cb_arg = cb_arg; 6697 6698 ctx->histogram = histogram; 6699 6700 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx, 6701 bdev_histogram_get_channel_cb); 6702 } 6703 6704 size_t 6705 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 6706 size_t max_events) 6707 { 6708 struct media_event_entry *entry; 6709 size_t num_events = 0; 6710 6711 for (; num_events < max_events; ++num_events) { 6712 entry = TAILQ_FIRST(&desc->pending_media_events); 6713 if (entry == NULL) { 6714 break; 6715 } 6716 6717 events[num_events] = entry->event; 6718 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 6719 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 6720 } 6721 6722 return num_events; 6723 } 6724 6725 int 6726 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 6727 size_t num_events) 6728 { 6729 struct spdk_bdev_desc *desc; 6730 struct media_event_entry *entry; 6731 size_t event_id; 6732 int rc = 0; 6733 6734 assert(bdev->media_events); 6735 6736 pthread_mutex_lock(&bdev->internal.mutex); 6737 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 6738 if (desc->write) { 6739 break; 6740 } 6741 } 6742 6743 if (desc == NULL || desc->media_events_buffer == NULL) { 6744 rc = -ENODEV; 6745 goto out; 6746 } 6747 6748 for (event_id = 0; event_id < num_events; ++event_id) { 6749 entry = TAILQ_FIRST(&desc->free_media_events); 6750 if (entry == NULL) { 6751 break; 6752 } 6753 6754 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 6755 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 6756 entry->event = events[event_id]; 
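		/* The copied event stays on pending_media_events until the consumer
		 * drains it with spdk_bdev_get_media_events(). */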
6757 } 6758 6759 rc = event_id; 6760 out: 6761 pthread_mutex_unlock(&bdev->internal.mutex); 6762 return rc; 6763 } 6764 6765 void 6766 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 6767 { 6768 struct spdk_bdev_desc *desc; 6769 6770 pthread_mutex_lock(&bdev->internal.mutex); 6771 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 6772 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 6773 desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, 6774 desc->callback.ctx); 6775 } 6776 } 6777 pthread_mutex_unlock(&bdev->internal.mutex); 6778 } 6779 6780 struct locked_lba_range_ctx { 6781 struct lba_range range; 6782 struct spdk_bdev *bdev; 6783 struct lba_range *current_range; 6784 struct lba_range *owner_range; 6785 struct spdk_poller *poller; 6786 lock_range_cb cb_fn; 6787 void *cb_arg; 6788 }; 6789 6790 static void 6791 bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status) 6792 { 6793 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6794 6795 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 6796 free(ctx); 6797 } 6798 6799 static void 6800 bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i); 6801 6802 static void 6803 bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 6804 { 6805 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6806 struct spdk_bdev *bdev = ctx->bdev; 6807 6808 if (status == -ENOMEM) { 6809 /* One of the channels could not allocate a range object. 6810 * So we have to go back and clean up any ranges that were 6811 * allocated successfully before we return error status to 6812 * the caller. We can reuse the unlock function to do that 6813 * clean up. 6814 */ 6815 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6816 bdev_unlock_lba_range_get_channel, ctx, 6817 bdev_lock_error_cleanup_cb); 6818 return; 6819 } 6820 6821 /* All channels have locked this range and no I/O overlapping the range 6822 * are outstanding! Set the owner_ch for the range object for the 6823 * locking channel, so that this channel will know that it is allowed 6824 * to write to this range. 6825 */ 6826 ctx->owner_range->owner_ch = ctx->range.owner_ch; 6827 ctx->cb_fn(ctx->cb_arg, status); 6828 6829 /* Don't free the ctx here. Its range is in the bdev's global list of 6830 * locked ranges still, and will be removed and freed when this range 6831 * is later unlocked. 6832 */ 6833 } 6834 6835 static int 6836 bdev_lock_lba_range_check_io(void *_i) 6837 { 6838 struct spdk_io_channel_iter *i = _i; 6839 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6840 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6841 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6842 struct lba_range *range = ctx->current_range; 6843 struct spdk_bdev_io *bdev_io; 6844 6845 spdk_poller_unregister(&ctx->poller); 6846 6847 /* The range is now in the locked_ranges, so no new IO can be submitted to this 6848 * range. But we need to wait until any outstanding IO overlapping with this range 6849 * are completed. 
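	 * If one is still found on io_submitted below, the poller re-arms itself
	 * with a 100 usec period and the check repeats until no overlapping I/O
	 * remains on this channel.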
6850 */ 6851 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 6852 if (bdev_io_range_is_locked(bdev_io, range)) { 6853 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 6854 return SPDK_POLLER_BUSY; 6855 } 6856 } 6857 6858 spdk_for_each_channel_continue(i, 0); 6859 return SPDK_POLLER_BUSY; 6860 } 6861 6862 static void 6863 bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i) 6864 { 6865 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6866 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6867 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6868 struct lba_range *range; 6869 6870 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 6871 if (range->length == ctx->range.length && 6872 range->offset == ctx->range.offset && 6873 range->locked_ctx == ctx->range.locked_ctx) { 6874 /* This range already exists on this channel, so don't add 6875 * it again. This can happen when a new channel is created 6876 * while the for_each_channel operation is in progress. 6877 * Do not check for outstanding I/O in that case, since the 6878 * range was locked before any I/O could be submitted to the 6879 * new channel. 6880 */ 6881 spdk_for_each_channel_continue(i, 0); 6882 return; 6883 } 6884 } 6885 6886 range = calloc(1, sizeof(*range)); 6887 if (range == NULL) { 6888 spdk_for_each_channel_continue(i, -ENOMEM); 6889 return; 6890 } 6891 6892 range->length = ctx->range.length; 6893 range->offset = ctx->range.offset; 6894 range->locked_ctx = ctx->range.locked_ctx; 6895 ctx->current_range = range; 6896 if (ctx->range.owner_ch == ch) { 6897 /* This is the range object for the channel that will hold 6898 * the lock. Store it in the ctx object so that we can easily 6899 * set its owner_ch after the lock is finally acquired. 6900 */ 6901 ctx->owner_range = range; 6902 } 6903 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 6904 bdev_lock_lba_range_check_io(i); 6905 } 6906 6907 static void 6908 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 6909 { 6910 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 6911 6912 /* We will add a copy of this range to each channel now. 
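	 * bdev_lock_lba_range_get_channel() allocates the per-channel copy and then
	 * waits, via bdev_lock_lba_range_check_io(), for any overlapping I/O already
	 * submitted on that channel to complete.
	 */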
*/ 6913 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx, 6914 bdev_lock_lba_range_cb); 6915 } 6916 6917 static bool 6918 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 6919 { 6920 struct lba_range *r; 6921 6922 TAILQ_FOREACH(r, tailq, tailq) { 6923 if (bdev_lba_range_overlapped(range, r)) { 6924 return true; 6925 } 6926 } 6927 return false; 6928 } 6929 6930 static int 6931 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 6932 uint64_t offset, uint64_t length, 6933 lock_range_cb cb_fn, void *cb_arg) 6934 { 6935 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6936 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6937 struct locked_lba_range_ctx *ctx; 6938 6939 if (cb_arg == NULL) { 6940 SPDK_ERRLOG("cb_arg must not be NULL\n"); 6941 return -EINVAL; 6942 } 6943 6944 ctx = calloc(1, sizeof(*ctx)); 6945 if (ctx == NULL) { 6946 return -ENOMEM; 6947 } 6948 6949 ctx->range.offset = offset; 6950 ctx->range.length = length; 6951 ctx->range.owner_ch = ch; 6952 ctx->range.locked_ctx = cb_arg; 6953 ctx->bdev = bdev; 6954 ctx->cb_fn = cb_fn; 6955 ctx->cb_arg = cb_arg; 6956 6957 pthread_mutex_lock(&bdev->internal.mutex); 6958 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 6959 /* There is an active lock overlapping with this range. 6960 * Put it on the pending list until this range no 6961 * longer overlaps with another. 6962 */ 6963 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 6964 } else { 6965 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 6966 bdev_lock_lba_range_ctx(bdev, ctx); 6967 } 6968 pthread_mutex_unlock(&bdev->internal.mutex); 6969 return 0; 6970 } 6971 6972 static void 6973 bdev_lock_lba_range_ctx_msg(void *_ctx) 6974 { 6975 struct locked_lba_range_ctx *ctx = _ctx; 6976 6977 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 6978 } 6979 6980 static void 6981 bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 6982 { 6983 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6984 struct locked_lba_range_ctx *pending_ctx; 6985 struct spdk_bdev_channel *ch = ctx->range.owner_ch; 6986 struct spdk_bdev *bdev = ch->bdev; 6987 struct lba_range *range, *tmp; 6988 6989 pthread_mutex_lock(&bdev->internal.mutex); 6990 /* Check if there are any pending locked ranges that overlap with this range 6991 * that was just unlocked. If there are, check that it doesn't overlap with any 6992 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 6993 * the lock process. 
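	 * The pending lock is restarted with spdk_thread_send_msg() so that
	 * bdev_lock_lba_range_ctx() runs on the thread that owns that range's
	 * channel, as that function asserts.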
6994 */ 6995 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 6996 if (bdev_lba_range_overlapped(range, &ctx->range) && 6997 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 6998 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 6999 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 7000 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 7001 spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), 7002 bdev_lock_lba_range_ctx_msg, pending_ctx); 7003 } 7004 } 7005 pthread_mutex_unlock(&bdev->internal.mutex); 7006 7007 ctx->cb_fn(ctx->cb_arg, status); 7008 free(ctx); 7009 } 7010 7011 static void 7012 bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i) 7013 { 7014 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7015 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7016 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7017 TAILQ_HEAD(, spdk_bdev_io) io_locked; 7018 struct spdk_bdev_io *bdev_io; 7019 struct lba_range *range; 7020 7021 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 7022 if (ctx->range.offset == range->offset && 7023 ctx->range.length == range->length && 7024 ctx->range.locked_ctx == range->locked_ctx) { 7025 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 7026 free(range); 7027 break; 7028 } 7029 } 7030 7031 /* Note: we should almost always be able to assert that the range specified 7032 * was found. But there are some very rare corner cases where a new channel 7033 * gets created simultaneously with a range unlock, where this function 7034 * would execute on that new channel and wouldn't have the range. 7035 * We also use this to clean up range allocations when a later allocation 7036 * fails in the locking path. 7037 * So we can't actually assert() here. 7038 */ 7039 7040 /* Swap the locked IO into a temporary list, and then try to submit them again. 7041 * We could hyper-optimize this to only resubmit locked I/O that overlap 7042 * with the range that was just unlocked, but this isn't a performance path so 7043 * we go for simplicity here. 7044 */ 7045 TAILQ_INIT(&io_locked); 7046 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 7047 while (!TAILQ_EMPTY(&io_locked)) { 7048 bdev_io = TAILQ_FIRST(&io_locked); 7049 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 7050 bdev_io_submit(bdev_io); 7051 } 7052 7053 spdk_for_each_channel_continue(i, 0); 7054 } 7055 7056 static int 7057 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 7058 uint64_t offset, uint64_t length, 7059 lock_range_cb cb_fn, void *cb_arg) 7060 { 7061 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7062 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7063 struct locked_lba_range_ctx *ctx; 7064 struct lba_range *range; 7065 bool range_found = false; 7066 7067 /* Let's make sure the specified channel actually has a lock on 7068 * the specified range. Note that the range must match exactly. 7069 */ 7070 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 7071 if (range->offset == offset && range->length == length && 7072 range->owner_ch == ch && range->locked_ctx == cb_arg) { 7073 range_found = true; 7074 break; 7075 } 7076 } 7077 7078 if (!range_found) { 7079 return -EINVAL; 7080 } 7081 7082 pthread_mutex_lock(&bdev->internal.mutex); 7083 /* We confirmed that this channel has locked the specified range. 
To
	 * start the unlock process, we find the range in the bdev's locked_ranges
	 * and remove it. This ensures new channels don't inherit the locked range.
	 * Then we will send a message to each channel (including the one specified
	 * here) to remove the range from its per-channel list.
	 */
	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->locked_ctx == cb_arg) {
			break;
		}
	}
	if (range == NULL) {
		assert(false);
		pthread_mutex_unlock(&bdev->internal.mutex);
		return -EINVAL;
	}
	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
	pthread_mutex_unlock(&bdev->internal.mutex);

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx,
			      bdev_unlock_lba_range_cb);
	return 0;
}

int
spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
			     int array_size)
{
	if (!bdev) {
		return -EINVAL;
	}

	if (bdev->fn_table->get_memory_domains) {
		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
	}

	return 0;
}

SPDK_LOG_REGISTER_COMPONENT(bdev)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_IO_START", TRACE_BDEV_IO_START,
			OWNER_BDEV, OBJECT_BDEV_IO, 1,
			{
				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }
			}
		},
		{
			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
			OWNER_BDEV, OBJECT_BDEV_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
	};

	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
}
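
/*
 * A minimal usage sketch of the descriptor API implemented above, kept as a
 * comment for reference only (it is not compiled). The bdev name "Malloc0" and
 * the my_event_cb/g_desc names are placeholders, and error handling is omitted.
 *
 *	static struct spdk_bdev_desc *g_desc;
 *
 *	static void
 *	my_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
 *	{
 *		if (type == SPDK_BDEV_EVENT_REMOVE) {
 *			// Closing the descriptor allows the unregister path to finish.
 *			spdk_bdev_close(g_desc);
 *		}
 *	}
 *
 *	if (spdk_bdev_open_ext("Malloc0", true, my_event_cb, NULL, &g_desc) == 0) {
 *		struct spdk_io_channel *ch = spdk_bdev_get_io_channel(g_desc);
 *
 *		// ... submit I/O on ch ...
 *
 *		spdk_put_io_channel(ch);
 *		// Close on the same thread that opened the descriptor (unless it was
 *		// already closed from the event callback above).
 *		spdk_bdev_close(g_desc);
 *	}
 */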