/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *   Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define NOMEM_THRESHOLD_COUNT			8
#define ZERO_BUFFER_SIZE			0x100000

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

#define SPDK_BDEV_POOL_ALIGNMENT 512

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
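 * Based on how the split paths further down use this constant: a request whose
 * size exceeds the per-child limit is carved into children, but only up to this
 * many child requests are kept in flight at once; the remainder is issued as
 * earlier children complete.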
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
	"rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
};

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	pthread_mutex_t mutex;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain	*domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.mutex = PTHREAD_MUTEX_INITIALIZER,
};

typedef void (*lock_range_cb)(void *ctx, int status);

struct lba_range {
	uint64_t			offset;
	uint64_t			length;
	void				*locked_ctx;
	struct spdk_bdev_channel	*owner_ch;
	TAILQ_ENTRY(lba_range)		tailq;
};

static struct spdk_bdev_opts	g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.small_buf_pool_size = BUF_SMALL_POOL_SIZE,
	.large_buf_pool_size = BUF_LARGE_POOL_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 *  For remaining bytes, allowed to run negative if an I/O is submitted when
	 *  some bytes are remaining, but the I/O is bigger than that amount. The
	 *  excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update the remaining quota for a submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one entry per limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued.
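	 * I/O is parked here when one of the enabled rate limits reports, via its
	 * queue_io() callback, that the current timeslice budget is exhausted; the
	 * QoS poller then drains this queue each new timeslice (see
	 * bdev_qos_io_submit() further down).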
	 */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;
	uint32_t	bdev_io_cache_size;

	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their IO awaiting retry here, which makes it possible to retry sending
 * IO to one bdev after IO from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t		nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t		nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel	*shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t		ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t		io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t		io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
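	 * Writes that overlap an entry in locked_ranges are held here by the submit
	 * path and are resubmitted once bdev_unlock_lba_range() releases the range.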
288 */ 289 bdev_io_tailq_t io_locked; 290 291 uint32_t flags; 292 293 struct spdk_histogram_data *histogram; 294 295 #ifdef SPDK_CONFIG_VTUNE 296 uint64_t start_tsc; 297 uint64_t interval_tsc; 298 __itt_string_handle *handle; 299 struct spdk_bdev_io_stat prev_stat; 300 #endif 301 302 bdev_io_tailq_t queued_resets; 303 304 lba_range_tailq_t locked_ranges; 305 }; 306 307 struct media_event_entry { 308 struct spdk_bdev_media_event event; 309 TAILQ_ENTRY(media_event_entry) tailq; 310 }; 311 312 #define MEDIA_EVENT_POOL_SIZE 64 313 314 struct spdk_bdev_desc { 315 struct spdk_bdev *bdev; 316 struct spdk_thread *thread; 317 struct { 318 spdk_bdev_event_cb_t event_fn; 319 void *ctx; 320 } callback; 321 bool closed; 322 bool write; 323 pthread_mutex_t mutex; 324 uint32_t refs; 325 TAILQ_HEAD(, media_event_entry) pending_media_events; 326 TAILQ_HEAD(, media_event_entry) free_media_events; 327 struct media_event_entry *media_events_buffer; 328 TAILQ_ENTRY(spdk_bdev_desc) link; 329 330 uint64_t timeout_in_sec; 331 spdk_bdev_io_timeout_cb cb_fn; 332 void *cb_arg; 333 struct spdk_poller *io_timeout_poller; 334 }; 335 336 struct spdk_bdev_iostat_ctx { 337 struct spdk_bdev_io_stat *stat; 338 spdk_bdev_get_device_stat_cb cb; 339 void *cb_arg; 340 }; 341 342 struct set_qos_limit_ctx { 343 void (*cb_fn)(void *cb_arg, int status); 344 void *cb_arg; 345 struct spdk_bdev *bdev; 346 }; 347 348 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 349 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 350 351 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 352 static void bdev_write_zero_buffer_next(void *_bdev_io); 353 354 static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i); 355 static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status); 356 357 static int 358 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 359 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 360 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 361 struct spdk_bdev_ext_io_opts *opts); 362 static int 363 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 364 struct iovec *iov, int iovcnt, void *md_buf, 365 uint64_t offset_blocks, uint64_t num_blocks, 366 spdk_bdev_io_completion_cb cb, void *cb_arg, 367 struct spdk_bdev_ext_io_opts *opts); 368 369 static int 370 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 371 uint64_t offset, uint64_t length, 372 lock_range_cb cb_fn, void *cb_arg); 373 374 static int 375 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 376 uint64_t offset, uint64_t length, 377 lock_range_cb cb_fn, void *cb_arg); 378 379 static inline void bdev_io_complete(void *ctx); 380 381 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 382 static bool bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort); 383 384 void 385 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 386 { 387 if (!opts) { 388 SPDK_ERRLOG("opts should not be NULL\n"); 389 return; 390 } 391 392 if (!opts_size) { 393 SPDK_ERRLOG("opts_size should not be zero value\n"); 394 return; 395 } 396 397 opts->opts_size = opts_size; 398 399 #define SET_FIELD(field) \ 400 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 401 opts->field = g_bdev_opts.field; \ 402 } \ 403 404 
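	/*
	 * Copy only the fields that lie entirely within the caller-provided opts_size,
	 * so callers compiled against an older, smaller struct spdk_bdev_opts still get
	 * every field they know about. A typical caller (illustrative sketch only, not
	 * part of this file) looks like:
	 *
	 *   struct spdk_bdev_opts opts = {};
	 *
	 *   spdk_bdev_get_opts(&opts, sizeof(opts));
	 *   opts.bdev_io_pool_size = 128 * 1024;
	 *   spdk_bdev_set_opts(&opts);
	 */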
SET_FIELD(bdev_io_pool_size); 405 SET_FIELD(bdev_io_cache_size); 406 SET_FIELD(bdev_auto_examine); 407 SET_FIELD(small_buf_pool_size); 408 SET_FIELD(large_buf_pool_size); 409 410 /* Do not remove this statement, you should always update this statement when you adding a new field, 411 * and do not forget to add the SET_FIELD statement for your added field. */ 412 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 413 414 #undef SET_FIELD 415 } 416 417 int 418 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 419 { 420 uint32_t min_pool_size; 421 422 if (!opts) { 423 SPDK_ERRLOG("opts cannot be NULL\n"); 424 return -1; 425 } 426 427 if (!opts->opts_size) { 428 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 429 return -1; 430 } 431 432 /* 433 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 434 * initialization. A second mgmt_ch will be created on the same thread when the application starts 435 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 436 */ 437 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 438 if (opts->bdev_io_pool_size < min_pool_size) { 439 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 440 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 441 spdk_thread_get_count()); 442 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 443 return -1; 444 } 445 446 if (opts->small_buf_pool_size < BUF_SMALL_POOL_SIZE) { 447 SPDK_ERRLOG("small_buf_pool_size must be at least %" PRIu32 "\n", BUF_SMALL_POOL_SIZE); 448 return -1; 449 } 450 451 if (opts->large_buf_pool_size < BUF_LARGE_POOL_SIZE) { 452 SPDK_ERRLOG("large_buf_pool_size must be at least %" PRIu32 "\n", BUF_LARGE_POOL_SIZE); 453 return -1; 454 } 455 456 #define SET_FIELD(field) \ 457 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 458 g_bdev_opts.field = opts->field; \ 459 } \ 460 461 SET_FIELD(bdev_io_pool_size); 462 SET_FIELD(bdev_io_cache_size); 463 SET_FIELD(bdev_auto_examine); 464 SET_FIELD(small_buf_pool_size); 465 SET_FIELD(large_buf_pool_size); 466 467 g_bdev_opts.opts_size = opts->opts_size; 468 469 #undef SET_FIELD 470 471 return 0; 472 } 473 474 static struct spdk_bdev * 475 bdev_get_by_name(const char *bdev_name) 476 { 477 struct spdk_bdev_name find; 478 struct spdk_bdev_name *res; 479 480 find.name = (char *)bdev_name; 481 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 482 if (res != NULL) { 483 return res->bdev; 484 } 485 486 return NULL; 487 } 488 489 struct spdk_bdev * 490 spdk_bdev_get_by_name(const char *bdev_name) 491 { 492 struct spdk_bdev *bdev; 493 494 pthread_mutex_lock(&g_bdev_mgr.mutex); 495 bdev = bdev_get_by_name(bdev_name); 496 pthread_mutex_unlock(&g_bdev_mgr.mutex); 497 498 return bdev; 499 } 500 501 struct spdk_bdev_wait_for_examine_ctx { 502 struct spdk_poller *poller; 503 spdk_bdev_wait_for_examine_cb cb_fn; 504 void *cb_arg; 505 }; 506 507 static bool 508 bdev_module_all_actions_completed(void); 509 510 static int 511 bdev_wait_for_examine_cb(void *arg) 512 { 513 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 514 515 if (!bdev_module_all_actions_completed()) { 516 return SPDK_POLLER_IDLE; 517 } 518 519 spdk_poller_unregister(&ctx->poller); 520 ctx->cb_fn(ctx->cb_arg); 521 free(ctx); 522 523 return SPDK_POLLER_BUSY; 524 } 525 526 int 527 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, 
void *cb_arg) 528 { 529 struct spdk_bdev_wait_for_examine_ctx *ctx; 530 531 ctx = calloc(1, sizeof(*ctx)); 532 if (ctx == NULL) { 533 return -ENOMEM; 534 } 535 ctx->cb_fn = cb_fn; 536 ctx->cb_arg = cb_arg; 537 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 538 539 return 0; 540 } 541 542 struct spdk_bdev_examine_item { 543 char *name; 544 TAILQ_ENTRY(spdk_bdev_examine_item) link; 545 }; 546 547 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 548 549 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 550 g_bdev_examine_allowlist); 551 552 static inline bool 553 bdev_examine_allowlist_check(const char *name) 554 { 555 struct spdk_bdev_examine_item *item; 556 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 557 if (strcmp(name, item->name) == 0) { 558 return true; 559 } 560 } 561 return false; 562 } 563 564 static inline void 565 bdev_examine_allowlist_free(void) 566 { 567 struct spdk_bdev_examine_item *item; 568 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 569 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 570 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 571 free(item->name); 572 free(item); 573 } 574 } 575 576 static inline bool 577 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 578 { 579 struct spdk_bdev_alias *tmp; 580 if (bdev_examine_allowlist_check(bdev->name)) { 581 return true; 582 } 583 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 584 if (bdev_examine_allowlist_check(tmp->alias.name)) { 585 return true; 586 } 587 } 588 return false; 589 } 590 591 static inline bool 592 bdev_ok_to_examine(struct spdk_bdev *bdev) 593 { 594 if (g_bdev_opts.bdev_auto_examine) { 595 return true; 596 } else { 597 return bdev_in_examine_allowlist(bdev); 598 } 599 } 600 601 static void 602 bdev_examine(struct spdk_bdev *bdev) 603 { 604 struct spdk_bdev_module *module; 605 uint32_t action; 606 607 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 608 if (module->examine_config && bdev_ok_to_examine(bdev)) { 609 action = module->internal.action_in_progress; 610 module->internal.action_in_progress++; 611 module->examine_config(bdev); 612 if (action != module->internal.action_in_progress) { 613 SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", 614 module->name); 615 } 616 } 617 } 618 619 if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) { 620 if (bdev->internal.claim_module->examine_disk) { 621 bdev->internal.claim_module->internal.action_in_progress++; 622 bdev->internal.claim_module->examine_disk(bdev); 623 } 624 return; 625 } 626 627 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 628 if (module->examine_disk && bdev_ok_to_examine(bdev)) { 629 module->internal.action_in_progress++; 630 module->examine_disk(bdev); 631 } 632 } 633 } 634 635 int 636 spdk_bdev_examine(const char *name) 637 { 638 struct spdk_bdev *bdev; 639 struct spdk_bdev_examine_item *item; 640 641 if (g_bdev_opts.bdev_auto_examine) { 642 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 643 return -EINVAL; 644 } 645 646 if (bdev_examine_allowlist_check(name)) { 647 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 648 return -EEXIST; 649 } 650 651 item = calloc(1, sizeof(*item)); 652 if (!item) { 653 return -ENOMEM; 654 } 655 item->name = strdup(name); 656 if (!item->name) { 657 free(item); 658 return -ENOMEM; 659 } 660 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 661 662 bdev = spdk_bdev_get_by_name(name); 663 
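	/*
	 * If the bdev already exists, examine it right away; otherwise the allowlist
	 * entry added above causes it to be examined when it is registered later.
	 */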
if (bdev) { 664 bdev_examine(bdev); 665 } 666 return 0; 667 } 668 669 static inline void 670 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 671 { 672 struct spdk_bdev_examine_item *item; 673 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 674 spdk_json_write_object_begin(w); 675 spdk_json_write_named_string(w, "method", "bdev_examine"); 676 spdk_json_write_named_object_begin(w, "params"); 677 spdk_json_write_named_string(w, "name", item->name); 678 spdk_json_write_object_end(w); 679 spdk_json_write_object_end(w); 680 } 681 } 682 683 struct spdk_bdev * 684 spdk_bdev_first(void) 685 { 686 struct spdk_bdev *bdev; 687 688 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 689 if (bdev) { 690 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 691 } 692 693 return bdev; 694 } 695 696 struct spdk_bdev * 697 spdk_bdev_next(struct spdk_bdev *prev) 698 { 699 struct spdk_bdev *bdev; 700 701 bdev = TAILQ_NEXT(prev, internal.link); 702 if (bdev) { 703 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 704 } 705 706 return bdev; 707 } 708 709 static struct spdk_bdev * 710 _bdev_next_leaf(struct spdk_bdev *bdev) 711 { 712 while (bdev != NULL) { 713 if (bdev->internal.claim_module == NULL) { 714 return bdev; 715 } else { 716 bdev = TAILQ_NEXT(bdev, internal.link); 717 } 718 } 719 720 return bdev; 721 } 722 723 struct spdk_bdev * 724 spdk_bdev_first_leaf(void) 725 { 726 struct spdk_bdev *bdev; 727 728 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 729 730 if (bdev) { 731 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 732 } 733 734 return bdev; 735 } 736 737 struct spdk_bdev * 738 spdk_bdev_next_leaf(struct spdk_bdev *prev) 739 { 740 struct spdk_bdev *bdev; 741 742 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 743 744 if (bdev) { 745 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 746 } 747 748 return bdev; 749 } 750 751 void 752 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 753 { 754 struct iovec *iovs; 755 756 if (bdev_io->u.bdev.iovs == NULL) { 757 bdev_io->u.bdev.iovs = &bdev_io->iov; 758 bdev_io->u.bdev.iovcnt = 1; 759 } 760 761 iovs = bdev_io->u.bdev.iovs; 762 763 assert(iovs != NULL); 764 assert(bdev_io->u.bdev.iovcnt >= 1); 765 766 iovs[0].iov_base = buf; 767 iovs[0].iov_len = len; 768 } 769 770 void 771 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 772 { 773 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 774 bdev_io->u.bdev.md_buf = md_buf; 775 } 776 777 static bool 778 _is_buf_allocated(const struct iovec *iovs) 779 { 780 if (iovs == NULL) { 781 return false; 782 } 783 784 return iovs[0].iov_base != NULL; 785 } 786 787 static bool 788 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 789 { 790 int i; 791 uintptr_t iov_base; 792 793 if (spdk_likely(alignment == 1)) { 794 return true; 795 } 796 797 for (i = 0; i < iovcnt; i++) { 798 iov_base = (uintptr_t)iovs[i].iov_base; 799 if ((iov_base & (alignment - 1)) != 0) { 800 return false; 801 } 802 } 803 804 return true; 805 } 806 807 static void 808 _copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt) 809 { 810 int i; 811 size_t len; 812 813 for (i = 0; i < iovcnt; i++) { 814 len = spdk_min(iovs[i].iov_len, buf_len); 815 memcpy(buf, iovs[i].iov_base, len); 816 buf += len; 817 buf_len -= len; 818 } 819 } 820 821 static void 822 _copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len) 
823 { 824 int i; 825 size_t len; 826 827 for (i = 0; i < iovcnt; i++) { 828 len = spdk_min(iovs[i].iov_len, buf_len); 829 memcpy(iovs[i].iov_base, buf, len); 830 buf += len; 831 buf_len -= len; 832 } 833 } 834 835 static void 836 _bdev_io_set_bounce_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 837 { 838 /* save original iovec */ 839 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 840 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 841 /* set bounce iov */ 842 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 843 bdev_io->u.bdev.iovcnt = 1; 844 /* set bounce buffer for this operation */ 845 bdev_io->u.bdev.iovs[0].iov_base = buf; 846 bdev_io->u.bdev.iovs[0].iov_len = len; 847 /* if this is write path, copy data from original buffer to bounce buffer */ 848 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 849 _copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 850 } 851 } 852 853 static void 854 _bdev_io_set_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 855 { 856 /* save original md_buf */ 857 bdev_io->internal.orig_md_buf = bdev_io->u.bdev.md_buf; 858 /* set bounce md_buf */ 859 bdev_io->u.bdev.md_buf = md_buf; 860 861 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 862 memcpy(md_buf, bdev_io->internal.orig_md_buf, len); 863 } 864 } 865 866 static void 867 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, void *buf, bool status) 868 { 869 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 870 871 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 872 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 873 bdev_io->internal.get_aux_buf_cb = NULL; 874 } else { 875 assert(bdev_io->internal.get_buf_cb != NULL); 876 bdev_io->internal.buf = buf; 877 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 878 bdev_io->internal.get_buf_cb = NULL; 879 } 880 } 881 882 static void 883 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 884 { 885 struct spdk_bdev *bdev = bdev_io->bdev; 886 bool buf_allocated; 887 uint64_t md_len, alignment; 888 void *aligned_buf; 889 890 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 891 bdev_io_get_buf_complete(bdev_io, buf, true); 892 return; 893 } 894 895 alignment = spdk_bdev_get_buf_align(bdev); 896 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 897 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 898 899 if (buf_allocated) { 900 _bdev_io_set_bounce_buf(bdev_io, aligned_buf, len); 901 } else { 902 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 903 } 904 905 if (spdk_bdev_is_md_separate(bdev)) { 906 aligned_buf = (char *)aligned_buf + len; 907 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 908 909 assert(((uintptr_t)aligned_buf & (alignment - 1)) == 0); 910 911 if (bdev_io->u.bdev.md_buf != NULL) { 912 _bdev_io_set_bounce_md_buf(bdev_io, aligned_buf, md_len); 913 } else { 914 spdk_bdev_io_set_md_buf(bdev_io, aligned_buf, md_len); 915 } 916 } 917 bdev_io_get_buf_complete(bdev_io, buf, true); 918 } 919 920 static void 921 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 922 { 923 struct spdk_bdev *bdev = bdev_io->bdev; 924 struct spdk_mempool *pool; 925 struct spdk_bdev_io *tmp; 926 bdev_io_stailq_t *stailq; 927 struct spdk_bdev_mgmt_channel *ch; 928 uint64_t md_len, alignment; 929 930 md_len = spdk_bdev_is_md_separate(bdev) ? 
bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 931 alignment = spdk_bdev_get_buf_align(bdev); 932 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 933 934 if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 935 SPDK_BDEV_POOL_ALIGNMENT) { 936 pool = g_bdev_mgr.buf_small_pool; 937 stailq = &ch->need_buf_small; 938 } else { 939 pool = g_bdev_mgr.buf_large_pool; 940 stailq = &ch->need_buf_large; 941 } 942 943 if (STAILQ_EMPTY(stailq)) { 944 spdk_mempool_put(pool, buf); 945 } else { 946 tmp = STAILQ_FIRST(stailq); 947 STAILQ_REMOVE_HEAD(stailq, internal.buf_link); 948 _bdev_io_set_buf(tmp, buf, tmp->internal.buf_len); 949 } 950 } 951 952 static void 953 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 954 { 955 assert(bdev_io->internal.buf != NULL); 956 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 957 bdev_io->internal.buf = NULL; 958 } 959 960 void 961 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 962 { 963 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 964 965 assert(buf != NULL); 966 _bdev_io_put_buf(bdev_io, buf, len); 967 } 968 969 static void 970 _bdev_io_unset_bounce_buf(struct spdk_bdev_io *bdev_io) 971 { 972 if (spdk_likely(bdev_io->internal.orig_iovcnt == 0)) { 973 assert(bdev_io->internal.orig_md_buf == NULL); 974 return; 975 } 976 977 /* if this is read path, copy data from bounce buffer to original buffer */ 978 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 979 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 980 _copy_buf_to_iovs(bdev_io->internal.orig_iovs, 981 bdev_io->internal.orig_iovcnt, 982 bdev_io->internal.bounce_iov.iov_base, 983 bdev_io->internal.bounce_iov.iov_len); 984 } 985 /* set original buffer for this io */ 986 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 987 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 988 /* disable bouncing buffer for this io */ 989 bdev_io->internal.orig_iovcnt = 0; 990 bdev_io->internal.orig_iovs = NULL; 991 992 /* do the same for metadata buffer */ 993 if (spdk_unlikely(bdev_io->internal.orig_md_buf != NULL)) { 994 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 995 996 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 997 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 998 memcpy(bdev_io->internal.orig_md_buf, bdev_io->u.bdev.md_buf, 999 bdev_io->u.bdev.num_blocks * spdk_bdev_get_md_size(bdev_io->bdev)); 1000 } 1001 1002 bdev_io->u.bdev.md_buf = bdev_io->internal.orig_md_buf; 1003 bdev_io->internal.orig_md_buf = NULL; 1004 } 1005 1006 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1007 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 1008 */ 1009 bdev_io_put_buf(bdev_io); 1010 } 1011 1012 static void 1013 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1014 { 1015 struct spdk_bdev *bdev = bdev_io->bdev; 1016 struct spdk_mempool *pool; 1017 bdev_io_stailq_t *stailq; 1018 struct spdk_bdev_mgmt_channel *mgmt_ch; 1019 uint64_t alignment, md_len; 1020 void *buf; 1021 1022 alignment = spdk_bdev_get_buf_align(bdev); 1023 md_len = spdk_bdev_is_md_separate(bdev) ? 
bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1024 1025 if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + 1026 SPDK_BDEV_POOL_ALIGNMENT) { 1027 SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n", 1028 len + alignment); 1029 bdev_io_get_buf_complete(bdev_io, NULL, false); 1030 return; 1031 } 1032 1033 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1034 1035 bdev_io->internal.buf_len = len; 1036 1037 if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 1038 SPDK_BDEV_POOL_ALIGNMENT) { 1039 pool = g_bdev_mgr.buf_small_pool; 1040 stailq = &mgmt_ch->need_buf_small; 1041 } else { 1042 pool = g_bdev_mgr.buf_large_pool; 1043 stailq = &mgmt_ch->need_buf_large; 1044 } 1045 1046 buf = spdk_mempool_get(pool); 1047 if (!buf) { 1048 STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link); 1049 } else { 1050 _bdev_io_set_buf(bdev_io, buf, len); 1051 } 1052 } 1053 1054 void 1055 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1056 { 1057 struct spdk_bdev *bdev = bdev_io->bdev; 1058 uint64_t alignment; 1059 1060 assert(cb != NULL); 1061 bdev_io->internal.get_buf_cb = cb; 1062 1063 alignment = spdk_bdev_get_buf_align(bdev); 1064 1065 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1066 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1067 /* Buffer already present and aligned */ 1068 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1069 return; 1070 } 1071 1072 bdev_io_get_buf(bdev_io, len); 1073 } 1074 1075 void 1076 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1077 { 1078 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1079 1080 assert(cb != NULL); 1081 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1082 bdev_io->internal.get_aux_buf_cb = cb; 1083 bdev_io_get_buf(bdev_io, len); 1084 } 1085 1086 static int 1087 bdev_module_get_max_ctx_size(void) 1088 { 1089 struct spdk_bdev_module *bdev_module; 1090 int max_bdev_module_size = 0; 1091 1092 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1093 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1094 max_bdev_module_size = bdev_module->get_ctx_size(); 1095 } 1096 } 1097 1098 return max_bdev_module_size; 1099 } 1100 1101 static void 1102 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1103 { 1104 int i; 1105 struct spdk_bdev_qos *qos = bdev->internal.qos; 1106 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1107 1108 if (!qos) { 1109 return; 1110 } 1111 1112 spdk_bdev_get_qos_rate_limits(bdev, limits); 1113 1114 spdk_json_write_object_begin(w); 1115 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1116 1117 spdk_json_write_named_object_begin(w, "params"); 1118 spdk_json_write_named_string(w, "name", bdev->name); 1119 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1120 if (limits[i] > 0) { 1121 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1122 } 1123 } 1124 spdk_json_write_object_end(w); 1125 1126 spdk_json_write_object_end(w); 1127 } 1128 1129 void 1130 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1131 { 1132 struct spdk_bdev_module *bdev_module; 1133 struct spdk_bdev *bdev; 1134 1135 assert(w != NULL); 1136 1137 spdk_json_write_array_begin(w); 1138 1139 spdk_json_write_object_begin(w); 1140 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1141 
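	/*
	 * Together with the writes below, this emits the bdev_set_options RPC, e.g.
	 * (default values shown, illustrative only):
	 *
	 *   { "method": "bdev_set_options",
	 *     "params": { "bdev_io_pool_size": 65535, "bdev_io_cache_size": 256,
	 *                 "bdev_auto_examine": true } }
	 */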
spdk_json_write_named_object_begin(w, "params"); 1142 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1143 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1144 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1145 spdk_json_write_object_end(w); 1146 spdk_json_write_object_end(w); 1147 1148 bdev_examine_allowlist_config_json(w); 1149 1150 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1151 if (bdev_module->config_json) { 1152 bdev_module->config_json(w); 1153 } 1154 } 1155 1156 pthread_mutex_lock(&g_bdev_mgr.mutex); 1157 1158 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1159 if (bdev->fn_table->write_config_json) { 1160 bdev->fn_table->write_config_json(bdev, w); 1161 } 1162 1163 bdev_qos_config_json(bdev, w); 1164 } 1165 1166 pthread_mutex_unlock(&g_bdev_mgr.mutex); 1167 1168 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1169 spdk_json_write_object_begin(w); 1170 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1171 spdk_json_write_object_end(w); 1172 1173 spdk_json_write_array_end(w); 1174 } 1175 1176 static int 1177 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1178 { 1179 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1180 struct spdk_bdev_io *bdev_io; 1181 uint32_t i; 1182 1183 STAILQ_INIT(&ch->need_buf_small); 1184 STAILQ_INIT(&ch->need_buf_large); 1185 1186 STAILQ_INIT(&ch->per_thread_cache); 1187 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1188 1189 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1190 ch->per_thread_cache_count = 0; 1191 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1192 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1193 assert(bdev_io != NULL); 1194 ch->per_thread_cache_count++; 1195 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1196 } 1197 1198 TAILQ_INIT(&ch->shared_resources); 1199 TAILQ_INIT(&ch->io_wait_queue); 1200 1201 return 0; 1202 } 1203 1204 static void 1205 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1206 { 1207 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1208 struct spdk_bdev_io *bdev_io; 1209 1210 if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) { 1211 SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n"); 1212 } 1213 1214 if (!TAILQ_EMPTY(&ch->shared_resources)) { 1215 SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n"); 1216 } 1217 1218 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1219 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1220 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1221 ch->per_thread_cache_count--; 1222 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1223 } 1224 1225 assert(ch->per_thread_cache_count == 0); 1226 } 1227 1228 static void 1229 bdev_init_complete(int rc) 1230 { 1231 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1232 void *cb_arg = g_init_cb_arg; 1233 struct spdk_bdev_module *m; 1234 1235 g_bdev_mgr.init_complete = true; 1236 g_init_cb_fn = NULL; 1237 g_init_cb_arg = NULL; 1238 1239 /* 1240 * For modules that need to know when subsystem init is complete, 1241 * inform them now. 
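	 * The init_complete callbacks run only if initialization succeeded; on failure
	 * the error code is simply handed to the callback registered with
	 * spdk_bdev_initialize().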
1242 */ 1243 if (rc == 0) { 1244 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1245 if (m->init_complete) { 1246 m->init_complete(); 1247 } 1248 } 1249 } 1250 1251 cb_fn(cb_arg, rc); 1252 } 1253 1254 static bool 1255 bdev_module_all_actions_completed(void) 1256 { 1257 struct spdk_bdev_module *m; 1258 1259 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1260 if (m->internal.action_in_progress > 0) { 1261 return false; 1262 } 1263 } 1264 return true; 1265 } 1266 1267 static void 1268 bdev_module_action_complete(void) 1269 { 1270 /* 1271 * Don't finish bdev subsystem initialization if 1272 * module pre-initialization is still in progress, or 1273 * the subsystem been already initialized. 1274 */ 1275 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 1276 return; 1277 } 1278 1279 /* 1280 * Check all bdev modules for inits/examinations in progress. If any 1281 * exist, return immediately since we cannot finish bdev subsystem 1282 * initialization until all are completed. 1283 */ 1284 if (!bdev_module_all_actions_completed()) { 1285 return; 1286 } 1287 1288 /* 1289 * Modules already finished initialization - now that all 1290 * the bdev modules have finished their asynchronous I/O 1291 * processing, the entire bdev layer can be marked as complete. 1292 */ 1293 bdev_init_complete(0); 1294 } 1295 1296 static void 1297 bdev_module_action_done(struct spdk_bdev_module *module) 1298 { 1299 assert(module->internal.action_in_progress > 0); 1300 module->internal.action_in_progress--; 1301 bdev_module_action_complete(); 1302 } 1303 1304 void 1305 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1306 { 1307 bdev_module_action_done(module); 1308 } 1309 1310 void 1311 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1312 { 1313 bdev_module_action_done(module); 1314 } 1315 1316 /** The last initialized bdev module */ 1317 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1318 1319 static void 1320 bdev_init_failed(void *cb_arg) 1321 { 1322 struct spdk_bdev_module *module = cb_arg; 1323 1324 module->internal.action_in_progress--; 1325 bdev_init_complete(-1); 1326 } 1327 1328 static int 1329 bdev_modules_init(void) 1330 { 1331 struct spdk_bdev_module *module; 1332 int rc = 0; 1333 1334 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1335 g_resume_bdev_module = module; 1336 if (module->async_init) { 1337 module->internal.action_in_progress = 1; 1338 } 1339 rc = module->module_init(); 1340 if (rc != 0) { 1341 /* Bump action_in_progress to prevent other modules from completion of modules_init 1342 * Send message to defer application shutdown until resources are cleaned up */ 1343 module->internal.action_in_progress = 1; 1344 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1345 return rc; 1346 } 1347 } 1348 1349 g_resume_bdev_module = NULL; 1350 return 0; 1351 } 1352 1353 void 1354 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1355 { 1356 int cache_size; 1357 int rc = 0; 1358 char mempool_name[32]; 1359 1360 assert(cb_fn != NULL); 1361 1362 g_init_cb_fn = cb_fn; 1363 g_init_cb_arg = cb_arg; 1364 1365 spdk_notify_type_register("bdev_register"); 1366 spdk_notify_type_register("bdev_unregister"); 1367 1368 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1369 1370 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1371 g_bdev_opts.bdev_io_pool_size, 1372 sizeof(struct spdk_bdev_io) + 1373 bdev_module_get_max_ctx_size(), 1374 0, 1375 SPDK_ENV_SOCKET_ID_ANY); 
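	/*
	 * Each pool element reserves room for the largest module-specific context
	 * (bdev_module_get_max_ctx_size()) behind the spdk_bdev_io itself, and
	 * spdk_bdev_set_opts() rejects pool sizes too small to back every
	 * per-thread cache.
	 */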
1376 1377 if (g_bdev_mgr.bdev_io_pool == NULL) { 1378 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 1379 bdev_init_complete(-1); 1380 return; 1381 } 1382 1383 /** 1384 * Ensure no more than half of the total buffers end up local caches, by 1385 * using spdk_env_get_core_count() to determine how many local caches we need 1386 * to account for. 1387 */ 1388 cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count()); 1389 snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid()); 1390 1391 g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name, 1392 g_bdev_opts.small_buf_pool_size, 1393 SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 1394 SPDK_BDEV_POOL_ALIGNMENT, 1395 cache_size, 1396 SPDK_ENV_SOCKET_ID_ANY); 1397 if (!g_bdev_mgr.buf_small_pool) { 1398 SPDK_ERRLOG("create rbuf small pool failed\n"); 1399 bdev_init_complete(-1); 1400 return; 1401 } 1402 1403 cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count()); 1404 snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid()); 1405 1406 g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name, 1407 g_bdev_opts.large_buf_pool_size, 1408 SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + 1409 SPDK_BDEV_POOL_ALIGNMENT, 1410 cache_size, 1411 SPDK_ENV_SOCKET_ID_ANY); 1412 if (!g_bdev_mgr.buf_large_pool) { 1413 SPDK_ERRLOG("create rbuf large pool failed\n"); 1414 bdev_init_complete(-1); 1415 return; 1416 } 1417 1418 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1419 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1420 if (!g_bdev_mgr.zero_buffer) { 1421 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1422 bdev_init_complete(-1); 1423 return; 1424 } 1425 1426 #ifdef SPDK_CONFIG_VTUNE 1427 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1428 #endif 1429 1430 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 1431 bdev_mgmt_channel_destroy, 1432 sizeof(struct spdk_bdev_mgmt_channel), 1433 "bdev_mgr"); 1434 1435 rc = bdev_modules_init(); 1436 g_bdev_mgr.module_init_complete = true; 1437 if (rc != 0) { 1438 SPDK_ERRLOG("bdev modules init failed\n"); 1439 return; 1440 } 1441 1442 bdev_module_action_complete(); 1443 } 1444 1445 static void 1446 bdev_mgr_unregister_cb(void *io_device) 1447 { 1448 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1449 1450 if (g_bdev_mgr.bdev_io_pool) { 1451 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1452 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1453 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1454 g_bdev_opts.bdev_io_pool_size); 1455 } 1456 1457 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1458 } 1459 1460 if (g_bdev_mgr.buf_small_pool) { 1461 if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != g_bdev_opts.small_buf_pool_size) { 1462 SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n", 1463 spdk_mempool_count(g_bdev_mgr.buf_small_pool), 1464 g_bdev_opts.small_buf_pool_size); 1465 assert(false); 1466 } 1467 1468 spdk_mempool_free(g_bdev_mgr.buf_small_pool); 1469 } 1470 1471 if (g_bdev_mgr.buf_large_pool) { 1472 if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != g_bdev_opts.large_buf_pool_size) { 1473 SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n", 1474 spdk_mempool_count(g_bdev_mgr.buf_large_pool), 1475 g_bdev_opts.large_buf_pool_size); 1476 assert(false); 1477 } 1478 1479 spdk_mempool_free(g_bdev_mgr.buf_large_pool); 1480 } 1481 1482 spdk_free(g_bdev_mgr.zero_buffer); 1483 1484 
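	/*
	 * All bdevs and channels are gone by the time this runs, so the remaining
	 * global state (examine allowlist, callback bookkeeping) can be torn down
	 * and the callback passed to spdk_bdev_finish() invoked.
	 */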
bdev_examine_allowlist_free(); 1485 1486 cb_fn(g_fini_cb_arg); 1487 g_fini_cb_fn = NULL; 1488 g_fini_cb_arg = NULL; 1489 g_bdev_mgr.init_complete = false; 1490 g_bdev_mgr.module_init_complete = false; 1491 } 1492 1493 static void 1494 bdev_module_fini_iter(void *arg) 1495 { 1496 struct spdk_bdev_module *bdev_module; 1497 1498 /* FIXME: Handling initialization failures is broken now, 1499 * so we won't even try cleaning up after successfully 1500 * initialized modules. if module_init_complete is false, 1501 * just call spdk_bdev_mgr_unregister_cb 1502 */ 1503 if (!g_bdev_mgr.module_init_complete) { 1504 bdev_mgr_unregister_cb(NULL); 1505 return; 1506 } 1507 1508 /* Start iterating from the last touched module */ 1509 if (!g_resume_bdev_module) { 1510 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1511 } else { 1512 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 1513 internal.tailq); 1514 } 1515 1516 while (bdev_module) { 1517 if (bdev_module->async_fini) { 1518 /* Save our place so we can resume later. We must 1519 * save the variable here, before calling module_fini() 1520 * below, because in some cases the module may immediately 1521 * call spdk_bdev_module_fini_done() and re-enter 1522 * this function to continue iterating. */ 1523 g_resume_bdev_module = bdev_module; 1524 } 1525 1526 if (bdev_module->module_fini) { 1527 bdev_module->module_fini(); 1528 } 1529 1530 if (bdev_module->async_fini) { 1531 return; 1532 } 1533 1534 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 1535 internal.tailq); 1536 } 1537 1538 g_resume_bdev_module = NULL; 1539 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 1540 } 1541 1542 void 1543 spdk_bdev_module_fini_done(void) 1544 { 1545 if (spdk_get_thread() != g_fini_thread) { 1546 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 1547 } else { 1548 bdev_module_fini_iter(NULL); 1549 } 1550 } 1551 1552 /* Deprecated */ 1553 void 1554 spdk_bdev_module_finish_done(void) 1555 { 1556 SPDK_NOTICELOG("spdk_bdev_module_finish_done() is deprecated, please use spdk_bdev_module_fini_done().\n"); 1557 spdk_bdev_module_fini_done(); 1558 } 1559 1560 static void 1561 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 1562 { 1563 struct spdk_bdev *bdev = cb_arg; 1564 1565 if (bdeverrno && bdev) { 1566 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 1567 bdev->name); 1568 1569 /* 1570 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 1571 * bdev; try to continue by manually removing this bdev from the list and continue 1572 * with the next bdev in the list. 1573 */ 1574 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 1575 } 1576 1577 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 1578 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 1579 /* 1580 * Bdev module finish need to be deferred as we might be in the middle of some context 1581 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 1582 * after returning. 1583 */ 1584 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 1585 return; 1586 } 1587 1588 /* 1589 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 1590 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 1591 * to detect clean shutdown as opposed to run-time hot removal of the underlying 1592 * base bdevs. 1593 * 1594 * Also, walk the list in the reverse order. 
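	 * (Virtual bdevs are registered after the base bdevs they claim, so walking
	 * the list backwards reaches them before their bases.)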
1595 */ 1596 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1597 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1598 if (bdev->internal.claim_module != NULL) { 1599 SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n", 1600 bdev->name, bdev->internal.claim_module->name); 1601 continue; 1602 } 1603 1604 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 1605 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1606 return; 1607 } 1608 1609 /* 1610 * If any bdev fails to unclaim underlying bdev properly, we may face the 1611 * case of bdev list consisting of claimed bdevs only (if claims are managed 1612 * correctly, this would mean there's a loop in the claims graph which is 1613 * clearly impossible). Warn and unregister last bdev on the list then. 1614 */ 1615 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1616 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1617 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1618 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1619 return; 1620 } 1621 } 1622 1623 static void 1624 bdev_module_fini_start_iter(void *arg) 1625 { 1626 struct spdk_bdev_module *bdev_module; 1627 1628 if (!g_resume_bdev_module) { 1629 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1630 } else { 1631 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 1632 } 1633 1634 while (bdev_module) { 1635 if (bdev_module->async_fini_start) { 1636 /* Save our place so we can resume later. We must 1637 * save the variable here, before calling fini_start() 1638 * below, because in some cases the module may immediately 1639 * call spdk_bdev_module_fini_start_done() and re-enter 1640 * this function to continue iterating. */ 1641 g_resume_bdev_module = bdev_module; 1642 } 1643 1644 if (bdev_module->fini_start) { 1645 bdev_module->fini_start(); 1646 } 1647 1648 if (bdev_module->async_fini_start) { 1649 return; 1650 } 1651 1652 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1653 } 1654 1655 g_resume_bdev_module = NULL; 1656 1657 bdev_finish_unregister_bdevs_iter(NULL, 0); 1658 } 1659 1660 void 1661 spdk_bdev_module_fini_start_done(void) 1662 { 1663 if (spdk_get_thread() != g_fini_thread) { 1664 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1665 } else { 1666 bdev_module_fini_start_iter(NULL); 1667 } 1668 } 1669 1670 void 1671 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1672 { 1673 assert(cb_fn != NULL); 1674 1675 g_fini_thread = spdk_get_thread(); 1676 1677 g_fini_cb_fn = cb_fn; 1678 g_fini_cb_arg = cb_arg; 1679 1680 bdev_module_fini_start_iter(NULL); 1681 } 1682 1683 struct spdk_bdev_io * 1684 bdev_channel_get_io(struct spdk_bdev_channel *channel) 1685 { 1686 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 1687 struct spdk_bdev_io *bdev_io; 1688 1689 if (ch->per_thread_cache_count > 0) { 1690 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1691 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1692 ch->per_thread_cache_count--; 1693 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 1694 /* 1695 * Don't try to look for bdev_ios in the global pool if there are 1696 * waiters on bdev_ios - we don't want this caller to jump the line. 
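		 * Returning NULL lets the caller queue itself with spdk_bdev_queue_io_wait();
		 * waiters are then serviced from spdk_bdev_free_io() as bdev_ios are returned
		 * to this per-thread cache.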
1697 */ 1698 bdev_io = NULL; 1699 } else { 1700 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1701 } 1702 1703 return bdev_io; 1704 } 1705 1706 void 1707 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 1708 { 1709 struct spdk_bdev_mgmt_channel *ch; 1710 1711 assert(bdev_io != NULL); 1712 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 1713 1714 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1715 1716 if (bdev_io->internal.buf != NULL) { 1717 bdev_io_put_buf(bdev_io); 1718 } 1719 1720 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 1721 ch->per_thread_cache_count++; 1722 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1723 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 1724 struct spdk_bdev_io_wait_entry *entry; 1725 1726 entry = TAILQ_FIRST(&ch->io_wait_queue); 1727 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 1728 entry->cb_fn(entry->cb_arg); 1729 } 1730 } else { 1731 /* We should never have a full cache with entries on the io wait queue. */ 1732 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 1733 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1734 } 1735 } 1736 1737 static bool 1738 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 1739 { 1740 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 1741 1742 switch (limit) { 1743 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1744 return true; 1745 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1746 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1747 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1748 return false; 1749 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 1750 default: 1751 return false; 1752 } 1753 } 1754 1755 static bool 1756 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 1757 { 1758 switch (bdev_io->type) { 1759 case SPDK_BDEV_IO_TYPE_NVME_IO: 1760 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1761 case SPDK_BDEV_IO_TYPE_READ: 1762 case SPDK_BDEV_IO_TYPE_WRITE: 1763 return true; 1764 case SPDK_BDEV_IO_TYPE_ZCOPY: 1765 if (bdev_io->u.bdev.zcopy.start) { 1766 return true; 1767 } else { 1768 return false; 1769 } 1770 default: 1771 return false; 1772 } 1773 } 1774 1775 static bool 1776 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 1777 { 1778 switch (bdev_io->type) { 1779 case SPDK_BDEV_IO_TYPE_NVME_IO: 1780 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1781 /* Bit 1 (0x2) set for read operation */ 1782 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 1783 return true; 1784 } else { 1785 return false; 1786 } 1787 case SPDK_BDEV_IO_TYPE_READ: 1788 return true; 1789 case SPDK_BDEV_IO_TYPE_ZCOPY: 1790 /* Populate to read from disk */ 1791 if (bdev_io->u.bdev.zcopy.populate) { 1792 return true; 1793 } else { 1794 return false; 1795 } 1796 default: 1797 return false; 1798 } 1799 } 1800 1801 static uint64_t 1802 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 1803 { 1804 struct spdk_bdev *bdev = bdev_io->bdev; 1805 1806 switch (bdev_io->type) { 1807 case SPDK_BDEV_IO_TYPE_NVME_IO: 1808 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1809 return bdev_io->u.nvme_passthru.nbytes; 1810 case SPDK_BDEV_IO_TYPE_READ: 1811 case SPDK_BDEV_IO_TYPE_WRITE: 1812 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 1813 case SPDK_BDEV_IO_TYPE_ZCOPY: 1814 /* Track the data in the start phase only */ 1815 if (bdev_io->u.bdev.zcopy.start) { 1816 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 1817 } else { 1818 return 0; 1819 } 1820 default: 1821 return 0; 1822 } 1823 } 1824 1825 static bool 1826 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 
1827 { 1828 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 1829 return true; 1830 } else { 1831 return false; 1832 } 1833 } 1834 1835 static bool 1836 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1837 { 1838 if (bdev_is_read_io(io) == false) { 1839 return false; 1840 } 1841 1842 return bdev_qos_rw_queue_io(limit, io); 1843 } 1844 1845 static bool 1846 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1847 { 1848 if (bdev_is_read_io(io) == true) { 1849 return false; 1850 } 1851 1852 return bdev_qos_rw_queue_io(limit, io); 1853 } 1854 1855 static void 1856 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1857 { 1858 limit->remaining_this_timeslice--; 1859 } 1860 1861 static void 1862 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1863 { 1864 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 1865 } 1866 1867 static void 1868 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1869 { 1870 if (bdev_is_read_io(io) == false) { 1871 return; 1872 } 1873 1874 return bdev_qos_rw_bps_update_quota(limit, io); 1875 } 1876 1877 static void 1878 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1879 { 1880 if (bdev_is_read_io(io) == true) { 1881 return; 1882 } 1883 1884 return bdev_qos_rw_bps_update_quota(limit, io); 1885 } 1886 1887 static void 1888 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 1889 { 1890 int i; 1891 1892 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1893 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 1894 qos->rate_limits[i].queue_io = NULL; 1895 qos->rate_limits[i].update_quota = NULL; 1896 continue; 1897 } 1898 1899 switch (i) { 1900 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1901 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 1902 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 1903 break; 1904 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1905 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 1906 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 1907 break; 1908 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1909 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 1910 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 1911 break; 1912 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1913 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 1914 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 1915 break; 1916 default: 1917 break; 1918 } 1919 } 1920 } 1921 1922 static void 1923 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 1924 struct spdk_bdev_io *bdev_io, 1925 enum spdk_bdev_io_status status) 1926 { 1927 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1928 1929 bdev_io->internal.in_submit_request = true; 1930 bdev_ch->io_outstanding++; 1931 shared_resource->io_outstanding++; 1932 spdk_bdev_io_complete(bdev_io, status); 1933 bdev_io->internal.in_submit_request = false; 1934 } 1935 1936 static inline void 1937 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 1938 { 1939 struct spdk_bdev *bdev = bdev_io->bdev; 1940 struct spdk_io_channel *ch = bdev_ch->channel; 1941 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1942 1943 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 1944 struct spdk_bdev_mgmt_channel *mgmt_channel = 
shared_resource->mgmt_ch; 1945 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 1946 1947 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 1948 bdev_abort_buf_io(&mgmt_channel->need_buf_small, bio_to_abort) || 1949 bdev_abort_buf_io(&mgmt_channel->need_buf_large, bio_to_abort)) { 1950 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 1951 SPDK_BDEV_IO_STATUS_SUCCESS); 1952 return; 1953 } 1954 } 1955 1956 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 1957 bdev_ch->io_outstanding++; 1958 shared_resource->io_outstanding++; 1959 bdev_io->internal.in_submit_request = true; 1960 bdev->fn_table->submit_request(ch, bdev_io); 1961 bdev_io->internal.in_submit_request = false; 1962 } else { 1963 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 1964 } 1965 } 1966 1967 static int 1968 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 1969 { 1970 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 1971 int i, submitted_ios = 0; 1972 1973 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 1974 if (bdev_qos_io_to_limit(bdev_io) == true) { 1975 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1976 if (!qos->rate_limits[i].queue_io) { 1977 continue; 1978 } 1979 1980 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 1981 bdev_io) == true) { 1982 return submitted_ios; 1983 } 1984 } 1985 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1986 if (!qos->rate_limits[i].update_quota) { 1987 continue; 1988 } 1989 1990 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 1991 } 1992 } 1993 1994 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 1995 bdev_io_do_submit(ch, bdev_io); 1996 submitted_ios++; 1997 } 1998 1999 return submitted_ios; 2000 } 2001 2002 static void 2003 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2004 { 2005 int rc; 2006 2007 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2008 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2009 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2010 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2011 &bdev_io->internal.waitq_entry); 2012 if (rc != 0) { 2013 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2014 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2015 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2016 } 2017 } 2018 2019 static bool 2020 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2021 { 2022 uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary; 2023 uint32_t max_size = bdev_io->bdev->max_segment_size; 2024 int max_segs = bdev_io->bdev->max_num_segments; 2025 2026 io_boundary = bdev_io->bdev->split_on_optimal_io_boundary ? io_boundary : 0; 2027 2028 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2029 return false; 2030 } 2031 2032 if (io_boundary) { 2033 uint64_t start_stripe, end_stripe; 2034 2035 start_stripe = bdev_io->u.bdev.offset_blocks; 2036 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2037 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
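* For example (illustrative numbers): with an optimal_io_boundary of 8 blocks, spdk_u32log2(8) is 3, so an I/O covering blocks 6..9 yields start_stripe = 6 >> 3 = 0 and end_stripe = 9 >> 3 = 1; the stripes differ, so the request crosses a boundary and will be split.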
*/ 2038 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2039 start_stripe >>= spdk_u32log2(io_boundary); 2040 end_stripe >>= spdk_u32log2(io_boundary); 2041 } else { 2042 start_stripe /= io_boundary; 2043 end_stripe /= io_boundary; 2044 } 2045 2046 if (start_stripe != end_stripe) { 2047 return true; 2048 } 2049 } 2050 2051 if (max_segs) { 2052 if (bdev_io->u.bdev.iovcnt > max_segs) { 2053 return true; 2054 } 2055 } 2056 2057 if (max_size) { 2058 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2059 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2060 return true; 2061 } 2062 } 2063 } 2064 2065 return false; 2066 } 2067 2068 static bool 2069 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2070 { 2071 uint32_t num_unmap_segments; 2072 2073 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2074 return false; 2075 } 2076 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2077 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2078 return true; 2079 } 2080 2081 return false; 2082 } 2083 2084 static bool 2085 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2086 { 2087 if (!bdev_io->bdev->max_write_zeroes) { 2088 return false; 2089 } 2090 2091 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2092 return true; 2093 } 2094 2095 return false; 2096 } 2097 2098 static bool 2099 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2100 { 2101 switch (bdev_io->type) { 2102 case SPDK_BDEV_IO_TYPE_READ: 2103 case SPDK_BDEV_IO_TYPE_WRITE: 2104 return bdev_rw_should_split(bdev_io); 2105 case SPDK_BDEV_IO_TYPE_UNMAP: 2106 return bdev_unmap_should_split(bdev_io); 2107 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2108 return bdev_write_zeroes_should_split(bdev_io); 2109 default: 2110 return false; 2111 } 2112 } 2113 2114 static uint32_t 2115 _to_next_boundary(uint64_t offset, uint32_t boundary) 2116 { 2117 return (boundary - (offset % boundary)); 2118 } 2119 2120 static void 2121 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2122 2123 static void 2124 _bdev_rw_split(void *_bdev_io); 2125 2126 static void 2127 bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2128 2129 static void 2130 _bdev_unmap_split(void *_bdev_io) 2131 { 2132 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2133 } 2134 2135 static void 2136 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2137 2138 static void 2139 _bdev_write_zeroes_split(void *_bdev_io) 2140 { 2141 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2142 } 2143 2144 static int 2145 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2146 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2147 { 2148 int rc; 2149 uint64_t current_offset, current_remaining; 2150 spdk_bdev_io_wait_cb io_wait_fn; 2151 2152 current_offset = *offset; 2153 current_remaining = *remaining; 2154 2155 bdev_io->u.bdev.split_outstanding++; 2156 2157 io_wait_fn = _bdev_rw_split; 2158 switch (bdev_io->type) { 2159 case SPDK_BDEV_IO_TYPE_READ: 2160 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2161 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2162 iov, iovcnt, md_buf, current_offset, 2163 num_blocks, 2164 bdev_io_split_done, bdev_io, 2165 bdev_io->internal.ext_opts); 2166 break; 2167 case SPDK_BDEV_IO_TYPE_WRITE: 2168 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2169 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2170 iov, iovcnt, md_buf, current_offset, 2171 
num_blocks, 2172 bdev_io_split_done, bdev_io, 2173 bdev_io->internal.ext_opts); 2174 break; 2175 case SPDK_BDEV_IO_TYPE_UNMAP: 2176 io_wait_fn = _bdev_unmap_split; 2177 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2178 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2179 current_offset, num_blocks, 2180 bdev_io_split_done, bdev_io); 2181 break; 2182 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2183 io_wait_fn = _bdev_write_zeroes_split; 2184 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2185 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2186 current_offset, num_blocks, 2187 bdev_io_split_done, bdev_io); 2188 break; 2189 default: 2190 assert(false); 2191 rc = -EINVAL; 2192 break; 2193 } 2194 2195 if (rc == 0) { 2196 current_offset += num_blocks; 2197 current_remaining -= num_blocks; 2198 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2199 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2200 *offset = current_offset; 2201 *remaining = current_remaining; 2202 } else { 2203 bdev_io->u.bdev.split_outstanding--; 2204 if (rc == -ENOMEM) { 2205 if (bdev_io->u.bdev.split_outstanding == 0) { 2206 /* No I/O is outstanding. Hence we should wait here. */ 2207 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2208 } 2209 } else { 2210 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2211 if (bdev_io->u.bdev.split_outstanding == 0) { 2212 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2213 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2214 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2215 } 2216 } 2217 } 2218 2219 return rc; 2220 } 2221 2222 static void 2223 _bdev_rw_split(void *_bdev_io) 2224 { 2225 struct iovec *parent_iov, *iov; 2226 struct spdk_bdev_io *bdev_io = _bdev_io; 2227 struct spdk_bdev *bdev = bdev_io->bdev; 2228 uint64_t parent_offset, current_offset, remaining; 2229 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2230 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2231 uint32_t iovcnt, iov_len, child_iovsize; 2232 uint32_t blocklen = bdev->blocklen; 2233 uint32_t io_boundary = bdev->optimal_io_boundary; 2234 uint32_t max_segment_size = bdev->max_segment_size; 2235 uint32_t max_child_iovcnt = bdev->max_num_segments; 2236 void *md_buf = NULL; 2237 int rc; 2238 2239 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2240 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, BDEV_IO_NUM_CHILD_IOV) : 2241 BDEV_IO_NUM_CHILD_IOV; 2242 io_boundary = bdev->split_on_optimal_io_boundary ? 
io_boundary : UINT32_MAX; 2243 2244 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2245 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2246 parent_offset = bdev_io->u.bdev.offset_blocks; 2247 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2248 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2249 2250 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2251 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2252 if (parent_iov_offset < parent_iov->iov_len) { 2253 break; 2254 } 2255 parent_iov_offset -= parent_iov->iov_len; 2256 } 2257 2258 child_iovcnt = 0; 2259 while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 2260 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2261 to_next_boundary = spdk_min(remaining, to_next_boundary); 2262 to_next_boundary_bytes = to_next_boundary * blocklen; 2263 2264 iov = &bdev_io->child_iov[child_iovcnt]; 2265 iovcnt = 0; 2266 2267 if (bdev_io->u.bdev.md_buf) { 2268 md_buf = (char *)bdev_io->u.bdev.md_buf + 2269 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2270 } 2271 2272 child_iovsize = spdk_min(BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2273 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2274 iovcnt < child_iovsize) { 2275 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2276 iov_len = parent_iov->iov_len - parent_iov_offset; 2277 2278 iov_len = spdk_min(iov_len, max_segment_size); 2279 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2280 to_next_boundary_bytes -= iov_len; 2281 2282 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2283 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2284 2285 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2286 parent_iov_offset += iov_len; 2287 } else { 2288 parent_iovpos++; 2289 parent_iov_offset = 0; 2290 } 2291 child_iovcnt++; 2292 iovcnt++; 2293 } 2294 2295 if (to_next_boundary_bytes > 0) { 2296 /* We had to stop this child I/O early because we ran out of 2297 * child_iov space or were limited by max_num_segments. 2298 * Ensure the iovs to be aligned with block size and 2299 * then adjust to_next_boundary before starting the 2300 * child I/O. 2301 */ 2302 assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV || 2303 iovcnt == child_iovsize); 2304 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2305 if (to_last_block_bytes != 0) { 2306 uint32_t child_iovpos = child_iovcnt - 1; 2307 /* don't decrease child_iovcnt when it equals to BDEV_IO_NUM_CHILD_IOV 2308 * so the loop will naturally end 2309 */ 2310 2311 to_last_block_bytes = blocklen - to_last_block_bytes; 2312 to_next_boundary_bytes += to_last_block_bytes; 2313 while (to_last_block_bytes > 0 && iovcnt > 0) { 2314 iov_len = spdk_min(to_last_block_bytes, 2315 bdev_io->child_iov[child_iovpos].iov_len); 2316 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2317 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2318 child_iovpos--; 2319 if (--iovcnt == 0) { 2320 /* If the child IO is less than a block size just return. 2321 * If the first child IO of any split round is less than 2322 * a block size, an error exit. 
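* (The error path below completes the parent I/O as failed only when no other child I/O is outstanding; otherwise it simply returns and the already-submitted children drive further progress through bdev_io_split_done().)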
2323 */ 2324 if (bdev_io->u.bdev.split_outstanding == 0) { 2325 SPDK_ERRLOG("The first child io was less than a block size\n"); 2326 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2327 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2328 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2329 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2330 } 2331 2332 return; 2333 } 2334 } 2335
2336 to_last_block_bytes -= iov_len; 2337 2338 if (parent_iov_offset == 0) { 2339 parent_iovpos--; 2340 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2341 } 2342 parent_iov_offset -= iov_len; 2343 } 2344 2345 assert(to_last_block_bytes == 0); 2346 } 2347 to_next_boundary -= to_next_boundary_bytes / blocklen; 2348 } 2349
2350 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 2351 &current_offset, &remaining); 2352 if (spdk_unlikely(rc)) { 2353 return; 2354 } 2355 } 2356 } 2357
2358 static void 2359 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 2360 { 2361 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 2362 uint32_t num_children_reqs = 0; 2363 int rc; 2364 2365 offset = bdev_io->u.bdev.split_current_offset_blocks; 2366 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2367 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 2368 2369 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2370 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 2371 2372 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 2373 &offset, &remaining); 2374 if (spdk_likely(rc == 0)) { 2375 num_children_reqs++; 2376 } else { 2377 return; 2378 } 2379 } 2380 } 2381
2382 static void 2383 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 2384 { 2385 uint64_t offset, write_zeroes_blocks, remaining; 2386 uint32_t num_children_reqs = 0; 2387 int rc; 2388 2389 offset = bdev_io->u.bdev.split_current_offset_blocks; 2390 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2391 2392 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2393 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 2394 2395 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 2396 &offset, &remaining); 2397 if (spdk_likely(rc == 0)) { 2398 num_children_reqs++; 2399 } else { 2400 return; 2401 } 2402 } 2403 } 2404
2405 static void 2406 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2407 { 2408 struct spdk_bdev_io *parent_io = cb_arg; 2409 2410 spdk_bdev_free_io(bdev_io); 2411 2412 if (!success) { 2413 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2414 /* If any child I/O failed, stop the splitting process. */ 2415 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2416 parent_io->u.bdev.split_remaining_num_blocks = 0; 2417 } 2418 parent_io->u.bdev.split_outstanding--; 2419 if (parent_io->u.bdev.split_outstanding != 0) { 2420 return; 2421 } 2422 2423 /* 2424 * Parent I/O finishes when all blocks are consumed.
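* Note that a failed child forces split_remaining_num_blocks to zero above, so an errored split also reaches this completion path once all of its outstanding children have returned.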
2425 */ 2426 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2427 assert(parent_io->internal.cb != bdev_io_split_done); 2428 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2429 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2430 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2431 parent_io->internal.caller_ctx); 2432 return; 2433 } 2434 2435 /* 2436 * Continue with the splitting process. This function will complete the parent I/O if the 2437 * splitting is done. 2438 */ 2439 switch (parent_io->type) { 2440 case SPDK_BDEV_IO_TYPE_READ: 2441 case SPDK_BDEV_IO_TYPE_WRITE: 2442 _bdev_rw_split(parent_io); 2443 break; 2444 case SPDK_BDEV_IO_TYPE_UNMAP: 2445 bdev_unmap_split(parent_io); 2446 break; 2447 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2448 bdev_write_zeroes_split(parent_io); 2449 break; 2450 default: 2451 assert(false); 2452 break; 2453 } 2454 } 2455 2456 static void 2457 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success); 2458 2459 static void 2460 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2461 { 2462 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2463 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2464 bdev_io->u.bdev.split_outstanding = 0; 2465 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2466 2467 switch (bdev_io->type) { 2468 case SPDK_BDEV_IO_TYPE_READ: 2469 case SPDK_BDEV_IO_TYPE_WRITE: 2470 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2471 _bdev_rw_split(bdev_io); 2472 } else { 2473 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2474 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2475 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2476 } 2477 break; 2478 case SPDK_BDEV_IO_TYPE_UNMAP: 2479 bdev_unmap_split(bdev_io); 2480 break; 2481 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2482 bdev_write_zeroes_split(bdev_io); 2483 break; 2484 default: 2485 assert(false); 2486 break; 2487 } 2488 } 2489 2490 static void 2491 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2492 { 2493 if (!success) { 2494 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2495 return; 2496 } 2497 2498 _bdev_rw_split(bdev_io); 2499 } 2500 2501 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 2502 * be inlined, at least on some compilers. 
2503 */ 2504 static inline void 2505 _bdev_io_submit(void *ctx) 2506 { 2507 struct spdk_bdev_io *bdev_io = ctx; 2508 struct spdk_bdev *bdev = bdev_io->bdev; 2509 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2510 uint64_t tsc; 2511 2512 tsc = spdk_get_ticks(); 2513 bdev_io->internal.submit_tsc = tsc; 2514 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, bdev_io->type, 2515 bdev_io->internal.caller_ctx); 2516 2517 if (spdk_likely(bdev_ch->flags == 0)) { 2518 bdev_io_do_submit(bdev_ch, bdev_io); 2519 return; 2520 } 2521 2522 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2523 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2524 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2525 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2526 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2527 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2528 } else { 2529 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2530 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2531 } 2532 } else { 2533 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2534 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2535 } 2536 } 2537 2538 bool 2539 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2540 2541 bool 2542 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2543 { 2544 if (range1->length == 0 || range2->length == 0) { 2545 return false; 2546 } 2547 2548 if (range1->offset + range1->length <= range2->offset) { 2549 return false; 2550 } 2551 2552 if (range2->offset + range2->length <= range1->offset) { 2553 return false; 2554 } 2555 2556 return true; 2557 } 2558 2559 static bool 2560 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2561 { 2562 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2563 struct lba_range r; 2564 2565 switch (bdev_io->type) { 2566 case SPDK_BDEV_IO_TYPE_NVME_IO: 2567 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2568 /* Don't try to decode the NVMe command - just assume worst-case and that 2569 * it overlaps a locked range. 2570 */ 2571 return true; 2572 case SPDK_BDEV_IO_TYPE_WRITE: 2573 case SPDK_BDEV_IO_TYPE_UNMAP: 2574 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2575 case SPDK_BDEV_IO_TYPE_ZCOPY: 2576 r.offset = bdev_io->u.bdev.offset_blocks; 2577 r.length = bdev_io->u.bdev.num_blocks; 2578 if (!bdev_lba_range_overlapped(range, &r)) { 2579 /* This I/O doesn't overlap the specified LBA range. */ 2580 return false; 2581 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2582 /* This I/O overlaps, but the I/O is on the same channel that locked this 2583 * range, and the caller_ctx is the same as the locked_ctx. This means 2584 * that this I/O is associated with the lock, and is allowed to execute. 
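* For example (illustrative values): if a caller locked blocks [100, 200) on this channel, a write to blocks 150-159 submitted on the same channel with the same caller_ctx proceeds normally, while the same write arriving on another channel is parked on io_locked until the range is unlocked.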
2585 */ 2586 return false; 2587 } else { 2588 return true; 2589 } 2590 default: 2591 return false; 2592 } 2593 } 2594 2595 void 2596 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2597 { 2598 struct spdk_bdev *bdev = bdev_io->bdev; 2599 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2600 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2601 2602 assert(thread != NULL); 2603 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2604 2605 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2606 struct lba_range *range; 2607 2608 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2609 if (bdev_io_range_is_locked(bdev_io, range)) { 2610 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2611 return; 2612 } 2613 } 2614 } 2615 2616 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2617 2618 if (bdev_io_should_split(bdev_io)) { 2619 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2620 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2621 (uintptr_t)bdev_io, bdev_io->type, bdev_io->internal.caller_ctx); 2622 bdev_io_split(NULL, bdev_io); 2623 return; 2624 } 2625 2626 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2627 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2628 _bdev_io_submit(bdev_io); 2629 } else { 2630 bdev_io->internal.io_submit_ch = ch; 2631 bdev_io->internal.ch = bdev->internal.qos->ch; 2632 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2633 } 2634 } else { 2635 _bdev_io_submit(bdev_io); 2636 } 2637 } 2638 2639 static void 2640 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 2641 { 2642 struct spdk_bdev *bdev = bdev_io->bdev; 2643 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2644 struct spdk_io_channel *ch = bdev_ch->channel; 2645 2646 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2647 2648 bdev_io->internal.in_submit_request = true; 2649 bdev->fn_table->submit_request(ch, bdev_io); 2650 bdev_io->internal.in_submit_request = false; 2651 } 2652 2653 void 2654 bdev_io_init(struct spdk_bdev_io *bdev_io, 2655 struct spdk_bdev *bdev, void *cb_arg, 2656 spdk_bdev_io_completion_cb cb) 2657 { 2658 bdev_io->bdev = bdev; 2659 bdev_io->internal.caller_ctx = cb_arg; 2660 bdev_io->internal.cb = cb; 2661 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 2662 bdev_io->internal.in_submit_request = false; 2663 bdev_io->internal.buf = NULL; 2664 bdev_io->internal.io_submit_ch = NULL; 2665 bdev_io->internal.orig_iovs = NULL; 2666 bdev_io->internal.orig_iovcnt = 0; 2667 bdev_io->internal.orig_md_buf = NULL; 2668 bdev_io->internal.error.nvme.cdw0 = 0; 2669 bdev_io->num_retries = 0; 2670 bdev_io->internal.get_buf_cb = NULL; 2671 bdev_io->internal.get_aux_buf_cb = NULL; 2672 bdev_io->internal.ext_opts = NULL; 2673 } 2674 2675 static bool 2676 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2677 { 2678 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 2679 } 2680 2681 bool 2682 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2683 { 2684 bool supported; 2685 2686 supported = bdev_io_type_supported(bdev, io_type); 2687 2688 if (!supported) { 2689 switch (io_type) { 2690 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2691 /* The bdev layer will emulate write zeroes as long as write is supported. 
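* (When the module reports no native support, the generic bdev code is expected to service such requests itself, e.g. by issuing regular writes from an internal zero buffer, so callers may treat WRITE_ZEROES as usable whenever WRITE is.)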
*/ 2692 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 2693 break; 2694 default: 2695 break; 2696 } 2697 } 2698 2699 return supported; 2700 } 2701 2702 int 2703 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2704 { 2705 if (bdev->fn_table->dump_info_json) { 2706 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 2707 } 2708 2709 return 0; 2710 } 2711 2712 static void 2713 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 2714 { 2715 uint32_t max_per_timeslice = 0; 2716 int i; 2717 2718 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2719 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2720 qos->rate_limits[i].max_per_timeslice = 0; 2721 continue; 2722 } 2723 2724 max_per_timeslice = qos->rate_limits[i].limit * 2725 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 2726 2727 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 2728 qos->rate_limits[i].min_per_timeslice); 2729 2730 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 2731 } 2732 2733 bdev_qos_set_ops(qos); 2734 } 2735 2736 static int 2737 bdev_channel_poll_qos(void *arg) 2738 { 2739 struct spdk_bdev_qos *qos = arg; 2740 uint64_t now = spdk_get_ticks(); 2741 int i; 2742 2743 if (now < (qos->last_timeslice + qos->timeslice_size)) { 2744 /* We received our callback earlier than expected - return 2745 * immediately and wait to do accounting until at least one 2746 * timeslice has actually expired. This should never happen 2747 * with a well-behaved timer implementation. 2748 */ 2749 return SPDK_POLLER_IDLE; 2750 } 2751 2752 /* Reset for next round of rate limiting */ 2753 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2754 /* We may have allowed the IOs or bytes to slightly overrun in the last 2755 * timeslice. remaining_this_timeslice is signed, so if it's negative 2756 * here, we'll account for the overrun so that the next timeslice will 2757 * be appropriately reduced. 2758 */ 2759 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 2760 qos->rate_limits[i].remaining_this_timeslice = 0; 2761 } 2762 } 2763 2764 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 2765 qos->last_timeslice += qos->timeslice_size; 2766 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2767 qos->rate_limits[i].remaining_this_timeslice += 2768 qos->rate_limits[i].max_per_timeslice; 2769 } 2770 } 2771 2772 return bdev_qos_io_submit(qos->ch, qos); 2773 } 2774 2775 static void 2776 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 2777 { 2778 struct spdk_bdev_shared_resource *shared_resource; 2779 struct lba_range *range; 2780 2781 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 2782 range = TAILQ_FIRST(&ch->locked_ranges); 2783 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 2784 free(range); 2785 } 2786 2787 spdk_put_io_channel(ch->channel); 2788 2789 shared_resource = ch->shared_resource; 2790 2791 assert(TAILQ_EMPTY(&ch->io_locked)); 2792 assert(TAILQ_EMPTY(&ch->io_submitted)); 2793 assert(ch->io_outstanding == 0); 2794 assert(shared_resource->ref > 0); 2795 shared_resource->ref--; 2796 if (shared_resource->ref == 0) { 2797 assert(shared_resource->io_outstanding == 0); 2798 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 2799 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 2800 free(shared_resource); 2801 } 2802 } 2803 2804 /* Caller must hold bdev->internal.mutex. 
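* (bdev_enable_qos() below seeds the per-timeslice quotas; as an illustration of that math with a hypothetical limit: a rw_ios_per_sec limit of 10000 and SPDK_BDEV_QOS_TIMESLICE_IN_USEC = 1000 give max_per_timeslice = 10000 * 1000 / 1000000 = 10 I/O per 1 ms timeslice, clamped up to min_per_timeslice for very small limits.)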
*/ 2805 static void 2806 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 2807 { 2808 struct spdk_bdev_qos *qos = bdev->internal.qos; 2809 int i; 2810 2811 /* Rate limiting on this bdev enabled */ 2812 if (qos) { 2813 if (qos->ch == NULL) { 2814 struct spdk_io_channel *io_ch; 2815 2816 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 2817 bdev->name, spdk_get_thread()); 2818 2819 /* No qos channel has been selected, so set one up */ 2820 2821 /* Take another reference to ch */ 2822 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 2823 assert(io_ch != NULL); 2824 qos->ch = ch; 2825 2826 qos->thread = spdk_io_channel_get_thread(io_ch); 2827 2828 TAILQ_INIT(&qos->queued); 2829 2830 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2831 if (bdev_qos_is_iops_rate_limit(i) == true) { 2832 qos->rate_limits[i].min_per_timeslice = 2833 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 2834 } else { 2835 qos->rate_limits[i].min_per_timeslice = 2836 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 2837 } 2838 2839 if (qos->rate_limits[i].limit == 0) { 2840 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 2841 } 2842 } 2843 bdev_qos_update_max_quota_per_timeslice(qos); 2844 qos->timeslice_size = 2845 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 2846 qos->last_timeslice = spdk_get_ticks(); 2847 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 2848 qos, 2849 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 2850 } 2851 2852 ch->flags |= BDEV_CH_QOS_ENABLED; 2853 } 2854 } 2855 2856 struct poll_timeout_ctx { 2857 struct spdk_bdev_desc *desc; 2858 uint64_t timeout_in_sec; 2859 spdk_bdev_io_timeout_cb cb_fn; 2860 void *cb_arg; 2861 }; 2862 2863 static void 2864 bdev_desc_free(struct spdk_bdev_desc *desc) 2865 { 2866 pthread_mutex_destroy(&desc->mutex); 2867 free(desc->media_events_buffer); 2868 free(desc); 2869 } 2870 2871 static void 2872 bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status) 2873 { 2874 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 2875 struct spdk_bdev_desc *desc = ctx->desc; 2876 2877 free(ctx); 2878 2879 pthread_mutex_lock(&desc->mutex); 2880 desc->refs--; 2881 if (desc->closed == true && desc->refs == 0) { 2882 pthread_mutex_unlock(&desc->mutex); 2883 bdev_desc_free(desc); 2884 return; 2885 } 2886 pthread_mutex_unlock(&desc->mutex); 2887 } 2888 2889 static void 2890 bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i) 2891 { 2892 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 2893 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 2894 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 2895 struct spdk_bdev_desc *desc = ctx->desc; 2896 struct spdk_bdev_io *bdev_io; 2897 uint64_t now; 2898 2899 pthread_mutex_lock(&desc->mutex); 2900 if (desc->closed == true) { 2901 pthread_mutex_unlock(&desc->mutex); 2902 spdk_for_each_channel_continue(i, -1); 2903 return; 2904 } 2905 pthread_mutex_unlock(&desc->mutex); 2906 2907 now = spdk_get_ticks(); 2908 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 2909 /* Exclude any I/O that are generated via splitting. */ 2910 if (bdev_io->internal.cb == bdev_io_split_done) { 2911 continue; 2912 } 2913 2914 /* Once we find an I/O that has not timed out, we can immediately 2915 * exit the loop. 
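* This relies on io_submitted being ordered by submission time (I/O are appended at the tail when submitted), so every later entry was submitted more recently and cannot have timed out either.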
2916 */ 2917 if (now < (bdev_io->internal.submit_tsc + 2918 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 2919 goto end; 2920 } 2921 2922 if (bdev_io->internal.desc == desc) { 2923 ctx->cb_fn(ctx->cb_arg, bdev_io); 2924 } 2925 } 2926 2927 end: 2928 spdk_for_each_channel_continue(i, 0); 2929 } 2930 2931 static int 2932 bdev_poll_timeout_io(void *arg) 2933 { 2934 struct spdk_bdev_desc *desc = arg; 2935 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 2936 struct poll_timeout_ctx *ctx; 2937 2938 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 2939 if (!ctx) { 2940 SPDK_ERRLOG("failed to allocate memory\n"); 2941 return SPDK_POLLER_BUSY; 2942 } 2943 ctx->desc = desc; 2944 ctx->cb_arg = desc->cb_arg; 2945 ctx->cb_fn = desc->cb_fn; 2946 ctx->timeout_in_sec = desc->timeout_in_sec; 2947 2948 /* Take a ref on the descriptor in case it gets closed while we are checking 2949 * all of the channels. 2950 */ 2951 pthread_mutex_lock(&desc->mutex); 2952 desc->refs++; 2953 pthread_mutex_unlock(&desc->mutex); 2954 2955 spdk_for_each_channel(__bdev_to_io_dev(bdev), 2956 bdev_channel_poll_timeout_io, 2957 ctx, 2958 bdev_channel_poll_timeout_io_done); 2959 2960 return SPDK_POLLER_BUSY; 2961 } 2962 2963 int 2964 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 2965 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 2966 { 2967 assert(desc->thread == spdk_get_thread()); 2968 2969 spdk_poller_unregister(&desc->io_timeout_poller); 2970 2971 if (timeout_in_sec) { 2972 assert(cb_fn != NULL); 2973 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 2974 desc, 2975 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 2976 1000); 2977 if (desc->io_timeout_poller == NULL) { 2978 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 2979 return -1; 2980 } 2981 } 2982 2983 desc->cb_fn = cb_fn; 2984 desc->cb_arg = cb_arg; 2985 desc->timeout_in_sec = timeout_in_sec; 2986 2987 return 0; 2988 } 2989 2990 static int 2991 bdev_channel_create(void *io_device, void *ctx_buf) 2992 { 2993 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 2994 struct spdk_bdev_channel *ch = ctx_buf; 2995 struct spdk_io_channel *mgmt_io_ch; 2996 struct spdk_bdev_mgmt_channel *mgmt_ch; 2997 struct spdk_bdev_shared_resource *shared_resource; 2998 struct lba_range *range; 2999 3000 ch->bdev = bdev; 3001 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3002 if (!ch->channel) { 3003 return -1; 3004 } 3005 3006 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3007 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3008 3009 assert(ch->histogram == NULL); 3010 if (bdev->internal.histogram_enabled) { 3011 ch->histogram = spdk_histogram_data_alloc(); 3012 if (ch->histogram == NULL) { 3013 SPDK_ERRLOG("Could not allocate histogram\n"); 3014 } 3015 } 3016 3017 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3018 if (!mgmt_io_ch) { 3019 spdk_put_io_channel(ch->channel); 3020 return -1; 3021 } 3022 3023 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 3024 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3025 if (shared_resource->shared_ch == ch->channel) { 3026 spdk_put_io_channel(mgmt_io_ch); 3027 shared_resource->ref++; 3028 break; 3029 } 3030 } 3031 3032 if (shared_resource == NULL) { 3033 shared_resource = calloc(1, sizeof(*shared_resource)); 3034 if (shared_resource == NULL) { 3035 spdk_put_io_channel(ch->channel); 3036 spdk_put_io_channel(mgmt_io_ch); 3037 return -1; 3038 } 3039 3040 shared_resource->mgmt_ch = mgmt_ch; 3041 
shared_resource->io_outstanding = 0; 3042 TAILQ_INIT(&shared_resource->nomem_io); 3043 shared_resource->nomem_threshold = 0; 3044 shared_resource->shared_ch = ch->channel; 3045 shared_resource->ref = 1; 3046 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3047 } 3048 3049 memset(&ch->stat, 0, sizeof(ch->stat)); 3050 ch->stat.ticks_rate = spdk_get_ticks_hz(); 3051 ch->io_outstanding = 0; 3052 TAILQ_INIT(&ch->queued_resets); 3053 TAILQ_INIT(&ch->locked_ranges); 3054 ch->flags = 0; 3055 ch->shared_resource = shared_resource; 3056 3057 TAILQ_INIT(&ch->io_submitted); 3058 TAILQ_INIT(&ch->io_locked); 3059 3060 #ifdef SPDK_CONFIG_VTUNE 3061 { 3062 char *name; 3063 __itt_init_ittlib(NULL, 0); 3064 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3065 if (!name) { 3066 bdev_channel_destroy_resource(ch); 3067 return -1; 3068 } 3069 ch->handle = __itt_string_handle_create(name); 3070 free(name); 3071 ch->start_tsc = spdk_get_ticks(); 3072 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3073 memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); 3074 } 3075 #endif 3076 3077 pthread_mutex_lock(&bdev->internal.mutex); 3078 bdev_enable_qos(bdev, ch); 3079 3080 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3081 struct lba_range *new_range; 3082 3083 new_range = calloc(1, sizeof(*new_range)); 3084 if (new_range == NULL) { 3085 pthread_mutex_unlock(&bdev->internal.mutex); 3086 bdev_channel_destroy_resource(ch); 3087 return -1; 3088 } 3089 new_range->length = range->length; 3090 new_range->offset = range->offset; 3091 new_range->locked_ctx = range->locked_ctx; 3092 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3093 } 3094 3095 pthread_mutex_unlock(&bdev->internal.mutex); 3096 3097 return 0; 3098 } 3099 3100 /* 3101 * Abort I/O that are waiting on a data buffer. These types of I/O are 3102 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 3103 */ 3104 static void 3105 bdev_abort_all_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 3106 { 3107 bdev_io_stailq_t tmp; 3108 struct spdk_bdev_io *bdev_io; 3109 3110 STAILQ_INIT(&tmp); 3111 3112 while (!STAILQ_EMPTY(queue)) { 3113 bdev_io = STAILQ_FIRST(queue); 3114 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 3115 if (bdev_io->internal.ch == ch) { 3116 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3117 } else { 3118 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 3119 } 3120 } 3121 3122 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 3123 } 3124 3125 /* 3126 * Abort I/O that are queued waiting for submission. These types of I/O are 3127 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3128 */ 3129 static void 3130 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3131 { 3132 struct spdk_bdev_io *bdev_io, *tmp; 3133 3134 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3135 if (bdev_io->internal.ch == ch) { 3136 TAILQ_REMOVE(queue, bdev_io, internal.link); 3137 /* 3138 * spdk_bdev_io_complete() assumes that the completed I/O had 3139 * been submitted to the bdev module. Since in this case it 3140 * hadn't, bump io_outstanding to account for the decrement 3141 * that spdk_bdev_io_complete() will do. 
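* (Reset I/Os are excluded from the bump below because spdk_bdev_io_complete() treats resets specially and does not decrement these counters for them.)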
3142 */ 3143 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3144 ch->io_outstanding++; 3145 ch->shared_resource->io_outstanding++; 3146 } 3147 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3148 } 3149 } 3150 } 3151 3152 static bool 3153 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3154 { 3155 struct spdk_bdev_io *bdev_io; 3156 3157 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3158 if (bdev_io == bio_to_abort) { 3159 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3160 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3161 return true; 3162 } 3163 } 3164 3165 return false; 3166 } 3167 3168 static bool 3169 bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3170 { 3171 struct spdk_bdev_io *bdev_io; 3172 3173 STAILQ_FOREACH(bdev_io, queue, internal.buf_link) { 3174 if (bdev_io == bio_to_abort) { 3175 STAILQ_REMOVE(queue, bio_to_abort, spdk_bdev_io, internal.buf_link); 3176 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3177 return true; 3178 } 3179 } 3180 3181 return false; 3182 } 3183 3184 static void 3185 bdev_qos_channel_destroy(void *cb_arg) 3186 { 3187 struct spdk_bdev_qos *qos = cb_arg; 3188 3189 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3190 spdk_poller_unregister(&qos->poller); 3191 3192 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3193 3194 free(qos); 3195 } 3196 3197 static int 3198 bdev_qos_destroy(struct spdk_bdev *bdev) 3199 { 3200 int i; 3201 3202 /* 3203 * Cleanly shutting down the QoS poller is tricky, because 3204 * during the asynchronous operation the user could open 3205 * a new descriptor and create a new channel, spawning 3206 * a new QoS poller. 3207 * 3208 * The strategy is to create a new QoS structure here and swap it 3209 * in. The shutdown path then continues to refer to the old one 3210 * until it completes and then releases it. 3211 */ 3212 struct spdk_bdev_qos *new_qos, *old_qos; 3213 3214 old_qos = bdev->internal.qos; 3215 3216 new_qos = calloc(1, sizeof(*new_qos)); 3217 if (!new_qos) { 3218 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3219 return -ENOMEM; 3220 } 3221 3222 /* Copy the old QoS data into the newly allocated structure */ 3223 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3224 3225 /* Zero out the key parts of the QoS structure */ 3226 new_qos->ch = NULL; 3227 new_qos->thread = NULL; 3228 new_qos->poller = NULL; 3229 TAILQ_INIT(&new_qos->queued); 3230 /* 3231 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3232 * It will be used later for the new QoS structure. 3233 */ 3234 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3235 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3236 new_qos->rate_limits[i].min_per_timeslice = 0; 3237 new_qos->rate_limits[i].max_per_timeslice = 0; 3238 } 3239 3240 bdev->internal.qos = new_qos; 3241 3242 if (old_qos->thread == NULL) { 3243 free(old_qos); 3244 } else { 3245 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3246 } 3247 3248 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3249 * been destroyed yet. The destruction path will end up waiting for the final 3250 * channel to be put before it releases resources. 
*/ 3251 3252 return 0; 3253 } 3254 3255 static void 3256 bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3257 { 3258 total->bytes_read += add->bytes_read; 3259 total->num_read_ops += add->num_read_ops; 3260 total->bytes_written += add->bytes_written; 3261 total->num_write_ops += add->num_write_ops; 3262 total->bytes_unmapped += add->bytes_unmapped; 3263 total->num_unmap_ops += add->num_unmap_ops; 3264 total->read_latency_ticks += add->read_latency_ticks; 3265 total->write_latency_ticks += add->write_latency_ticks; 3266 total->unmap_latency_ticks += add->unmap_latency_ticks; 3267 } 3268 3269 static void 3270 bdev_channel_destroy(void *io_device, void *ctx_buf) 3271 { 3272 struct spdk_bdev_channel *ch = ctx_buf; 3273 struct spdk_bdev_mgmt_channel *mgmt_ch; 3274 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3275 3276 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3277 spdk_get_thread()); 3278 3279 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 3280 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3281 3282 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3283 pthread_mutex_lock(&ch->bdev->internal.mutex); 3284 bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); 3285 pthread_mutex_unlock(&ch->bdev->internal.mutex); 3286 3287 mgmt_ch = shared_resource->mgmt_ch; 3288 3289 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3290 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3291 bdev_abort_all_buf_io(&mgmt_ch->need_buf_small, ch); 3292 bdev_abort_all_buf_io(&mgmt_ch->need_buf_large, ch); 3293 3294 if (ch->histogram) { 3295 spdk_histogram_data_free(ch->histogram); 3296 } 3297 3298 bdev_channel_destroy_resource(ch); 3299 } 3300 3301 /* 3302 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 3303 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
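* RB_INSERT() returns NULL when the new node is inserted, and a pointer to the already-present node when the key exists, which is why a non-NULL return is reported as -EEXIST below.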
3304 */ 3305 static int 3306 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3307 { 3308 struct spdk_bdev_name *tmp; 3309 3310 bdev_name->name = strdup(name); 3311 if (bdev_name->name == NULL) { 3312 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3313 return -ENOMEM; 3314 } 3315 3316 bdev_name->bdev = bdev; 3317 3318 pthread_mutex_lock(&g_bdev_mgr.mutex); 3319 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3320 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3321 3322 if (tmp != NULL) { 3323 SPDK_ERRLOG("Bdev name %s already exists\n", name); 3324 free(bdev_name->name); 3325 return -EEXIST; 3326 } 3327 3328 return 0; 3329 } 3330 3331 static void 3332 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 3333 { 3334 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3335 free(bdev_name->name); 3336 } 3337 3338 static void 3339 bdev_name_del(struct spdk_bdev_name *bdev_name) 3340 { 3341 pthread_mutex_lock(&g_bdev_mgr.mutex); 3342 bdev_name_del_unsafe(bdev_name); 3343 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3344 } 3345 3346 int 3347 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 3348 { 3349 struct spdk_bdev_alias *tmp; 3350 int ret; 3351 3352 if (alias == NULL) { 3353 SPDK_ERRLOG("Empty alias passed\n"); 3354 return -EINVAL; 3355 } 3356 3357 tmp = calloc(1, sizeof(*tmp)); 3358 if (tmp == NULL) { 3359 SPDK_ERRLOG("Unable to allocate alias\n"); 3360 return -ENOMEM; 3361 } 3362 3363 ret = bdev_name_add(&tmp->alias, bdev, alias); 3364 if (ret != 0) { 3365 free(tmp); 3366 return ret; 3367 } 3368 3369 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 3370 3371 return 0; 3372 } 3373 3374 static int 3375 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 3376 void (*alias_del_fn)(struct spdk_bdev_name *n)) 3377 { 3378 struct spdk_bdev_alias *tmp; 3379 3380 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 3381 if (strcmp(alias, tmp->alias.name) == 0) { 3382 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 3383 alias_del_fn(&tmp->alias); 3384 free(tmp); 3385 return 0; 3386 } 3387 } 3388 3389 return -ENOENT; 3390 } 3391 3392 int 3393 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 3394 { 3395 int rc; 3396 3397 rc = bdev_alias_del(bdev, alias, bdev_name_del); 3398 if (rc == -ENOENT) { 3399 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 3400 } 3401 3402 return rc; 3403 } 3404 3405 void 3406 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 3407 { 3408 struct spdk_bdev_alias *p, *tmp; 3409 3410 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 3411 TAILQ_REMOVE(&bdev->aliases, p, tailq); 3412 bdev_name_del(&p->alias); 3413 free(p); 3414 } 3415 } 3416 3417 struct spdk_io_channel * 3418 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 3419 { 3420 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 3421 } 3422 3423 void * 3424 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 3425 { 3426 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3427 void *ctx = NULL; 3428 3429 if (bdev->fn_table->get_module_ctx) { 3430 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 3431 } 3432 3433 return ctx; 3434 } 3435 3436 const char * 3437 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 3438 { 3439 return bdev->module->name; 3440 } 3441 3442 const char * 3443 spdk_bdev_get_name(const struct spdk_bdev *bdev) 3444 { 3445 return bdev->name; 3446 } 3447 3448 const char * 3449 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 3450 { 3451 return bdev->product_name; 3452 } 3453 
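/*
 * Illustrative usage sketch (hypothetical caller code; 'desc' is assumed to be
 * an open struct spdk_bdev_desc): a consumer can size and align a data buffer
 * from the accessors below, e.g.:
 *
 *	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
 *	uint32_t block_size = spdk_bdev_get_block_size(bdev);
 *	size_t align = spdk_bdev_get_buf_align(bdev);
 *	void *buf = spdk_dma_zmalloc(block_size, align, NULL);
 */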
3454 const struct spdk_bdev_aliases_list * 3455 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 3456 { 3457 return &bdev->aliases; 3458 } 3459 3460 uint32_t 3461 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 3462 { 3463 return bdev->blocklen; 3464 } 3465 3466 uint32_t 3467 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 3468 { 3469 return bdev->write_unit_size; 3470 } 3471 3472 uint64_t 3473 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 3474 { 3475 return bdev->blockcnt; 3476 } 3477 3478 const char * 3479 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 3480 { 3481 return qos_rpc_type[type]; 3482 } 3483 3484 void 3485 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 3486 { 3487 int i; 3488 3489 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 3490 3491 pthread_mutex_lock(&bdev->internal.mutex); 3492 if (bdev->internal.qos) { 3493 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3494 if (bdev->internal.qos->rate_limits[i].limit != 3495 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3496 limits[i] = bdev->internal.qos->rate_limits[i].limit; 3497 if (bdev_qos_is_iops_rate_limit(i) == false) { 3498 /* Change from Byte to Megabyte which is user visible. */ 3499 limits[i] = limits[i] / 1024 / 1024; 3500 } 3501 } 3502 } 3503 } 3504 pthread_mutex_unlock(&bdev->internal.mutex); 3505 } 3506 3507 size_t 3508 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 3509 { 3510 return 1 << bdev->required_alignment; 3511 } 3512 3513 uint32_t 3514 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 3515 { 3516 return bdev->optimal_io_boundary; 3517 } 3518 3519 bool 3520 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 3521 { 3522 return bdev->write_cache; 3523 } 3524 3525 const struct spdk_uuid * 3526 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 3527 { 3528 return &bdev->uuid; 3529 } 3530 3531 uint16_t 3532 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 3533 { 3534 return bdev->acwu; 3535 } 3536 3537 uint32_t 3538 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 3539 { 3540 return bdev->md_len; 3541 } 3542 3543 bool 3544 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 3545 { 3546 return (bdev->md_len != 0) && bdev->md_interleave; 3547 } 3548 3549 bool 3550 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 3551 { 3552 return (bdev->md_len != 0) && !bdev->md_interleave; 3553 } 3554 3555 bool 3556 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 3557 { 3558 return bdev->zoned; 3559 } 3560 3561 uint32_t 3562 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 3563 { 3564 if (spdk_bdev_is_md_interleaved(bdev)) { 3565 return bdev->blocklen - bdev->md_len; 3566 } else { 3567 return bdev->blocklen; 3568 } 3569 } 3570 3571 uint32_t 3572 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 3573 { 3574 return bdev->phys_blocklen; 3575 } 3576 3577 static uint32_t 3578 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 3579 { 3580 if (!spdk_bdev_is_md_interleaved(bdev)) { 3581 return bdev->blocklen + bdev->md_len; 3582 } else { 3583 return bdev->blocklen; 3584 } 3585 } 3586 3587 enum spdk_dif_type spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 3588 { 3589 if (bdev->md_len != 0) { 3590 return bdev->dif_type; 3591 } else { 3592 return SPDK_DIF_DISABLE; 3593 } 3594 } 3595 3596 bool 3597 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 3598 { 3599 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 3600 return bdev->dif_is_head_of_md; 3601 
} else { 3602 return false; 3603 } 3604 } 3605 3606 bool 3607 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 3608 enum spdk_dif_check_type check_type) 3609 { 3610 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 3611 return false; 3612 } 3613 3614 switch (check_type) { 3615 case SPDK_DIF_CHECK_TYPE_REFTAG: 3616 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 3617 case SPDK_DIF_CHECK_TYPE_APPTAG: 3618 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 3619 case SPDK_DIF_CHECK_TYPE_GUARD: 3620 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 3621 default: 3622 return false; 3623 } 3624 } 3625 3626 uint64_t 3627 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 3628 { 3629 return bdev->internal.measured_queue_depth; 3630 } 3631 3632 uint64_t 3633 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 3634 { 3635 return bdev->internal.period; 3636 } 3637 3638 uint64_t 3639 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 3640 { 3641 return bdev->internal.weighted_io_time; 3642 } 3643 3644 uint64_t 3645 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 3646 { 3647 return bdev->internal.io_time; 3648 } 3649 3650 static void 3651 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 3652 { 3653 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3654 3655 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 3656 3657 if (bdev->internal.measured_queue_depth) { 3658 bdev->internal.io_time += bdev->internal.period; 3659 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 3660 } 3661 } 3662 3663 static void 3664 _calculate_measured_qd(struct spdk_io_channel_iter *i) 3665 { 3666 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3667 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 3668 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 3669 3670 bdev->internal.temporary_queue_depth += ch->io_outstanding; 3671 spdk_for_each_channel_continue(i, 0); 3672 } 3673 3674 static int 3675 bdev_calculate_measured_queue_depth(void *ctx) 3676 { 3677 struct spdk_bdev *bdev = ctx; 3678 bdev->internal.temporary_queue_depth = 0; 3679 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 3680 _calculate_measured_qd_cpl); 3681 return SPDK_POLLER_BUSY; 3682 } 3683 3684 void 3685 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 3686 { 3687 bdev->internal.period = period; 3688 3689 if (bdev->internal.qd_poller != NULL) { 3690 spdk_poller_unregister(&bdev->internal.qd_poller); 3691 bdev->internal.measured_queue_depth = UINT64_MAX; 3692 } 3693 3694 if (period != 0) { 3695 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, bdev, 3696 period); 3697 } 3698 } 3699 3700 static void 3701 _resize_notify(void *arg) 3702 { 3703 struct spdk_bdev_desc *desc = arg; 3704 3705 pthread_mutex_lock(&desc->mutex); 3706 desc->refs--; 3707 if (!desc->closed) { 3708 pthread_mutex_unlock(&desc->mutex); 3709 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 3710 desc->bdev, 3711 desc->callback.ctx); 3712 return; 3713 } else if (0 == desc->refs) { 3714 /* This descriptor was closed after this resize_notify message was sent. 3715 * spdk_bdev_close() could not free the descriptor since this message was 3716 * in flight, so we free it now using bdev_desc_free(). 
3717 */ 3718 pthread_mutex_unlock(&desc->mutex); 3719 bdev_desc_free(desc); 3720 return; 3721 } 3722 pthread_mutex_unlock(&desc->mutex); 3723 } 3724 3725 int 3726 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 3727 { 3728 struct spdk_bdev_desc *desc; 3729 int ret; 3730 3731 if (size == bdev->blockcnt) { 3732 return 0; 3733 } 3734 3735 pthread_mutex_lock(&bdev->internal.mutex); 3736 3737 /* bdev has open descriptors */ 3738 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 3739 bdev->blockcnt > size) { 3740 ret = -EBUSY; 3741 } else { 3742 bdev->blockcnt = size; 3743 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 3744 pthread_mutex_lock(&desc->mutex); 3745 if (!desc->closed) { 3746 desc->refs++; 3747 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 3748 } 3749 pthread_mutex_unlock(&desc->mutex); 3750 } 3751 ret = 0; 3752 } 3753 3754 pthread_mutex_unlock(&bdev->internal.mutex); 3755 3756 return ret; 3757 } 3758 3759 /* 3760 * Convert I/O offset and length from bytes to blocks. 3761 * 3762 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 3763 */ 3764 static uint64_t 3765 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 3766 uint64_t num_bytes, uint64_t *num_blocks) 3767 { 3768 uint32_t block_size = bdev->blocklen; 3769 uint8_t shift_cnt; 3770 3771 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 3772 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 3773 shift_cnt = spdk_u32log2(block_size); 3774 *offset_blocks = offset_bytes >> shift_cnt; 3775 *num_blocks = num_bytes >> shift_cnt; 3776 return (offset_bytes - (*offset_blocks << shift_cnt)) | 3777 (num_bytes - (*num_blocks << shift_cnt)); 3778 } else { 3779 *offset_blocks = offset_bytes / block_size; 3780 *num_blocks = num_bytes / block_size; 3781 return (offset_bytes % block_size) | (num_bytes % block_size); 3782 } 3783 } 3784 3785 static bool 3786 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 3787 { 3788 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 3789 * has been an overflow and hence the offset has been wrapped around */ 3790 if (offset_blocks + num_blocks < offset_blocks) { 3791 return false; 3792 } 3793 3794 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 3795 if (offset_blocks + num_blocks > bdev->blockcnt) { 3796 return false; 3797 } 3798 3799 return true; 3800 } 3801 3802 static bool 3803 _bdev_io_check_md_buf(const struct iovec *iovs, const void *md_buf) 3804 { 3805 return _is_buf_allocated(iovs) == (md_buf != NULL); 3806 } 3807 3808 static int 3809 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 3810 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3811 spdk_bdev_io_completion_cb cb, void *cb_arg) 3812 { 3813 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3814 struct spdk_bdev_io *bdev_io; 3815 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3816 3817 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3818 return -EINVAL; 3819 } 3820 3821 bdev_io = bdev_channel_get_io(channel); 3822 if (!bdev_io) { 3823 return -ENOMEM; 3824 } 3825 3826 bdev_io->internal.ch = channel; 3827 bdev_io->internal.desc = desc; 3828 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 3829 bdev_io->u.bdev.iovs = &bdev_io->iov; 3830 bdev_io->u.bdev.iovs[0].iov_base = buf; 3831 
bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 3832 bdev_io->u.bdev.iovcnt = 1; 3833 bdev_io->u.bdev.md_buf = md_buf; 3834 bdev_io->u.bdev.num_blocks = num_blocks; 3835 bdev_io->u.bdev.offset_blocks = offset_blocks; 3836 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3837 3838 bdev_io_submit(bdev_io); 3839 return 0; 3840 } 3841 3842 int 3843 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3844 void *buf, uint64_t offset, uint64_t nbytes, 3845 spdk_bdev_io_completion_cb cb, void *cb_arg) 3846 { 3847 uint64_t offset_blocks, num_blocks; 3848 3849 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3850 nbytes, &num_blocks) != 0) { 3851 return -EINVAL; 3852 } 3853 3854 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 3855 } 3856 3857 int 3858 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3859 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 3860 spdk_bdev_io_completion_cb cb, void *cb_arg) 3861 { 3862 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 3863 } 3864 3865 int 3866 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3867 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3868 spdk_bdev_io_completion_cb cb, void *cb_arg) 3869 { 3870 struct iovec iov = { 3871 .iov_base = buf, 3872 }; 3873 3874 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3875 return -EINVAL; 3876 } 3877 3878 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 3879 return -EINVAL; 3880 } 3881 3882 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 3883 cb, cb_arg); 3884 } 3885 3886 int 3887 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3888 struct iovec *iov, int iovcnt, 3889 uint64_t offset, uint64_t nbytes, 3890 spdk_bdev_io_completion_cb cb, void *cb_arg) 3891 { 3892 uint64_t offset_blocks, num_blocks; 3893 3894 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3895 nbytes, &num_blocks) != 0) { 3896 return -EINVAL; 3897 } 3898 3899 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 3900 } 3901 3902 static int 3903 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3904 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 3905 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 3906 struct spdk_bdev_ext_io_opts *opts) 3907 { 3908 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3909 struct spdk_bdev_io *bdev_io; 3910 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3911 3912 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3913 return -EINVAL; 3914 } 3915 3916 bdev_io = bdev_channel_get_io(channel); 3917 if (!bdev_io) { 3918 return -ENOMEM; 3919 } 3920 3921 bdev_io->internal.ch = channel; 3922 bdev_io->internal.desc = desc; 3923 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 3924 bdev_io->u.bdev.iovs = iov; 3925 bdev_io->u.bdev.iovcnt = iovcnt; 3926 bdev_io->u.bdev.md_buf = md_buf; 3927 bdev_io->u.bdev.num_blocks = num_blocks; 3928 bdev_io->u.bdev.offset_blocks = offset_blocks; 3929 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3930 bdev_io->internal.ext_opts = opts; 3931 3932 bdev_io_submit(bdev_io); 3933 return 0; 3934 } 3935 3936 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3937 struct iovec *iov, int iovcnt, 3938 uint64_t 
offset_blocks, uint64_t num_blocks, 3939 spdk_bdev_io_completion_cb cb, void *cb_arg) 3940 { 3941 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 3942 num_blocks, cb, cb_arg, NULL); 3943 } 3944 3945 int 3946 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3947 struct iovec *iov, int iovcnt, void *md_buf, 3948 uint64_t offset_blocks, uint64_t num_blocks, 3949 spdk_bdev_io_completion_cb cb, void *cb_arg) 3950 { 3951 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3952 return -EINVAL; 3953 } 3954 3955 if (!_bdev_io_check_md_buf(iov, md_buf)) { 3956 return -EINVAL; 3957 } 3958 3959 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 3960 num_blocks, cb, cb_arg, NULL); 3961 } 3962 3963 int 3964 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3965 struct iovec *iov, int iovcnt, 3966 uint64_t offset_blocks, uint64_t num_blocks, 3967 spdk_bdev_io_completion_cb cb, void *cb_arg, 3968 struct spdk_bdev_ext_io_opts *opts) 3969 { 3970 void *md = NULL; 3971 3972 if (opts) { 3973 md = opts->metadata; 3974 } 3975 3976 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3977 return -EINVAL; 3978 } 3979 3980 if (md && !_bdev_io_check_md_buf(iov, md)) { 3981 return -EINVAL; 3982 } 3983 3984 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 3985 num_blocks, cb, cb_arg, opts); 3986 } 3987 3988 static int 3989 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3990 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3991 spdk_bdev_io_completion_cb cb, void *cb_arg) 3992 { 3993 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3994 struct spdk_bdev_io *bdev_io; 3995 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3996 3997 if (!desc->write) { 3998 return -EBADF; 3999 } 4000 4001 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4002 return -EINVAL; 4003 } 4004 4005 bdev_io = bdev_channel_get_io(channel); 4006 if (!bdev_io) { 4007 return -ENOMEM; 4008 } 4009 4010 bdev_io->internal.ch = channel; 4011 bdev_io->internal.desc = desc; 4012 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4013 bdev_io->u.bdev.iovs = &bdev_io->iov; 4014 bdev_io->u.bdev.iovs[0].iov_base = buf; 4015 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4016 bdev_io->u.bdev.iovcnt = 1; 4017 bdev_io->u.bdev.md_buf = md_buf; 4018 bdev_io->u.bdev.num_blocks = num_blocks; 4019 bdev_io->u.bdev.offset_blocks = offset_blocks; 4020 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4021 4022 bdev_io_submit(bdev_io); 4023 return 0; 4024 } 4025 4026 int 4027 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4028 void *buf, uint64_t offset, uint64_t nbytes, 4029 spdk_bdev_io_completion_cb cb, void *cb_arg) 4030 { 4031 uint64_t offset_blocks, num_blocks; 4032 4033 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4034 nbytes, &num_blocks) != 0) { 4035 return -EINVAL; 4036 } 4037 4038 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4039 } 4040 4041 int 4042 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4043 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4044 spdk_bdev_io_completion_cb cb, void *cb_arg) 4045 { 4046 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4047 cb, cb_arg); 4048 } 4049 4050 int 4051 
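/* Separate-metadata variant of the single-buffer write: the checks below
 * require the bdev to expose metadata in a separate buffer and the md_buf
 * argument to be paired consistently with the data buffer before the request
 * is handed to bdev_write_blocks_with_md().
 *
 * Illustrative call sketch (desc, ch, data, md and write_done are
 * hypothetical names, not defined in this file):
 *
 *   int rc = spdk_bdev_write_blocks_with_md(desc, ch, data, md,
 *                                           0, 8, write_done, NULL);
 */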
spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4052 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4053 spdk_bdev_io_completion_cb cb, void *cb_arg) 4054 { 4055 struct iovec iov = { 4056 .iov_base = buf, 4057 }; 4058 4059 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4060 return -EINVAL; 4061 } 4062 4063 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4064 return -EINVAL; 4065 } 4066 4067 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4068 cb, cb_arg); 4069 } 4070 4071 static int 4072 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4073 struct iovec *iov, int iovcnt, void *md_buf, 4074 uint64_t offset_blocks, uint64_t num_blocks, 4075 spdk_bdev_io_completion_cb cb, void *cb_arg, 4076 struct spdk_bdev_ext_io_opts *opts) 4077 { 4078 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4079 struct spdk_bdev_io *bdev_io; 4080 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4081 4082 if (!desc->write) { 4083 return -EBADF; 4084 } 4085 4086 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4087 return -EINVAL; 4088 } 4089 4090 bdev_io = bdev_channel_get_io(channel); 4091 if (!bdev_io) { 4092 return -ENOMEM; 4093 } 4094 4095 bdev_io->internal.ch = channel; 4096 bdev_io->internal.desc = desc; 4097 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4098 bdev_io->u.bdev.iovs = iov; 4099 bdev_io->u.bdev.iovcnt = iovcnt; 4100 bdev_io->u.bdev.md_buf = md_buf; 4101 bdev_io->u.bdev.num_blocks = num_blocks; 4102 bdev_io->u.bdev.offset_blocks = offset_blocks; 4103 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4104 bdev_io->internal.ext_opts = opts; 4105 4106 bdev_io_submit(bdev_io); 4107 return 0; 4108 } 4109 4110 int 4111 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4112 struct iovec *iov, int iovcnt, 4113 uint64_t offset, uint64_t len, 4114 spdk_bdev_io_completion_cb cb, void *cb_arg) 4115 { 4116 uint64_t offset_blocks, num_blocks; 4117 4118 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4119 len, &num_blocks) != 0) { 4120 return -EINVAL; 4121 } 4122 4123 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4124 } 4125 4126 int 4127 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4128 struct iovec *iov, int iovcnt, 4129 uint64_t offset_blocks, uint64_t num_blocks, 4130 spdk_bdev_io_completion_cb cb, void *cb_arg) 4131 { 4132 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4133 num_blocks, cb, cb_arg, NULL); 4134 } 4135 4136 int 4137 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4138 struct iovec *iov, int iovcnt, void *md_buf, 4139 uint64_t offset_blocks, uint64_t num_blocks, 4140 spdk_bdev_io_completion_cb cb, void *cb_arg) 4141 { 4142 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4143 return -EINVAL; 4144 } 4145 4146 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4147 return -EINVAL; 4148 } 4149 4150 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4151 num_blocks, cb, cb_arg, NULL); 4152 } 4153 4154 int 4155 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4156 struct iovec *iov, int iovcnt, 4157 uint64_t offset_blocks, uint64_t num_blocks, 4158 spdk_bdev_io_completion_cb cb, void *cb_arg, 4159 struct spdk_bdev_ext_io_opts *opts) 4160 { 4161 void *md = 
NULL; 4162 4163 if (opts) { 4164 md = opts->metadata; 4165 } 4166 4167 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4168 return -EINVAL; 4169 } 4170 4171 if (md && !_bdev_io_check_md_buf(iov, md)) { 4172 return -EINVAL; 4173 } 4174 4175 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4176 num_blocks, cb, cb_arg, opts); 4177 } 4178 4179 static void 4180 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4181 { 4182 struct spdk_bdev_io *parent_io = cb_arg; 4183 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 4184 int i, rc = 0; 4185 4186 if (!success) { 4187 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4188 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4189 spdk_bdev_free_io(bdev_io); 4190 return; 4191 } 4192 4193 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 4194 rc = memcmp(read_buf, 4195 parent_io->u.bdev.iovs[i].iov_base, 4196 parent_io->u.bdev.iovs[i].iov_len); 4197 if (rc) { 4198 break; 4199 } 4200 read_buf += parent_io->u.bdev.iovs[i].iov_len; 4201 } 4202 4203 spdk_bdev_free_io(bdev_io); 4204 4205 if (rc == 0) { 4206 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4207 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 4208 } else { 4209 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 4210 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4211 } 4212 } 4213 4214 static void 4215 bdev_compare_do_read(void *_bdev_io) 4216 { 4217 struct spdk_bdev_io *bdev_io = _bdev_io; 4218 int rc; 4219 4220 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 4221 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 4222 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4223 bdev_compare_do_read_done, bdev_io); 4224 4225 if (rc == -ENOMEM) { 4226 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 4227 } else if (rc != 0) { 4228 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4229 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4230 } 4231 } 4232 4233 static int 4234 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4235 struct iovec *iov, int iovcnt, void *md_buf, 4236 uint64_t offset_blocks, uint64_t num_blocks, 4237 spdk_bdev_io_completion_cb cb, void *cb_arg) 4238 { 4239 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4240 struct spdk_bdev_io *bdev_io; 4241 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4242 4243 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4244 return -EINVAL; 4245 } 4246 4247 bdev_io = bdev_channel_get_io(channel); 4248 if (!bdev_io) { 4249 return -ENOMEM; 4250 } 4251 4252 bdev_io->internal.ch = channel; 4253 bdev_io->internal.desc = desc; 4254 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4255 bdev_io->u.bdev.iovs = iov; 4256 bdev_io->u.bdev.iovcnt = iovcnt; 4257 bdev_io->u.bdev.md_buf = md_buf; 4258 bdev_io->u.bdev.num_blocks = num_blocks; 4259 bdev_io->u.bdev.offset_blocks = offset_blocks; 4260 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4261 4262 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4263 bdev_io_submit(bdev_io); 4264 return 0; 4265 } 4266 4267 bdev_compare_do_read(bdev_io); 4268 4269 return 0; 4270 } 4271 4272 int 4273 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4274 struct iovec *iov, int iovcnt, 4275 uint64_t offset_blocks, uint64_t num_blocks, 4276 spdk_bdev_io_completion_cb cb, void *cb_arg) 
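/* Public wrapper for the vectored compare path above. When the backing
 * module does not support a native COMPARE, the request is emulated by
 * reading the LBA range and memcmp()ing it against the caller's iovecs,
 * completing with SPDK_BDEV_IO_STATUS_MISCOMPARE on a mismatch.
 *
 * Illustrative call sketch (g_desc, g_ch, my_iov and compare_done are
 * hypothetical names, not defined in this file):
 *
 *   int rc = spdk_bdev_comparev_blocks(g_desc, g_ch, my_iov, 1,
 *                                      0, 8, compare_done, NULL);
 *
 * A return of -ENOMEM can be retried later via spdk_bdev_queue_io_wait().
 */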
4277 { 4278 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4279 num_blocks, cb, cb_arg); 4280 } 4281 4282 int 4283 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4284 struct iovec *iov, int iovcnt, void *md_buf, 4285 uint64_t offset_blocks, uint64_t num_blocks, 4286 spdk_bdev_io_completion_cb cb, void *cb_arg) 4287 { 4288 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4289 return -EINVAL; 4290 } 4291 4292 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4293 return -EINVAL; 4294 } 4295 4296 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4297 num_blocks, cb, cb_arg); 4298 } 4299 4300 static int 4301 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4302 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4303 spdk_bdev_io_completion_cb cb, void *cb_arg) 4304 { 4305 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4306 struct spdk_bdev_io *bdev_io; 4307 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4308 4309 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4310 return -EINVAL; 4311 } 4312 4313 bdev_io = bdev_channel_get_io(channel); 4314 if (!bdev_io) { 4315 return -ENOMEM; 4316 } 4317 4318 bdev_io->internal.ch = channel; 4319 bdev_io->internal.desc = desc; 4320 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4321 bdev_io->u.bdev.iovs = &bdev_io->iov; 4322 bdev_io->u.bdev.iovs[0].iov_base = buf; 4323 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4324 bdev_io->u.bdev.iovcnt = 1; 4325 bdev_io->u.bdev.md_buf = md_buf; 4326 bdev_io->u.bdev.num_blocks = num_blocks; 4327 bdev_io->u.bdev.offset_blocks = offset_blocks; 4328 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4329 4330 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4331 bdev_io_submit(bdev_io); 4332 return 0; 4333 } 4334 4335 bdev_compare_do_read(bdev_io); 4336 4337 return 0; 4338 } 4339 4340 int 4341 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4342 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4343 spdk_bdev_io_completion_cb cb, void *cb_arg) 4344 { 4345 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4346 cb, cb_arg); 4347 } 4348 4349 int 4350 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4351 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4352 spdk_bdev_io_completion_cb cb, void *cb_arg) 4353 { 4354 struct iovec iov = { 4355 .iov_base = buf, 4356 }; 4357 4358 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4359 return -EINVAL; 4360 } 4361 4362 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4363 return -EINVAL; 4364 } 4365 4366 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4367 cb, cb_arg); 4368 } 4369 4370 static void 4371 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 4372 { 4373 struct spdk_bdev_io *bdev_io = ctx; 4374 4375 if (unlock_status) { 4376 SPDK_ERRLOG("LBA range unlock failed\n"); 4377 } 4378 4379 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
true : 4380 false, bdev_io->internal.caller_ctx); 4381 } 4382 4383 static void 4384 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 4385 { 4386 bdev_io->internal.status = status; 4387 4388 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 4389 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4390 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 4391 } 4392 4393 static void 4394 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4395 { 4396 struct spdk_bdev_io *parent_io = cb_arg; 4397 4398 if (!success) { 4399 SPDK_ERRLOG("Compare and write operation failed\n"); 4400 } 4401 4402 spdk_bdev_free_io(bdev_io); 4403 4404 bdev_comparev_and_writev_blocks_unlock(parent_io, 4405 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 4406 } 4407 4408 static void 4409 bdev_compare_and_write_do_write(void *_bdev_io) 4410 { 4411 struct spdk_bdev_io *bdev_io = _bdev_io; 4412 int rc; 4413 4414 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 4415 spdk_io_channel_from_ctx(bdev_io->internal.ch), 4416 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 4417 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4418 bdev_compare_and_write_do_write_done, bdev_io); 4419 4420 4421 if (rc == -ENOMEM) { 4422 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 4423 } else if (rc != 0) { 4424 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 4425 } 4426 } 4427 4428 static void 4429 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4430 { 4431 struct spdk_bdev_io *parent_io = cb_arg; 4432 4433 spdk_bdev_free_io(bdev_io); 4434 4435 if (!success) { 4436 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 4437 return; 4438 } 4439 4440 bdev_compare_and_write_do_write(parent_io); 4441 } 4442 4443 static void 4444 bdev_compare_and_write_do_compare(void *_bdev_io) 4445 { 4446 struct spdk_bdev_io *bdev_io = _bdev_io; 4447 int rc; 4448 4449 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 4450 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 4451 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4452 bdev_compare_and_write_do_compare_done, bdev_io); 4453 4454 if (rc == -ENOMEM) { 4455 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 4456 } else if (rc != 0) { 4457 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 4458 } 4459 } 4460 4461 static void 4462 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 4463 { 4464 struct spdk_bdev_io *bdev_io = ctx; 4465 4466 if (status) { 4467 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 4468 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4469 return; 4470 } 4471 4472 bdev_compare_and_write_do_compare(bdev_io); 4473 } 4474 4475 int 4476 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4477 struct iovec *compare_iov, int compare_iovcnt, 4478 struct iovec *write_iov, int write_iovcnt, 4479 uint64_t offset_blocks, uint64_t num_blocks, 4480 spdk_bdev_io_completion_cb cb, void *cb_arg) 4481 { 4482 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4483 struct spdk_bdev_io *bdev_io; 4484 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4485 4486 if (!desc->write) { 4487 return 
-EBADF; 4488 } 4489 4490 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4491 return -EINVAL; 4492 } 4493 4494 if (num_blocks > bdev->acwu) { 4495 return -EINVAL; 4496 } 4497 4498 bdev_io = bdev_channel_get_io(channel); 4499 if (!bdev_io) { 4500 return -ENOMEM; 4501 } 4502 4503 bdev_io->internal.ch = channel; 4504 bdev_io->internal.desc = desc; 4505 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 4506 bdev_io->u.bdev.iovs = compare_iov; 4507 bdev_io->u.bdev.iovcnt = compare_iovcnt; 4508 bdev_io->u.bdev.fused_iovs = write_iov; 4509 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 4510 bdev_io->u.bdev.md_buf = NULL; 4511 bdev_io->u.bdev.num_blocks = num_blocks; 4512 bdev_io->u.bdev.offset_blocks = offset_blocks; 4513 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4514 4515 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 4516 bdev_io_submit(bdev_io); 4517 return 0; 4518 } 4519 4520 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 4521 bdev_comparev_and_writev_blocks_locked, bdev_io); 4522 } 4523 4524 int 4525 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4526 struct iovec *iov, int iovcnt, 4527 uint64_t offset_blocks, uint64_t num_blocks, 4528 bool populate, 4529 spdk_bdev_io_completion_cb cb, void *cb_arg) 4530 { 4531 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4532 struct spdk_bdev_io *bdev_io; 4533 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4534 4535 if (!desc->write) { 4536 return -EBADF; 4537 } 4538 4539 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4540 return -EINVAL; 4541 } 4542 4543 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 4544 return -ENOTSUP; 4545 } 4546 4547 bdev_io = bdev_channel_get_io(channel); 4548 if (!bdev_io) { 4549 return -ENOMEM; 4550 } 4551 4552 bdev_io->internal.ch = channel; 4553 bdev_io->internal.desc = desc; 4554 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 4555 bdev_io->u.bdev.num_blocks = num_blocks; 4556 bdev_io->u.bdev.offset_blocks = offset_blocks; 4557 bdev_io->u.bdev.iovs = iov; 4558 bdev_io->u.bdev.iovcnt = iovcnt; 4559 bdev_io->u.bdev.md_buf = NULL; 4560 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 4561 bdev_io->u.bdev.zcopy.commit = 0; 4562 bdev_io->u.bdev.zcopy.start = 1; 4563 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4564 4565 bdev_io_submit(bdev_io); 4566 4567 return 0; 4568 } 4569 4570 int 4571 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 4572 spdk_bdev_io_completion_cb cb, void *cb_arg) 4573 { 4574 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 4575 return -EINVAL; 4576 } 4577 4578 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 4579 bdev_io->u.bdev.zcopy.start = 0; 4580 bdev_io->internal.caller_ctx = cb_arg; 4581 bdev_io->internal.cb = cb; 4582 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 4583 4584 bdev_io_submit(bdev_io); 4585 4586 return 0; 4587 } 4588 4589 int 4590 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4591 uint64_t offset, uint64_t len, 4592 spdk_bdev_io_completion_cb cb, void *cb_arg) 4593 { 4594 uint64_t offset_blocks, num_blocks; 4595 4596 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4597 len, &num_blocks) != 0) { 4598 return -EINVAL; 4599 } 4600 4601 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4602 } 4603 4604 int 4605 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4606 uint64_t offset_blocks, uint64_t num_blocks, 4607 spdk_bdev_io_completion_cb cb, void *cb_arg) 4608 { 4609 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4610 struct spdk_bdev_io *bdev_io; 4611 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4612 4613 if (!desc->write) { 4614 return -EBADF; 4615 } 4616 4617 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4618 return -EINVAL; 4619 } 4620 4621 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 4622 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 4623 return -ENOTSUP; 4624 } 4625 4626 bdev_io = bdev_channel_get_io(channel); 4627 4628 if (!bdev_io) { 4629 return -ENOMEM; 4630 } 4631 4632 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 4633 bdev_io->internal.ch = channel; 4634 bdev_io->internal.desc = desc; 4635 bdev_io->u.bdev.offset_blocks = offset_blocks; 4636 bdev_io->u.bdev.num_blocks = num_blocks; 4637 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4638 4639 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 4640 bdev_io_submit(bdev_io); 4641 return 0; 4642 } 4643 4644 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 4645 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 4646 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 4647 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 4648 bdev_write_zero_buffer_next(bdev_io); 4649 4650 return 0; 4651 } 4652 4653 int 4654 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4655 uint64_t offset, uint64_t nbytes, 4656 spdk_bdev_io_completion_cb cb, void *cb_arg) 4657 { 4658 uint64_t offset_blocks, num_blocks; 4659 4660 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4661 nbytes, &num_blocks) != 0) { 4662 return -EINVAL; 4663 } 4664 4665 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4666 } 4667 4668 int 4669 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4670 uint64_t offset_blocks, uint64_t num_blocks, 4671 spdk_bdev_io_completion_cb cb, void *cb_arg) 4672 { 4673 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4674 struct spdk_bdev_io *bdev_io; 4675 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4676 4677 if (!desc->write) { 4678 return -EBADF; 4679 } 4680 4681 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4682 return -EINVAL; 4683 } 4684 4685 if (num_blocks == 0) { 4686 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 4687 return -EINVAL; 4688 } 4689 4690 bdev_io = bdev_channel_get_io(channel); 4691 if (!bdev_io) { 4692 return -ENOMEM; 4693 } 4694 4695 bdev_io->internal.ch = channel; 4696 
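/* UNMAP carries no data payload: the single iovec attached below is left
 * NULL and zero-length as a placeholder, and only offset_blocks and
 * num_blocks are meaningful to the backing module. */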
bdev_io->internal.desc = desc; 4697 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 4698 4699 bdev_io->u.bdev.iovs = &bdev_io->iov; 4700 bdev_io->u.bdev.iovs[0].iov_base = NULL; 4701 bdev_io->u.bdev.iovs[0].iov_len = 0; 4702 bdev_io->u.bdev.iovcnt = 1; 4703 4704 bdev_io->u.bdev.offset_blocks = offset_blocks; 4705 bdev_io->u.bdev.num_blocks = num_blocks; 4706 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4707 4708 bdev_io_submit(bdev_io); 4709 return 0; 4710 } 4711 4712 int 4713 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4714 uint64_t offset, uint64_t length, 4715 spdk_bdev_io_completion_cb cb, void *cb_arg) 4716 { 4717 uint64_t offset_blocks, num_blocks; 4718 4719 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4720 length, &num_blocks) != 0) { 4721 return -EINVAL; 4722 } 4723 4724 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4725 } 4726 4727 int 4728 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4729 uint64_t offset_blocks, uint64_t num_blocks, 4730 spdk_bdev_io_completion_cb cb, void *cb_arg) 4731 { 4732 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4733 struct spdk_bdev_io *bdev_io; 4734 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4735 4736 if (!desc->write) { 4737 return -EBADF; 4738 } 4739 4740 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4741 return -EINVAL; 4742 } 4743 4744 bdev_io = bdev_channel_get_io(channel); 4745 if (!bdev_io) { 4746 return -ENOMEM; 4747 } 4748 4749 bdev_io->internal.ch = channel; 4750 bdev_io->internal.desc = desc; 4751 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 4752 bdev_io->u.bdev.iovs = NULL; 4753 bdev_io->u.bdev.iovcnt = 0; 4754 bdev_io->u.bdev.offset_blocks = offset_blocks; 4755 bdev_io->u.bdev.num_blocks = num_blocks; 4756 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4757 4758 bdev_io_submit(bdev_io); 4759 return 0; 4760 } 4761 4762 static void 4763 bdev_reset_dev(struct spdk_io_channel_iter *i, int status) 4764 { 4765 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 4766 struct spdk_bdev_io *bdev_io; 4767 4768 bdev_io = TAILQ_FIRST(&ch->queued_resets); 4769 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 4770 bdev_io_submit_reset(bdev_io); 4771 } 4772 4773 static void 4774 bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 4775 { 4776 struct spdk_io_channel *ch; 4777 struct spdk_bdev_channel *channel; 4778 struct spdk_bdev_mgmt_channel *mgmt_channel; 4779 struct spdk_bdev_shared_resource *shared_resource; 4780 bdev_io_tailq_t tmp_queued; 4781 4782 TAILQ_INIT(&tmp_queued); 4783 4784 ch = spdk_io_channel_iter_get_channel(i); 4785 channel = spdk_io_channel_get_ctx(ch); 4786 shared_resource = channel->shared_resource; 4787 mgmt_channel = shared_resource->mgmt_ch; 4788 4789 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 4790 4791 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 4792 /* The QoS object is always valid and readable while 4793 * the channel flag is set, so the lock here should not 4794 * be necessary. We're not in the fast path though, so 4795 * just take it anyway. 
*/ 4796 pthread_mutex_lock(&channel->bdev->internal.mutex); 4797 if (channel->bdev->internal.qos->ch == channel) { 4798 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 4799 } 4800 pthread_mutex_unlock(&channel->bdev->internal.mutex); 4801 } 4802 4803 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 4804 bdev_abort_all_buf_io(&mgmt_channel->need_buf_small, channel); 4805 bdev_abort_all_buf_io(&mgmt_channel->need_buf_large, channel); 4806 bdev_abort_all_queued_io(&tmp_queued, channel); 4807 4808 spdk_for_each_channel_continue(i, 0); 4809 } 4810 4811 static void 4812 bdev_start_reset(void *ctx) 4813 { 4814 struct spdk_bdev_channel *ch = ctx; 4815 4816 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel, 4817 ch, bdev_reset_dev); 4818 } 4819 4820 static void 4821 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 4822 { 4823 struct spdk_bdev *bdev = ch->bdev; 4824 4825 assert(!TAILQ_EMPTY(&ch->queued_resets)); 4826 4827 pthread_mutex_lock(&bdev->internal.mutex); 4828 if (bdev->internal.reset_in_progress == NULL) { 4829 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 4830 /* 4831 * Take a channel reference for the target bdev for the life of this 4832 * reset. This guards against the channel getting destroyed while 4833 * spdk_for_each_channel() calls related to this reset IO are in 4834 * progress. We will release the reference when this reset is 4835 * completed. 4836 */ 4837 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4838 bdev_start_reset(ch); 4839 } 4840 pthread_mutex_unlock(&bdev->internal.mutex); 4841 } 4842 4843 int 4844 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4845 spdk_bdev_io_completion_cb cb, void *cb_arg) 4846 { 4847 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4848 struct spdk_bdev_io *bdev_io; 4849 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4850 4851 bdev_io = bdev_channel_get_io(channel); 4852 if (!bdev_io) { 4853 return -ENOMEM; 4854 } 4855 4856 bdev_io->internal.ch = channel; 4857 bdev_io->internal.desc = desc; 4858 bdev_io->internal.submit_tsc = spdk_get_ticks(); 4859 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 4860 bdev_io->u.reset.ch_ref = NULL; 4861 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4862 4863 pthread_mutex_lock(&bdev->internal.mutex); 4864 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 4865 pthread_mutex_unlock(&bdev->internal.mutex); 4866 4867 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 4868 internal.ch_link); 4869 4870 bdev_channel_start_reset(channel); 4871 4872 return 0; 4873 } 4874 4875 void 4876 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 4877 struct spdk_bdev_io_stat *stat) 4878 { 4879 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4880 4881 *stat = channel->stat; 4882 } 4883 4884 static void 4885 bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 4886 { 4887 void *io_device = spdk_io_channel_iter_get_io_device(i); 4888 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 4889 4890 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 4891 bdev_iostat_ctx->cb_arg, 0); 4892 free(bdev_iostat_ctx); 4893 } 4894 4895 static void 4896 bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 4897 { 4898 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 4899 struct 
spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 4900 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4901 4902 bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 4903 spdk_for_each_channel_continue(i, 0); 4904 } 4905 4906 void 4907 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 4908 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 4909 { 4910 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 4911 4912 assert(bdev != NULL); 4913 assert(stat != NULL); 4914 assert(cb != NULL); 4915 4916 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 4917 if (bdev_iostat_ctx == NULL) { 4918 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 4919 cb(bdev, stat, cb_arg, -ENOMEM); 4920 return; 4921 } 4922 4923 bdev_iostat_ctx->stat = stat; 4924 bdev_iostat_ctx->cb = cb; 4925 bdev_iostat_ctx->cb_arg = cb_arg; 4926 4927 /* Start with the statistics from previously deleted channels. */ 4928 pthread_mutex_lock(&bdev->internal.mutex); 4929 bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 4930 pthread_mutex_unlock(&bdev->internal.mutex); 4931 4932 /* Then iterate and add the statistics from each existing channel. */ 4933 spdk_for_each_channel(__bdev_to_io_dev(bdev), 4934 bdev_get_each_channel_stat, 4935 bdev_iostat_ctx, 4936 bdev_get_device_stat_done); 4937 } 4938 4939 int 4940 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4941 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 4942 spdk_bdev_io_completion_cb cb, void *cb_arg) 4943 { 4944 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4945 struct spdk_bdev_io *bdev_io; 4946 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4947 4948 if (!desc->write) { 4949 return -EBADF; 4950 } 4951 4952 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 4953 return -ENOTSUP; 4954 } 4955 4956 bdev_io = bdev_channel_get_io(channel); 4957 if (!bdev_io) { 4958 return -ENOMEM; 4959 } 4960 4961 bdev_io->internal.ch = channel; 4962 bdev_io->internal.desc = desc; 4963 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 4964 bdev_io->u.nvme_passthru.cmd = *cmd; 4965 bdev_io->u.nvme_passthru.buf = buf; 4966 bdev_io->u.nvme_passthru.nbytes = nbytes; 4967 bdev_io->u.nvme_passthru.md_buf = NULL; 4968 bdev_io->u.nvme_passthru.md_len = 0; 4969 4970 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4971 4972 bdev_io_submit(bdev_io); 4973 return 0; 4974 } 4975 4976 int 4977 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4978 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 4979 spdk_bdev_io_completion_cb cb, void *cb_arg) 4980 { 4981 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4982 struct spdk_bdev_io *bdev_io; 4983 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4984 4985 if (!desc->write) { 4986 /* 4987 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 4988 * to easily determine if the command is a read or write, but for now just 4989 * do not allow io_passthru with a read-only descriptor. 
4990 */ 4991 return -EBADF; 4992 } 4993 4994 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 4995 return -ENOTSUP; 4996 } 4997 4998 bdev_io = bdev_channel_get_io(channel); 4999 if (!bdev_io) { 5000 return -ENOMEM; 5001 } 5002 5003 bdev_io->internal.ch = channel; 5004 bdev_io->internal.desc = desc; 5005 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 5006 bdev_io->u.nvme_passthru.cmd = *cmd; 5007 bdev_io->u.nvme_passthru.buf = buf; 5008 bdev_io->u.nvme_passthru.nbytes = nbytes; 5009 bdev_io->u.nvme_passthru.md_buf = NULL; 5010 bdev_io->u.nvme_passthru.md_len = 0; 5011 5012 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5013 5014 bdev_io_submit(bdev_io); 5015 return 0; 5016 } 5017 5018 int 5019 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5020 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 5021 spdk_bdev_io_completion_cb cb, void *cb_arg) 5022 { 5023 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5024 struct spdk_bdev_io *bdev_io; 5025 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5026 5027 if (!desc->write) { 5028 /* 5029 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5030 * to easily determine if the command is a read or write, but for now just 5031 * do not allow io_passthru with a read-only descriptor. 5032 */ 5033 return -EBADF; 5034 } 5035 5036 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 5037 return -ENOTSUP; 5038 } 5039 5040 bdev_io = bdev_channel_get_io(channel); 5041 if (!bdev_io) { 5042 return -ENOMEM; 5043 } 5044 5045 bdev_io->internal.ch = channel; 5046 bdev_io->internal.desc = desc; 5047 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 5048 bdev_io->u.nvme_passthru.cmd = *cmd; 5049 bdev_io->u.nvme_passthru.buf = buf; 5050 bdev_io->u.nvme_passthru.nbytes = nbytes; 5051 bdev_io->u.nvme_passthru.md_buf = md_buf; 5052 bdev_io->u.nvme_passthru.md_len = md_len; 5053 5054 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5055 5056 bdev_io_submit(bdev_io); 5057 return 0; 5058 } 5059 5060 static void bdev_abort_retry(void *ctx); 5061 static void bdev_abort(struct spdk_bdev_io *parent_io); 5062 5063 static void 5064 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5065 { 5066 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 5067 struct spdk_bdev_io *parent_io = cb_arg; 5068 struct spdk_bdev_io *bio_to_abort, *tmp_io; 5069 5070 bio_to_abort = bdev_io->u.abort.bio_to_abort; 5071 5072 spdk_bdev_free_io(bdev_io); 5073 5074 if (!success) { 5075 /* Check if the target I/O completed in the meantime. */ 5076 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 5077 if (tmp_io == bio_to_abort) { 5078 break; 5079 } 5080 } 5081 5082 /* If the target I/O still exists, set the parent to failed. 
*/ 5083 if (tmp_io != NULL) { 5084 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5085 } 5086 } 5087 5088 parent_io->u.bdev.split_outstanding--; 5089 if (parent_io->u.bdev.split_outstanding == 0) { 5090 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5091 bdev_abort_retry(parent_io); 5092 } else { 5093 bdev_io_complete(parent_io); 5094 } 5095 } 5096 } 5097 5098 static int 5099 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 5100 struct spdk_bdev_io *bio_to_abort, 5101 spdk_bdev_io_completion_cb cb, void *cb_arg) 5102 { 5103 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5104 struct spdk_bdev_io *bdev_io; 5105 5106 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 5107 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 5108 /* TODO: Abort reset or abort request. */ 5109 return -ENOTSUP; 5110 } 5111 5112 bdev_io = bdev_channel_get_io(channel); 5113 if (bdev_io == NULL) { 5114 return -ENOMEM; 5115 } 5116 5117 bdev_io->internal.ch = channel; 5118 bdev_io->internal.desc = desc; 5119 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 5120 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5121 5122 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 5123 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 5124 5125 /* Parent abort request is not submitted directly, but to manage its 5126 * execution, add it to the submitted list here. 5127 */ 5128 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5129 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5130 5131 bdev_abort(bdev_io); 5132 5133 return 0; 5134 } 5135 5136 bdev_io->u.abort.bio_to_abort = bio_to_abort; 5137 5138 /* Submit the abort request to the underlying bdev module. */ 5139 bdev_io_submit(bdev_io); 5140 5141 return 0; 5142 } 5143 5144 static uint32_t 5145 _bdev_abort(struct spdk_bdev_io *parent_io) 5146 { 5147 struct spdk_bdev_desc *desc = parent_io->internal.desc; 5148 struct spdk_bdev_channel *channel = parent_io->internal.ch; 5149 void *bio_cb_arg; 5150 struct spdk_bdev_io *bio_to_abort; 5151 uint32_t matched_ios; 5152 int rc; 5153 5154 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 5155 5156 /* matched_ios is returned and will be kept by the caller. 5157 * 5158 * This function is used for two cases: 1) the same cb_arg is used for 5159 * multiple I/Os, 2) a single large I/O is split into smaller ones. 5160 * Incrementing split_outstanding directly here may confuse readers, especially 5161 * for the 1st case. 5162 * 5163 * Completion of I/O abort is processed after stack unwinding. Hence this trick 5164 * works as expected. 5165 */ 5166 matched_ios = 0; 5167 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5168 5169 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 5170 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 5171 continue; 5172 } 5173 5174 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 5175 /* Any I/O which was submitted after this abort command should be excluded. 
*/ 5176 continue; 5177 } 5178 5179 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 5180 if (rc != 0) { 5181 if (rc == -ENOMEM) { 5182 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 5183 } else { 5184 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5185 } 5186 break; 5187 } 5188 matched_ios++; 5189 } 5190 5191 return matched_ios; 5192 } 5193 5194 static void 5195 bdev_abort_retry(void *ctx) 5196 { 5197 struct spdk_bdev_io *parent_io = ctx; 5198 uint32_t matched_ios; 5199 5200 matched_ios = _bdev_abort(parent_io); 5201 5202 if (matched_ios == 0) { 5203 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5204 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 5205 } else { 5206 /* For retry, the case that no target I/O was found is success 5207 * because it means target I/Os completed in the meantime. 5208 */ 5209 bdev_io_complete(parent_io); 5210 } 5211 return; 5212 } 5213 5214 /* Use split_outstanding to manage the progress of aborting I/Os. */ 5215 parent_io->u.bdev.split_outstanding = matched_ios; 5216 } 5217 5218 static void 5219 bdev_abort(struct spdk_bdev_io *parent_io) 5220 { 5221 uint32_t matched_ios; 5222 5223 matched_ios = _bdev_abort(parent_io); 5224 5225 if (matched_ios == 0) { 5226 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5227 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 5228 } else { 5229 /* The case where no target I/O was found is a failure. */ 5230 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5231 bdev_io_complete(parent_io); 5232 } 5233 return; 5234 } 5235 5236 /* Use split_outstanding to manage the progress of aborting I/Os. */ 5237 parent_io->u.bdev.split_outstanding = matched_ios; 5238 } 5239 5240 int 5241 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5242 void *bio_cb_arg, 5243 spdk_bdev_io_completion_cb cb, void *cb_arg) 5244 { 5245 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5246 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5247 struct spdk_bdev_io *bdev_io; 5248 5249 if (bio_cb_arg == NULL) { 5250 return -EINVAL; 5251 } 5252 5253 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 5254 return -ENOTSUP; 5255 } 5256 5257 bdev_io = bdev_channel_get_io(channel); 5258 if (bdev_io == NULL) { 5259 return -ENOMEM; 5260 } 5261 5262 bdev_io->internal.ch = channel; 5263 bdev_io->internal.desc = desc; 5264 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5265 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 5266 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5267 5268 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 5269 5270 /* Parent abort request is not submitted directly, but to manage its execution, 5271 * add it to the submitted list here. 
5272 */ 5273 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5274 5275 bdev_abort(bdev_io); 5276 5277 return 0; 5278 } 5279 5280 int 5281 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5282 struct spdk_bdev_io_wait_entry *entry) 5283 { 5284 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5285 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 5286 5287 if (bdev != entry->bdev) { 5288 SPDK_ERRLOG("bdevs do not match\n"); 5289 return -EINVAL; 5290 } 5291 5292 if (mgmt_ch->per_thread_cache_count > 0) { 5293 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 5294 return -EINVAL; 5295 } 5296 5297 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 5298 return 0; 5299 } 5300 5301 static void 5302 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 5303 { 5304 struct spdk_bdev *bdev = bdev_ch->bdev; 5305 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 5306 struct spdk_bdev_io *bdev_io; 5307 5308 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 5309 /* 5310 * Allow some more I/O to complete before retrying the nomem_io queue. 5311 * Some drivers (such as nvme) cannot immediately take a new I/O in 5312 * the context of a completion, because the resources for the I/O are 5313 * not released until control returns to the bdev poller. Also, we 5314 * may require several small I/O to complete before a larger I/O 5315 * (that requires splitting) can be submitted. 5316 */ 5317 return; 5318 } 5319 5320 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 5321 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 5322 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 5323 bdev_io->internal.ch->io_outstanding++; 5324 shared_resource->io_outstanding++; 5325 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5326 bdev_io->internal.error.nvme.cdw0 = 0; 5327 bdev_io->num_retries++; 5328 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 5329 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5330 break; 5331 } 5332 } 5333 } 5334 5335 static inline void 5336 bdev_io_complete(void *ctx) 5337 { 5338 struct spdk_bdev_io *bdev_io = ctx; 5339 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5340 uint64_t tsc, tsc_diff; 5341 5342 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 5343 /* 5344 * Send the completion to the thread that originally submitted the I/O, 5345 * which may not be the current thread in the case of QoS. 5346 */ 5347 if (bdev_io->internal.io_submit_ch) { 5348 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 5349 bdev_io->internal.io_submit_ch = NULL; 5350 } 5351 5352 /* 5353 * Defer completion to avoid potential infinite recursion if the 5354 * user's completion callback issues a new I/O. 
5355 */ 5356 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 5357 bdev_io_complete, bdev_io); 5358 return; 5359 } 5360 5361 tsc = spdk_get_ticks(); 5362 tsc_diff = tsc - bdev_io->internal.submit_tsc; 5363 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 5364 bdev_io->internal.caller_ctx); 5365 5366 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 5367 5368 if (bdev_io->internal.ch->histogram) { 5369 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 5370 } 5371 5372 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5373 switch (bdev_io->type) { 5374 case SPDK_BDEV_IO_TYPE_READ: 5375 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5376 bdev_io->internal.ch->stat.num_read_ops++; 5377 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5378 break; 5379 case SPDK_BDEV_IO_TYPE_WRITE: 5380 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5381 bdev_io->internal.ch->stat.num_write_ops++; 5382 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5383 break; 5384 case SPDK_BDEV_IO_TYPE_UNMAP: 5385 bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5386 bdev_io->internal.ch->stat.num_unmap_ops++; 5387 bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; 5388 break; 5389 case SPDK_BDEV_IO_TYPE_ZCOPY: 5390 /* Track the data in the start phase only */ 5391 if (bdev_io->u.bdev.zcopy.start) { 5392 if (bdev_io->u.bdev.zcopy.populate) { 5393 bdev_io->internal.ch->stat.bytes_read += 5394 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5395 bdev_io->internal.ch->stat.num_read_ops++; 5396 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5397 } else { 5398 bdev_io->internal.ch->stat.bytes_written += 5399 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5400 bdev_io->internal.ch->stat.num_write_ops++; 5401 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5402 } 5403 } 5404 break; 5405 default: 5406 break; 5407 } 5408 } 5409 5410 #ifdef SPDK_CONFIG_VTUNE 5411 uint64_t now_tsc = spdk_get_ticks(); 5412 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 5413 uint64_t data[5]; 5414 5415 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 5416 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 5417 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 5418 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 5419 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
5420 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 5421 5422 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 5423 __itt_metadata_u64, 5, data); 5424 5425 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 5426 bdev_io->internal.ch->start_tsc = now_tsc; 5427 } 5428 #endif 5429 5430 assert(bdev_io->internal.cb != NULL); 5431 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 5432 5433 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 5434 bdev_io->internal.caller_ctx); 5435 } 5436 5437 static void 5438 bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 5439 { 5440 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 5441 5442 if (bdev_io->u.reset.ch_ref != NULL) { 5443 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 5444 bdev_io->u.reset.ch_ref = NULL; 5445 } 5446 5447 bdev_io_complete(bdev_io); 5448 } 5449 5450 static void 5451 bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 5452 { 5453 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 5454 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5455 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 5456 struct spdk_bdev_io *queued_reset; 5457 5458 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 5459 while (!TAILQ_EMPTY(&ch->queued_resets)) { 5460 queued_reset = TAILQ_FIRST(&ch->queued_resets); 5461 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 5462 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 5463 } 5464 5465 spdk_for_each_channel_continue(i, 0); 5466 } 5467 5468 void 5469 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 5470 { 5471 struct spdk_bdev *bdev = bdev_io->bdev; 5472 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5473 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 5474 5475 bdev_io->internal.status = status; 5476 5477 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 5478 bool unlock_channels = false; 5479 5480 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 5481 SPDK_ERRLOG("NOMEM returned for reset\n"); 5482 } 5483 pthread_mutex_lock(&bdev->internal.mutex); 5484 if (bdev_io == bdev->internal.reset_in_progress) { 5485 bdev->internal.reset_in_progress = NULL; 5486 unlock_channels = true; 5487 } 5488 pthread_mutex_unlock(&bdev->internal.mutex); 5489 5490 if (unlock_channels) { 5491 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel, 5492 bdev_io, bdev_reset_complete); 5493 return; 5494 } 5495 } else { 5496 _bdev_io_unset_bounce_buf(bdev_io); 5497 5498 assert(bdev_ch->io_outstanding > 0); 5499 assert(shared_resource->io_outstanding > 0); 5500 bdev_ch->io_outstanding--; 5501 shared_resource->io_outstanding--; 5502 5503 if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { 5504 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 5505 /* 5506 * Wait for some of the outstanding I/O to complete before we 5507 * retry any of the nomem_io. Normally we will wait for 5508 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 5509 * depth channels we will instead wait for half to complete. 
5510 */ 5511 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 5512 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 5513 return; 5514 } 5515 5516 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 5517 bdev_ch_retry_io(bdev_ch); 5518 } 5519 } 5520 5521 bdev_io_complete(bdev_io); 5522 } 5523 5524 void 5525 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 5526 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 5527 { 5528 if (sc == SPDK_SCSI_STATUS_GOOD) { 5529 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5530 } else { 5531 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 5532 bdev_io->internal.error.scsi.sc = sc; 5533 bdev_io->internal.error.scsi.sk = sk; 5534 bdev_io->internal.error.scsi.asc = asc; 5535 bdev_io->internal.error.scsi.ascq = ascq; 5536 } 5537 5538 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5539 } 5540 5541 void 5542 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 5543 int *sc, int *sk, int *asc, int *ascq) 5544 { 5545 assert(sc != NULL); 5546 assert(sk != NULL); 5547 assert(asc != NULL); 5548 assert(ascq != NULL); 5549 5550 switch (bdev_io->internal.status) { 5551 case SPDK_BDEV_IO_STATUS_SUCCESS: 5552 *sc = SPDK_SCSI_STATUS_GOOD; 5553 *sk = SPDK_SCSI_SENSE_NO_SENSE; 5554 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5555 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5556 break; 5557 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 5558 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 5559 break; 5560 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 5561 *sc = bdev_io->internal.error.scsi.sc; 5562 *sk = bdev_io->internal.error.scsi.sk; 5563 *asc = bdev_io->internal.error.scsi.asc; 5564 *ascq = bdev_io->internal.error.scsi.ascq; 5565 break; 5566 default: 5567 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 5568 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 5569 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5570 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5571 break; 5572 } 5573 } 5574 5575 void 5576 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 5577 { 5578 if (aio_result == 0) { 5579 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5580 } else { 5581 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 5582 } 5583 5584 bdev_io->internal.error.aio_result = aio_result; 5585 5586 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5587 } 5588 5589 void 5590 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 5591 { 5592 assert(aio_result != NULL); 5593 5594 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 5595 *aio_result = bdev_io->internal.error.aio_result; 5596 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5597 *aio_result = 0; 5598 } else { 5599 *aio_result = -EIO; 5600 } 5601 } 5602 5603 void 5604 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 5605 { 5606 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 5607 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5608 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 5609 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 5610 } else { 5611 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 5612 } 5613 5614 bdev_io->internal.error.nvme.cdw0 = cdw0; 5615 bdev_io->internal.error.nvme.sct = sct; 5616 bdev_io->internal.error.nvme.sc = sc; 5617 5618 spdk_bdev_io_complete(bdev_io, 
bdev_io->internal.status); 5619 } 5620 5621 void 5622 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 5623 { 5624 assert(sct != NULL); 5625 assert(sc != NULL); 5626 assert(cdw0 != NULL); 5627 5628 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 5629 *sct = SPDK_NVME_SCT_GENERIC; 5630 *sc = SPDK_NVME_SC_SUCCESS; 5631 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5632 *cdw0 = 0; 5633 } else { 5634 *cdw0 = 1U; 5635 } 5636 return; 5637 } 5638 5639 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 5640 *sct = bdev_io->internal.error.nvme.sct; 5641 *sc = bdev_io->internal.error.nvme.sc; 5642 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5643 *sct = SPDK_NVME_SCT_GENERIC; 5644 *sc = SPDK_NVME_SC_SUCCESS; 5645 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 5646 *sct = SPDK_NVME_SCT_GENERIC; 5647 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 5648 } else { 5649 *sct = SPDK_NVME_SCT_GENERIC; 5650 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5651 } 5652 5653 *cdw0 = bdev_io->internal.error.nvme.cdw0; 5654 } 5655 5656 void 5657 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 5658 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 5659 { 5660 assert(first_sct != NULL); 5661 assert(first_sc != NULL); 5662 assert(second_sct != NULL); 5663 assert(second_sc != NULL); 5664 assert(cdw0 != NULL); 5665 5666 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 5667 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 5668 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 5669 *first_sct = bdev_io->internal.error.nvme.sct; 5670 *first_sc = bdev_io->internal.error.nvme.sc; 5671 *second_sct = SPDK_NVME_SCT_GENERIC; 5672 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5673 } else { 5674 *first_sct = SPDK_NVME_SCT_GENERIC; 5675 *first_sc = SPDK_NVME_SC_SUCCESS; 5676 *second_sct = bdev_io->internal.error.nvme.sct; 5677 *second_sc = bdev_io->internal.error.nvme.sc; 5678 } 5679 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5680 *first_sct = SPDK_NVME_SCT_GENERIC; 5681 *first_sc = SPDK_NVME_SC_SUCCESS; 5682 *second_sct = SPDK_NVME_SCT_GENERIC; 5683 *second_sc = SPDK_NVME_SC_SUCCESS; 5684 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 5685 *first_sct = SPDK_NVME_SCT_GENERIC; 5686 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5687 *second_sct = SPDK_NVME_SCT_GENERIC; 5688 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5689 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 5690 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 5691 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 5692 *second_sct = SPDK_NVME_SCT_GENERIC; 5693 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5694 } else { 5695 *first_sct = SPDK_NVME_SCT_GENERIC; 5696 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5697 *second_sct = SPDK_NVME_SCT_GENERIC; 5698 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5699 } 5700 5701 *cdw0 = bdev_io->internal.error.nvme.cdw0; 5702 } 5703 5704 struct spdk_thread * 5705 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 5706 { 5707 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 5708 } 5709 5710 struct spdk_io_channel * 5711 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 5712 { 5713 return bdev_io->internal.ch->channel; 5714 } 5715 5716 static int 5717 bdev_register(struct spdk_bdev *bdev) 
5718 { 5719 char *bdev_name; 5720 char uuid[SPDK_UUID_STRING_LEN]; 5721 int ret; 5722 5723 assert(bdev->module != NULL); 5724 5725 if (!bdev->name) { 5726 SPDK_ERRLOG("Bdev name is NULL\n"); 5727 return -EINVAL; 5728 } 5729 5730 if (!strlen(bdev->name)) { 5731 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 5732 return -EINVAL; 5733 } 5734 5735 /* Users often register their own I/O devices using the bdev name. In 5736 * order to avoid conflicts, prepend bdev_. */ 5737 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 5738 if (!bdev_name) { 5739 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 5740 return -ENOMEM; 5741 } 5742 5743 bdev->internal.status = SPDK_BDEV_STATUS_READY; 5744 bdev->internal.measured_queue_depth = UINT64_MAX; 5745 bdev->internal.claim_module = NULL; 5746 bdev->internal.qd_poller = NULL; 5747 bdev->internal.qos = NULL; 5748 5749 TAILQ_INIT(&bdev->internal.open_descs); 5750 TAILQ_INIT(&bdev->internal.locked_ranges); 5751 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 5752 TAILQ_INIT(&bdev->aliases); 5753 5754 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 5755 if (ret != 0) { 5756 free(bdev_name); 5757 return ret; 5758 } 5759 5760 /* If the user didn't specify a uuid, generate one. */ 5761 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 5762 spdk_uuid_generate(&bdev->uuid); 5763 } 5764 5765 /* Add the UUID alias only if it's different than the name */ 5766 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 5767 if (strcmp(bdev->name, uuid) != 0) { 5768 ret = spdk_bdev_alias_add(bdev, uuid); 5769 if (ret != 0) { 5770 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 5771 bdev_name_del(&bdev->internal.bdev_name); 5772 free(bdev_name); 5773 return ret; 5774 } 5775 } 5776 5777 if (spdk_bdev_get_buf_align(bdev) > 1) { 5778 if (bdev->split_on_optimal_io_boundary) { 5779 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 5780 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 5781 } else { 5782 bdev->split_on_optimal_io_boundary = true; 5783 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 5784 } 5785 } 5786 5787 /* If the user didn't specify a write unit size, set it to one. 
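 * As an illustrative sketch (hypothetical module code, not part of this file),
 * a backend with no write-unit or atomicity requirements can simply leave the
 * related fields zeroed before registering and rely on the defaults applied
 * below:
 *
 *     static struct spdk_bdev g_example_bdev = {
 *             .name = "example0",             // hypothetical bdev name
 *             .blocklen = 512,
 *             .blockcnt = 2048,
 *             .module = &example_if,          // hypothetical module/fn_table
 *             .fn_table = &example_fn_table,
 *             // write_unit_size, acwu and phys_blocklen intentionally left 0;
 *             // bdev_register() fills in 1, 1 and the data block size.
 *     };
 *
 *     spdk_bdev_register(&g_example_bdev);
 *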
*/ 5788 if (bdev->write_unit_size == 0) { 5789 bdev->write_unit_size = 1; 5790 } 5791 5792 /* Set ACWU value to 1 if bdev module did not set it (does not support it natively) */ 5793 if (bdev->acwu == 0) { 5794 bdev->acwu = 1; 5795 } 5796 5797 if (bdev->phys_blocklen == 0) { 5798 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 5799 } 5800 5801 bdev->internal.reset_in_progress = NULL; 5802 5803 spdk_io_device_register(__bdev_to_io_dev(bdev), 5804 bdev_channel_create, bdev_channel_destroy, 5805 sizeof(struct spdk_bdev_channel), 5806 bdev_name); 5807 5808 free(bdev_name); 5809 5810 pthread_mutex_init(&bdev->internal.mutex, NULL); 5811 5812 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 5813 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 5814 5815 return 0; 5816 } 5817 5818 static void 5819 bdev_destroy_cb(void *io_device) 5820 { 5821 int rc; 5822 struct spdk_bdev *bdev; 5823 spdk_bdev_unregister_cb cb_fn; 5824 void *cb_arg; 5825 5826 bdev = __bdev_from_io_dev(io_device); 5827 cb_fn = bdev->internal.unregister_cb; 5828 cb_arg = bdev->internal.unregister_ctx; 5829 5830 pthread_mutex_destroy(&bdev->internal.mutex); 5831 free(bdev->internal.qos); 5832 5833 rc = bdev->fn_table->destruct(bdev->ctxt); 5834 if (rc < 0) { 5835 SPDK_ERRLOG("destruct failed\n"); 5836 } 5837 if (rc <= 0 && cb_fn != NULL) { 5838 cb_fn(cb_arg, rc); 5839 } 5840 } 5841 5842 static void 5843 bdev_register_finished(void *arg) 5844 { 5845 struct spdk_bdev *bdev = arg; 5846 5847 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 5848 } 5849 5850 int 5851 spdk_bdev_register(struct spdk_bdev *bdev) 5852 { 5853 int rc = bdev_register(bdev); 5854 5855 if (rc == 0) { 5856 /* Examine configuration before initializing I/O */ 5857 bdev_examine(bdev); 5858 5859 spdk_bdev_wait_for_examine(bdev_register_finished, bdev); 5860 } 5861 5862 return rc; 5863 } 5864 5865 void 5866 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 5867 { 5868 if (bdev->internal.unregister_cb != NULL) { 5869 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 5870 } 5871 } 5872 5873 static void 5874 _remove_notify(void *arg) 5875 { 5876 struct spdk_bdev_desc *desc = arg; 5877 5878 pthread_mutex_lock(&desc->mutex); 5879 desc->refs--; 5880 5881 if (!desc->closed) { 5882 pthread_mutex_unlock(&desc->mutex); 5883 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 5884 return; 5885 } else if (0 == desc->refs) { 5886 /* This descriptor was closed after this remove_notify message was sent. 5887 * spdk_bdev_close() could not free the descriptor since this message was 5888 * in flight, so we free it now using bdev_desc_free(). 5889 */ 5890 pthread_mutex_unlock(&desc->mutex); 5891 bdev_desc_free(desc); 5892 return; 5893 } 5894 pthread_mutex_unlock(&desc->mutex); 5895 } 5896 5897 /* Must be called while holding g_bdev_mgr.mutex and bdev->internal.mutex. 5898 * returns: 0 - bdev removed and ready to be destructed. 5899 * -EBUSY - bdev can't be destructed yet. */ 5900 static int 5901 bdev_unregister_unsafe(struct spdk_bdev *bdev) 5902 { 5903 struct spdk_bdev_desc *desc, *tmp; 5904 int rc = 0; 5905 char uuid[SPDK_UUID_STRING_LEN]; 5906 5907 /* Notify each descriptor about hotremoval */ 5908 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 5909 rc = -EBUSY; 5910 pthread_mutex_lock(&desc->mutex); 5911 /* 5912 * Defer invocation of the event_cb to a separate message that will 5913 * run later on its thread. 
This ensures this context unwinds and 5914 * we don't recursively unregister this bdev again if the event_cb 5915 * immediately closes its descriptor. 5916 */ 5917 desc->refs++; 5918 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 5919 pthread_mutex_unlock(&desc->mutex); 5920 } 5921 5922 /* If there are no descriptors, proceed removing the bdev */ 5923 if (rc == 0) { 5924 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 5925 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 5926 5927 /* Delete the name and the UUID alias */ 5928 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 5929 bdev_name_del_unsafe(&bdev->internal.bdev_name); 5930 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 5931 5932 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 5933 } 5934 5935 return rc; 5936 } 5937 5938 void 5939 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 5940 { 5941 struct spdk_thread *thread; 5942 int rc; 5943 5944 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 5945 5946 thread = spdk_get_thread(); 5947 if (!thread) { 5948 /* The user called this from a non-SPDK thread. */ 5949 if (cb_fn != NULL) { 5950 cb_fn(cb_arg, -ENOTSUP); 5951 } 5952 return; 5953 } 5954 5955 pthread_mutex_lock(&g_bdev_mgr.mutex); 5956 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 5957 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5958 if (cb_fn) { 5959 cb_fn(cb_arg, -EBUSY); 5960 } 5961 return; 5962 } 5963 5964 pthread_mutex_lock(&bdev->internal.mutex); 5965 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 5966 bdev->internal.unregister_cb = cb_fn; 5967 bdev->internal.unregister_ctx = cb_arg; 5968 5969 /* Call under lock. */ 5970 rc = bdev_unregister_unsafe(bdev); 5971 pthread_mutex_unlock(&bdev->internal.mutex); 5972 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5973 5974 if (rc == 0) { 5975 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 5976 } 5977 } 5978 5979 static int 5980 bdev_start_qos(struct spdk_bdev *bdev) 5981 { 5982 struct set_qos_limit_ctx *ctx; 5983 5984 /* Enable QoS */ 5985 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 5986 ctx = calloc(1, sizeof(*ctx)); 5987 if (ctx == NULL) { 5988 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 5989 return -ENOMEM; 5990 } 5991 ctx->bdev = bdev; 5992 spdk_for_each_channel(__bdev_to_io_dev(bdev), 5993 bdev_enable_qos_msg, ctx, 5994 bdev_enable_qos_done); 5995 } 5996 5997 return 0; 5998 } 5999 6000 static int 6001 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 6002 { 6003 struct spdk_thread *thread; 6004 int rc = 0; 6005 6006 thread = spdk_get_thread(); 6007 if (!thread) { 6008 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 6009 return -ENOTSUP; 6010 } 6011 6012 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6013 spdk_get_thread()); 6014 6015 desc->bdev = bdev; 6016 desc->thread = thread; 6017 desc->write = write; 6018 6019 pthread_mutex_lock(&bdev->internal.mutex); 6020 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6021 pthread_mutex_unlock(&bdev->internal.mutex); 6022 return -ENODEV; 6023 } 6024 6025 if (write && bdev->internal.claim_module) { 6026 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 6027 bdev->name, bdev->internal.claim_module->name); 6028 pthread_mutex_unlock(&bdev->internal.mutex); 6029 return -EPERM; 6030 } 6031 6032 rc = bdev_start_qos(bdev); 6033 if (rc != 0) { 6034 SPDK_ERRLOG("Failed 
to start QoS on bdev %s\n", bdev->name); 6035 pthread_mutex_unlock(&bdev->internal.mutex); 6036 return rc; 6037 } 6038 6039 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 6040 6041 pthread_mutex_unlock(&bdev->internal.mutex); 6042 6043 return 0; 6044 } 6045 6046 int 6047 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 6048 void *event_ctx, struct spdk_bdev_desc **_desc) 6049 { 6050 struct spdk_bdev_desc *desc; 6051 struct spdk_bdev *bdev; 6052 unsigned int event_id; 6053 int rc; 6054 6055 if (event_cb == NULL) { 6056 SPDK_ERRLOG("Missing event callback function\n"); 6057 return -EINVAL; 6058 } 6059 6060 pthread_mutex_lock(&g_bdev_mgr.mutex); 6061 6062 bdev = bdev_get_by_name(bdev_name); 6063 6064 if (bdev == NULL) { 6065 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 6066 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6067 return -ENODEV; 6068 } 6069 6070 desc = calloc(1, sizeof(*desc)); 6071 if (desc == NULL) { 6072 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 6073 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6074 return -ENOMEM; 6075 } 6076 6077 TAILQ_INIT(&desc->pending_media_events); 6078 TAILQ_INIT(&desc->free_media_events); 6079 6080 desc->callback.event_fn = event_cb; 6081 desc->callback.ctx = event_ctx; 6082 pthread_mutex_init(&desc->mutex, NULL); 6083 6084 if (bdev->media_events) { 6085 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 6086 sizeof(*desc->media_events_buffer)); 6087 if (desc->media_events_buffer == NULL) { 6088 SPDK_ERRLOG("Failed to initialize media event pool\n"); 6089 bdev_desc_free(desc); 6090 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6091 return -ENOMEM; 6092 } 6093 6094 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 6095 TAILQ_INSERT_TAIL(&desc->free_media_events, 6096 &desc->media_events_buffer[event_id], tailq); 6097 } 6098 } 6099 6100 rc = bdev_open(bdev, write, desc); 6101 if (rc != 0) { 6102 bdev_desc_free(desc); 6103 desc = NULL; 6104 } 6105 6106 *_desc = desc; 6107 6108 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6109 6110 return rc; 6111 } 6112 6113 void 6114 spdk_bdev_close(struct spdk_bdev_desc *desc) 6115 { 6116 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6117 int rc; 6118 6119 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6120 spdk_get_thread()); 6121 6122 assert(desc->thread == spdk_get_thread()); 6123 6124 spdk_poller_unregister(&desc->io_timeout_poller); 6125 6126 pthread_mutex_lock(&g_bdev_mgr.mutex); 6127 pthread_mutex_lock(&bdev->internal.mutex); 6128 pthread_mutex_lock(&desc->mutex); 6129 6130 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 6131 6132 desc->closed = true; 6133 6134 if (0 == desc->refs) { 6135 pthread_mutex_unlock(&desc->mutex); 6136 bdev_desc_free(desc); 6137 } else { 6138 pthread_mutex_unlock(&desc->mutex); 6139 } 6140 6141 /* If no more descriptors, kill QoS channel */ 6142 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6143 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 6144 bdev->name, spdk_get_thread()); 6145 6146 if (bdev_qos_destroy(bdev)) { 6147 /* There isn't anything we can do to recover here. Just let the 6148 * old QoS poller keep running. The QoS handling won't change 6149 * cores when the user allocates a new channel, but it won't break. */ 6150 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 6151 } 6152 } 6153 6154 spdk_bdev_set_qd_sampling_period(bdev, 0); 6155 6156 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6157 rc = bdev_unregister_unsafe(bdev); 6158 pthread_mutex_unlock(&bdev->internal.mutex); 6159 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6160 6161 if (rc == 0) { 6162 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6163 } 6164 } else { 6165 pthread_mutex_unlock(&bdev->internal.mutex); 6166 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6167 } 6168 } 6169 6170 int 6171 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 6172 struct spdk_bdev_module *module) 6173 { 6174 if (bdev->internal.claim_module != NULL) { 6175 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 6176 bdev->internal.claim_module->name); 6177 return -EPERM; 6178 } 6179 6180 if (desc && !desc->write) { 6181 desc->write = true; 6182 } 6183 6184 bdev->internal.claim_module = module; 6185 return 0; 6186 } 6187 6188 void 6189 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 6190 { 6191 assert(bdev->internal.claim_module != NULL); 6192 bdev->internal.claim_module = NULL; 6193 } 6194 6195 struct spdk_bdev * 6196 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 6197 { 6198 assert(desc != NULL); 6199 return desc->bdev; 6200 } 6201 6202 void 6203 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 6204 { 6205 struct iovec *iovs; 6206 int iovcnt; 6207 6208 if (bdev_io == NULL) { 6209 return; 6210 } 6211 6212 switch (bdev_io->type) { 6213 case SPDK_BDEV_IO_TYPE_READ: 6214 case SPDK_BDEV_IO_TYPE_WRITE: 6215 case SPDK_BDEV_IO_TYPE_ZCOPY: 6216 iovs = bdev_io->u.bdev.iovs; 6217 iovcnt = bdev_io->u.bdev.iovcnt; 6218 break; 6219 default: 6220 iovs = NULL; 6221 iovcnt = 0; 6222 break; 6223 } 6224 6225 if (iovp) { 6226 *iovp = iovs; 6227 } 6228 if (iovcntp) { 6229 *iovcntp = iovcnt; 6230 } 6231 } 6232 6233 void * 6234 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 6235 { 6236 if (bdev_io == NULL) { 6237 return NULL; 6238 } 6239 6240 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 6241 return NULL; 6242 } 6243 6244 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 6245 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 6246 return bdev_io->u.bdev.md_buf; 6247 } 6248 6249 return NULL; 6250 } 6251 6252 void * 6253 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 6254 { 6255 if (bdev_io == NULL) { 6256 assert(false); 6257 return NULL; 6258 } 6259 6260 return bdev_io->internal.caller_ctx; 6261 } 6262 6263 void 6264 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 6265 { 6266 6267 if (spdk_bdev_module_list_find(bdev_module->name)) { 6268 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 6269 assert(false); 6270 } 6271 6272 /* 6273 * Modules with examine callbacks must be initialized first, so they are 6274 * ready to handle examine callbacks from later modules that will 6275 * register physical bdevs. 
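 * For reference, modules typically reach this function through the
 * SPDK_BDEV_MODULE_REGISTER() constructor macro; a minimal sketch with
 * hypothetical names (not code from this file):
 *
 *     static struct spdk_bdev_module example_if = {
 *             .name = "example",
 *             .module_init = example_init,        // hypothetical callbacks
 *             .module_fini = example_fini,
 *             .examine_config = example_examine,  // triggers head insertion below
 *     };
 *     SPDK_BDEV_MODULE_REGISTER(example, &example_if)
 *
 * Because examine_config is set, such a module is inserted at the head of
 * g_bdev_mgr.bdev_modules by the code below.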
6276 */ 6277 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 6278 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 6279 } else { 6280 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 6281 } 6282 } 6283 6284 struct spdk_bdev_module * 6285 spdk_bdev_module_list_find(const char *name) 6286 { 6287 struct spdk_bdev_module *bdev_module; 6288 6289 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 6290 if (strcmp(name, bdev_module->name) == 0) { 6291 break; 6292 } 6293 } 6294 6295 return bdev_module; 6296 } 6297 6298 static void 6299 bdev_write_zero_buffer_next(void *_bdev_io) 6300 { 6301 struct spdk_bdev_io *bdev_io = _bdev_io; 6302 uint64_t num_bytes, num_blocks; 6303 void *md_buf = NULL; 6304 int rc; 6305 6306 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 6307 bdev_io->u.bdev.split_remaining_num_blocks, 6308 ZERO_BUFFER_SIZE); 6309 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 6310 6311 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 6312 md_buf = (char *)g_bdev_mgr.zero_buffer + 6313 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 6314 } 6315 6316 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 6317 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6318 g_bdev_mgr.zero_buffer, md_buf, 6319 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 6320 bdev_write_zero_buffer_done, bdev_io); 6321 if (rc == 0) { 6322 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 6323 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 6324 } else if (rc == -ENOMEM) { 6325 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 6326 } else { 6327 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6328 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6329 } 6330 } 6331 6332 static void 6333 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6334 { 6335 struct spdk_bdev_io *parent_io = cb_arg; 6336 6337 spdk_bdev_free_io(bdev_io); 6338 6339 if (!success) { 6340 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6341 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6342 return; 6343 } 6344 6345 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 6346 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6347 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 6348 return; 6349 } 6350 6351 bdev_write_zero_buffer_next(parent_io); 6352 } 6353 6354 static void 6355 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 6356 { 6357 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6358 ctx->bdev->internal.qos_mod_in_progress = false; 6359 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6360 6361 if (ctx->cb_fn) { 6362 ctx->cb_fn(ctx->cb_arg, status); 6363 } 6364 free(ctx); 6365 } 6366 6367 static void 6368 bdev_disable_qos_done(void *cb_arg) 6369 { 6370 struct set_qos_limit_ctx *ctx = cb_arg; 6371 struct spdk_bdev *bdev = ctx->bdev; 6372 struct spdk_bdev_io *bdev_io; 6373 struct spdk_bdev_qos *qos; 6374 6375 pthread_mutex_lock(&bdev->internal.mutex); 6376 qos = bdev->internal.qos; 6377 bdev->internal.qos = NULL; 6378 pthread_mutex_unlock(&bdev->internal.mutex); 6379 6380 while (!TAILQ_EMPTY(&qos->queued)) { 6381 /* Send queued I/O back to their original thread for resubmission. 
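 * Each queued bdev_io was redirected to the QoS channel when it was first
 * submitted, so internal.io_submit_ch, when set, still points at the
 * submitting channel. The loop below restores that channel and sends the I/O
 * back to its original thread, where it is resubmitted now that QoS is being
 * torn down.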
*/ 6382 bdev_io = TAILQ_FIRST(&qos->queued); 6383 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 6384 6385 if (bdev_io->internal.io_submit_ch) { 6386 /* 6387 * Channel was changed when sending it to the QoS thread - change it back 6388 * before sending it back to the original thread. 6389 */ 6390 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 6391 bdev_io->internal.io_submit_ch = NULL; 6392 } 6393 6394 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6395 _bdev_io_submit, bdev_io); 6396 } 6397 6398 if (qos->thread != NULL) { 6399 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 6400 spdk_poller_unregister(&qos->poller); 6401 } 6402 6403 free(qos); 6404 6405 bdev_set_qos_limit_done(ctx, 0); 6406 } 6407 6408 static void 6409 bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 6410 { 6411 void *io_device = spdk_io_channel_iter_get_io_device(i); 6412 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6413 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6414 struct spdk_thread *thread; 6415 6416 pthread_mutex_lock(&bdev->internal.mutex); 6417 thread = bdev->internal.qos->thread; 6418 pthread_mutex_unlock(&bdev->internal.mutex); 6419 6420 if (thread != NULL) { 6421 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 6422 } else { 6423 bdev_disable_qos_done(ctx); 6424 } 6425 } 6426 6427 static void 6428 bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 6429 { 6430 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6431 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6432 6433 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 6434 6435 spdk_for_each_channel_continue(i, 0); 6436 } 6437 6438 static void 6439 bdev_update_qos_rate_limit_msg(void *cb_arg) 6440 { 6441 struct set_qos_limit_ctx *ctx = cb_arg; 6442 struct spdk_bdev *bdev = ctx->bdev; 6443 6444 pthread_mutex_lock(&bdev->internal.mutex); 6445 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 6446 pthread_mutex_unlock(&bdev->internal.mutex); 6447 6448 bdev_set_qos_limit_done(ctx, 0); 6449 } 6450 6451 static void 6452 bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 6453 { 6454 void *io_device = spdk_io_channel_iter_get_io_device(i); 6455 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6456 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6457 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6458 6459 pthread_mutex_lock(&bdev->internal.mutex); 6460 bdev_enable_qos(bdev, bdev_ch); 6461 pthread_mutex_unlock(&bdev->internal.mutex); 6462 spdk_for_each_channel_continue(i, 0); 6463 } 6464 6465 static void 6466 bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 6467 { 6468 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6469 6470 bdev_set_qos_limit_done(ctx, status); 6471 } 6472 6473 static void 6474 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 6475 { 6476 int i; 6477 6478 assert(bdev->internal.qos != NULL); 6479 6480 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6481 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6482 bdev->internal.qos->rate_limits[i].limit = limits[i]; 6483 6484 if (limits[i] == 0) { 6485 bdev->internal.qos->rate_limits[i].limit = 6486 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 6487 } 6488 } 6489 } 6490 } 6491 6492 void 6493 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 6494 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 6495 { 6496 struct set_qos_limit_ctx *ctx; 6497 uint32_t 
limit_set_complement; 6498 uint64_t min_limit_per_sec; 6499 int i; 6500 bool disable_rate_limit = true; 6501 6502 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6503 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6504 continue; 6505 } 6506 6507 if (limits[i] > 0) { 6508 disable_rate_limit = false; 6509 } 6510 6511 if (bdev_qos_is_iops_rate_limit(i) == true) { 6512 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 6513 } else { 6514 /* Change from megabyte to byte rate limit */ 6515 limits[i] = limits[i] * 1024 * 1024; 6516 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 6517 } 6518 6519 limit_set_complement = limits[i] % min_limit_per_sec; 6520 if (limit_set_complement) { 6521 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 6522 limits[i], min_limit_per_sec); 6523 limits[i] += min_limit_per_sec - limit_set_complement; 6524 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 6525 } 6526 } 6527 6528 ctx = calloc(1, sizeof(*ctx)); 6529 if (ctx == NULL) { 6530 cb_fn(cb_arg, -ENOMEM); 6531 return; 6532 } 6533 6534 ctx->cb_fn = cb_fn; 6535 ctx->cb_arg = cb_arg; 6536 ctx->bdev = bdev; 6537 6538 pthread_mutex_lock(&bdev->internal.mutex); 6539 if (bdev->internal.qos_mod_in_progress) { 6540 pthread_mutex_unlock(&bdev->internal.mutex); 6541 free(ctx); 6542 cb_fn(cb_arg, -EAGAIN); 6543 return; 6544 } 6545 bdev->internal.qos_mod_in_progress = true; 6546 6547 if (disable_rate_limit == true && bdev->internal.qos) { 6548 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6549 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 6550 (bdev->internal.qos->rate_limits[i].limit > 0 && 6551 bdev->internal.qos->rate_limits[i].limit != 6552 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 6553 disable_rate_limit = false; 6554 break; 6555 } 6556 } 6557 } 6558 6559 if (disable_rate_limit == false) { 6560 if (bdev->internal.qos == NULL) { 6561 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 6562 if (!bdev->internal.qos) { 6563 pthread_mutex_unlock(&bdev->internal.mutex); 6564 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 6565 bdev_set_qos_limit_done(ctx, -ENOMEM); 6566 return; 6567 } 6568 } 6569 6570 if (bdev->internal.qos->thread == NULL) { 6571 /* Enabling */ 6572 bdev_set_qos_rate_limits(bdev, limits); 6573 6574 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6575 bdev_enable_qos_msg, ctx, 6576 bdev_enable_qos_done); 6577 } else { 6578 /* Updating */ 6579 bdev_set_qos_rate_limits(bdev, limits); 6580 6581 spdk_thread_send_msg(bdev->internal.qos->thread, 6582 bdev_update_qos_rate_limit_msg, ctx); 6583 } 6584 } else { 6585 if (bdev->internal.qos != NULL) { 6586 bdev_set_qos_rate_limits(bdev, limits); 6587 6588 /* Disabling */ 6589 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6590 bdev_disable_qos_msg, ctx, 6591 bdev_disable_qos_msg_done); 6592 } else { 6593 pthread_mutex_unlock(&bdev->internal.mutex); 6594 bdev_set_qos_limit_done(ctx, 0); 6595 return; 6596 } 6597 } 6598 6599 pthread_mutex_unlock(&bdev->internal.mutex); 6600 } 6601 6602 struct spdk_bdev_histogram_ctx { 6603 spdk_bdev_histogram_status_cb cb_fn; 6604 void *cb_arg; 6605 struct spdk_bdev *bdev; 6606 int status; 6607 }; 6608 6609 static void 6610 bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status) 6611 { 6612 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6613 6614 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6615 ctx->bdev->internal.histogram_in_progress = false; 6616 
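	/* Release the bdev mutex before running the user's status callback below, so
	 * the callback may safely call back into the histogram API without
	 * deadlocking. */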
pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6617 ctx->cb_fn(ctx->cb_arg, ctx->status); 6618 free(ctx); 6619 } 6620 6621 static void 6622 bdev_histogram_disable_channel(struct spdk_io_channel_iter *i) 6623 { 6624 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6625 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6626 6627 if (ch->histogram != NULL) { 6628 spdk_histogram_data_free(ch->histogram); 6629 ch->histogram = NULL; 6630 } 6631 spdk_for_each_channel_continue(i, 0); 6632 } 6633 6634 static void 6635 bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status) 6636 { 6637 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6638 6639 if (status != 0) { 6640 ctx->status = status; 6641 ctx->bdev->internal.histogram_enabled = false; 6642 spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx, 6643 bdev_histogram_disable_channel_cb); 6644 } else { 6645 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6646 ctx->bdev->internal.histogram_in_progress = false; 6647 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6648 ctx->cb_fn(ctx->cb_arg, ctx->status); 6649 free(ctx); 6650 } 6651 } 6652 6653 static void 6654 bdev_histogram_enable_channel(struct spdk_io_channel_iter *i) 6655 { 6656 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6657 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6658 int status = 0; 6659 6660 if (ch->histogram == NULL) { 6661 ch->histogram = spdk_histogram_data_alloc(); 6662 if (ch->histogram == NULL) { 6663 status = -ENOMEM; 6664 } 6665 } 6666 6667 spdk_for_each_channel_continue(i, status); 6668 } 6669 6670 void 6671 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 6672 void *cb_arg, bool enable) 6673 { 6674 struct spdk_bdev_histogram_ctx *ctx; 6675 6676 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 6677 if (ctx == NULL) { 6678 cb_fn(cb_arg, -ENOMEM); 6679 return; 6680 } 6681 6682 ctx->bdev = bdev; 6683 ctx->status = 0; 6684 ctx->cb_fn = cb_fn; 6685 ctx->cb_arg = cb_arg; 6686 6687 pthread_mutex_lock(&bdev->internal.mutex); 6688 if (bdev->internal.histogram_in_progress) { 6689 pthread_mutex_unlock(&bdev->internal.mutex); 6690 free(ctx); 6691 cb_fn(cb_arg, -EAGAIN); 6692 return; 6693 } 6694 6695 bdev->internal.histogram_in_progress = true; 6696 pthread_mutex_unlock(&bdev->internal.mutex); 6697 6698 bdev->internal.histogram_enabled = enable; 6699 6700 if (enable) { 6701 /* Allocate histogram for each channel */ 6702 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx, 6703 bdev_histogram_enable_channel_cb); 6704 } else { 6705 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx, 6706 bdev_histogram_disable_channel_cb); 6707 } 6708 } 6709 6710 struct spdk_bdev_histogram_data_ctx { 6711 spdk_bdev_histogram_data_cb cb_fn; 6712 void *cb_arg; 6713 struct spdk_bdev *bdev; 6714 /** merged histogram data from all channels */ 6715 struct spdk_histogram_data *histogram; 6716 }; 6717 6718 static void 6719 bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status) 6720 { 6721 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6722 6723 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 6724 free(ctx); 6725 } 6726 6727 static void 6728 bdev_histogram_get_channel(struct spdk_io_channel_iter *i) 6729 { 6730 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6731 struct spdk_bdev_channel *ch = 
spdk_io_channel_get_ctx(_ch); 6732 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6733 int status = 0; 6734 6735 if (ch->histogram == NULL) { 6736 status = -EFAULT; 6737 } else { 6738 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 6739 } 6740 6741 spdk_for_each_channel_continue(i, status); 6742 } 6743 6744 void 6745 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 6746 spdk_bdev_histogram_data_cb cb_fn, 6747 void *cb_arg) 6748 { 6749 struct spdk_bdev_histogram_data_ctx *ctx; 6750 6751 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 6752 if (ctx == NULL) { 6753 cb_fn(cb_arg, -ENOMEM, NULL); 6754 return; 6755 } 6756 6757 ctx->bdev = bdev; 6758 ctx->cb_fn = cb_fn; 6759 ctx->cb_arg = cb_arg; 6760 6761 ctx->histogram = histogram; 6762 6763 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx, 6764 bdev_histogram_get_channel_cb); 6765 } 6766 6767 size_t 6768 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 6769 size_t max_events) 6770 { 6771 struct media_event_entry *entry; 6772 size_t num_events = 0; 6773 6774 for (; num_events < max_events; ++num_events) { 6775 entry = TAILQ_FIRST(&desc->pending_media_events); 6776 if (entry == NULL) { 6777 break; 6778 } 6779 6780 events[num_events] = entry->event; 6781 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 6782 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 6783 } 6784 6785 return num_events; 6786 } 6787 6788 int 6789 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 6790 size_t num_events) 6791 { 6792 struct spdk_bdev_desc *desc; 6793 struct media_event_entry *entry; 6794 size_t event_id; 6795 int rc = 0; 6796 6797 assert(bdev->media_events); 6798 6799 pthread_mutex_lock(&bdev->internal.mutex); 6800 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 6801 if (desc->write) { 6802 break; 6803 } 6804 } 6805 6806 if (desc == NULL || desc->media_events_buffer == NULL) { 6807 rc = -ENODEV; 6808 goto out; 6809 } 6810 6811 for (event_id = 0; event_id < num_events; ++event_id) { 6812 entry = TAILQ_FIRST(&desc->free_media_events); 6813 if (entry == NULL) { 6814 break; 6815 } 6816 6817 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 6818 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 6819 entry->event = events[event_id]; 6820 } 6821 6822 rc = event_id; 6823 out: 6824 pthread_mutex_unlock(&bdev->internal.mutex); 6825 return rc; 6826 } 6827 6828 void 6829 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 6830 { 6831 struct spdk_bdev_desc *desc; 6832 6833 pthread_mutex_lock(&bdev->internal.mutex); 6834 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 6835 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 6836 desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, 6837 desc->callback.ctx); 6838 } 6839 } 6840 pthread_mutex_unlock(&bdev->internal.mutex); 6841 } 6842 6843 struct locked_lba_range_ctx { 6844 struct lba_range range; 6845 struct spdk_bdev *bdev; 6846 struct lba_range *current_range; 6847 struct lba_range *owner_range; 6848 struct spdk_poller *poller; 6849 lock_range_cb cb_fn; 6850 void *cb_arg; 6851 }; 6852 6853 static void 6854 bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status) 6855 { 6856 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6857 6858 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 6859 free(ctx); 6860 } 6861 6862 static void 6863 
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i); 6864 6865 static void 6866 bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 6867 { 6868 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6869 struct spdk_bdev *bdev = ctx->bdev; 6870 6871 if (status == -ENOMEM) { 6872 /* One of the channels could not allocate a range object. 6873 * So we have to go back and clean up any ranges that were 6874 * allocated successfully before we return error status to 6875 * the caller. We can reuse the unlock function to do that 6876 * clean up. 6877 */ 6878 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6879 bdev_unlock_lba_range_get_channel, ctx, 6880 bdev_lock_error_cleanup_cb); 6881 return; 6882 } 6883 6884 /* All channels have locked this range and no I/O overlapping the range 6885 * are outstanding! Set the owner_ch for the range object for the 6886 * locking channel, so that this channel will know that it is allowed 6887 * to write to this range. 6888 */ 6889 ctx->owner_range->owner_ch = ctx->range.owner_ch; 6890 ctx->cb_fn(ctx->cb_arg, status); 6891 6892 /* Don't free the ctx here. Its range is in the bdev's global list of 6893 * locked ranges still, and will be removed and freed when this range 6894 * is later unlocked. 6895 */ 6896 } 6897 6898 static int 6899 bdev_lock_lba_range_check_io(void *_i) 6900 { 6901 struct spdk_io_channel_iter *i = _i; 6902 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6903 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6904 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6905 struct lba_range *range = ctx->current_range; 6906 struct spdk_bdev_io *bdev_io; 6907 6908 spdk_poller_unregister(&ctx->poller); 6909 6910 /* The range is now in the locked_ranges, so no new IO can be submitted to this 6911 * range. But we need to wait until any outstanding IO overlapping with this range 6912 * are completed. 6913 */ 6914 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 6915 if (bdev_io_range_is_locked(bdev_io, range)) { 6916 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 6917 return SPDK_POLLER_BUSY; 6918 } 6919 } 6920 6921 spdk_for_each_channel_continue(i, 0); 6922 return SPDK_POLLER_BUSY; 6923 } 6924 6925 static void 6926 bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i) 6927 { 6928 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6929 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6930 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6931 struct lba_range *range; 6932 6933 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 6934 if (range->length == ctx->range.length && 6935 range->offset == ctx->range.offset && 6936 range->locked_ctx == ctx->range.locked_ctx) { 6937 /* This range already exists on this channel, so don't add 6938 * it again. This can happen when a new channel is created 6939 * while the for_each_channel operation is in progress. 6940 * Do not check for outstanding I/O in that case, since the 6941 * range was locked before any I/O could be submitted to the 6942 * new channel. 
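 * (New channels are pre-populated with copies of bdev->internal.locked_ranges
 * when they are created, which is how the duplicate can already be present
 * here.)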
6943 */ 6944 spdk_for_each_channel_continue(i, 0); 6945 return; 6946 } 6947 } 6948 6949 range = calloc(1, sizeof(*range)); 6950 if (range == NULL) { 6951 spdk_for_each_channel_continue(i, -ENOMEM); 6952 return; 6953 } 6954 6955 range->length = ctx->range.length; 6956 range->offset = ctx->range.offset; 6957 range->locked_ctx = ctx->range.locked_ctx; 6958 ctx->current_range = range; 6959 if (ctx->range.owner_ch == ch) { 6960 /* This is the range object for the channel that will hold 6961 * the lock. Store it in the ctx object so that we can easily 6962 * set its owner_ch after the lock is finally acquired. 6963 */ 6964 ctx->owner_range = range; 6965 } 6966 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 6967 bdev_lock_lba_range_check_io(i); 6968 } 6969 6970 static void 6971 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 6972 { 6973 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 6974 6975 /* We will add a copy of this range to each channel now. */ 6976 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx, 6977 bdev_lock_lba_range_cb); 6978 } 6979 6980 static bool 6981 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 6982 { 6983 struct lba_range *r; 6984 6985 TAILQ_FOREACH(r, tailq, tailq) { 6986 if (bdev_lba_range_overlapped(range, r)) { 6987 return true; 6988 } 6989 } 6990 return false; 6991 } 6992 6993 static int 6994 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 6995 uint64_t offset, uint64_t length, 6996 lock_range_cb cb_fn, void *cb_arg) 6997 { 6998 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6999 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7000 struct locked_lba_range_ctx *ctx; 7001 7002 if (cb_arg == NULL) { 7003 SPDK_ERRLOG("cb_arg must not be NULL\n"); 7004 return -EINVAL; 7005 } 7006 7007 ctx = calloc(1, sizeof(*ctx)); 7008 if (ctx == NULL) { 7009 return -ENOMEM; 7010 } 7011 7012 ctx->range.offset = offset; 7013 ctx->range.length = length; 7014 ctx->range.owner_ch = ch; 7015 ctx->range.locked_ctx = cb_arg; 7016 ctx->bdev = bdev; 7017 ctx->cb_fn = cb_fn; 7018 ctx->cb_arg = cb_arg; 7019 7020 pthread_mutex_lock(&bdev->internal.mutex); 7021 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 7022 /* There is an active lock overlapping with this range. 7023 * Put it on the pending list until this range no 7024 * longer overlaps with another. 7025 */ 7026 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 7027 } else { 7028 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 7029 bdev_lock_lba_range_ctx(bdev, ctx); 7030 } 7031 pthread_mutex_unlock(&bdev->internal.mutex); 7032 return 0; 7033 } 7034 7035 static void 7036 bdev_lock_lba_range_ctx_msg(void *_ctx) 7037 { 7038 struct locked_lba_range_ctx *ctx = _ctx; 7039 7040 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 7041 } 7042 7043 static void 7044 bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 7045 { 7046 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7047 struct locked_lba_range_ctx *pending_ctx; 7048 struct spdk_bdev_channel *ch = ctx->range.owner_ch; 7049 struct spdk_bdev *bdev = ch->bdev; 7050 struct lba_range *range, *tmp; 7051 7052 pthread_mutex_lock(&bdev->internal.mutex); 7053 /* Check if there are any pending locked ranges that overlap with this range 7054 * that was just unlocked. 
If there are, check that it doesn't overlap with any 7055 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 7056 * the lock process. 7057 */ 7058 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 7059 if (bdev_lba_range_overlapped(range, &ctx->range) && 7060 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 7061 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 7062 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 7063 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 7064 spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), 7065 bdev_lock_lba_range_ctx_msg, pending_ctx); 7066 } 7067 } 7068 pthread_mutex_unlock(&bdev->internal.mutex); 7069 7070 ctx->cb_fn(ctx->cb_arg, status); 7071 free(ctx); 7072 } 7073 7074 static void 7075 bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i) 7076 { 7077 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7078 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7079 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7080 TAILQ_HEAD(, spdk_bdev_io) io_locked; 7081 struct spdk_bdev_io *bdev_io; 7082 struct lba_range *range; 7083 7084 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 7085 if (ctx->range.offset == range->offset && 7086 ctx->range.length == range->length && 7087 ctx->range.locked_ctx == range->locked_ctx) { 7088 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 7089 free(range); 7090 break; 7091 } 7092 } 7093 7094 /* Note: we should almost always be able to assert that the range specified 7095 * was found. But there are some very rare corner cases where a new channel 7096 * gets created simultaneously with a range unlock, where this function 7097 * would execute on that new channel and wouldn't have the range. 7098 * We also use this to clean up range allocations when a later allocation 7099 * fails in the locking path. 7100 * So we can't actually assert() here. 7101 */ 7102 7103 /* Swap the locked IO into a temporary list, and then try to submit them again. 7104 * We could hyper-optimize this to only resubmit locked I/O that overlap 7105 * with the range that was just unlocked, but this isn't a performance path so 7106 * we go for simplicity here. 7107 */ 7108 TAILQ_INIT(&io_locked); 7109 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 7110 while (!TAILQ_EMPTY(&io_locked)) { 7111 bdev_io = TAILQ_FIRST(&io_locked); 7112 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 7113 bdev_io_submit(bdev_io); 7114 } 7115 7116 spdk_for_each_channel_continue(i, 0); 7117 } 7118 7119 static int 7120 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 7121 uint64_t offset, uint64_t length, 7122 lock_range_cb cb_fn, void *cb_arg) 7123 { 7124 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7125 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7126 struct locked_lba_range_ctx *ctx; 7127 struct lba_range *range; 7128 bool range_found = false; 7129 7130 /* Let's make sure the specified channel actually has a lock on 7131 * the specified range. Note that the range must match exactly. 
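 * A minimal caller sketch with hypothetical names (internally these helpers
 * are used, for example, by the emulated compare-and-write path):
 *
 *     rc = bdev_lock_lba_range(desc, ch, off, len, lock_done_cb, my_ctx);
 *     ... perform the protected accesses once lock_done_cb fires ...
 *     rc = bdev_unlock_lba_range(desc, ch, off, len, unlock_done_cb, my_ctx);
 *
 * The unlock must pass the same offset, length and cb_arg that were used to
 * take the lock, and must be issued on the same channel.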
7132 */ 7133 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 7134 if (range->offset == offset && range->length == length && 7135 range->owner_ch == ch && range->locked_ctx == cb_arg) { 7136 range_found = true; 7137 break; 7138 } 7139 } 7140 7141 if (!range_found) { 7142 return -EINVAL; 7143 } 7144 7145 pthread_mutex_lock(&bdev->internal.mutex); 7146 /* We confirmed that this channel has locked the specified range. To 7147 * start the unlock the process, we find the range in the bdev's locked_ranges 7148 * and remove it. This ensures new channels don't inherit the locked range. 7149 * Then we will send a message to each channel (including the one specified 7150 * here) to remove the range from its per-channel list. 7151 */ 7152 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 7153 if (range->offset == offset && range->length == length && 7154 range->locked_ctx == cb_arg) { 7155 break; 7156 } 7157 } 7158 if (range == NULL) { 7159 assert(false); 7160 pthread_mutex_unlock(&bdev->internal.mutex); 7161 return -EINVAL; 7162 } 7163 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 7164 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 7165 pthread_mutex_unlock(&bdev->internal.mutex); 7166 7167 ctx->cb_fn = cb_fn; 7168 ctx->cb_arg = cb_arg; 7169 7170 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx, 7171 bdev_unlock_lba_range_cb); 7172 return 0; 7173 } 7174 7175 int 7176 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 7177 int array_size) 7178 { 7179 if (!bdev) { 7180 return -EINVAL; 7181 } 7182 7183 if (bdev->fn_table->get_memory_domains) { 7184 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 7185 } 7186 7187 return 0; 7188 } 7189 7190 SPDK_LOG_REGISTER_COMPONENT(bdev) 7191 7192 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 7193 { 7194 struct spdk_trace_tpoint_opts opts[] = { 7195 { 7196 "BDEV_IO_START", TRACE_BDEV_IO_START, 7197 OWNER_BDEV, OBJECT_BDEV_IO, 1, 7198 { 7199 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 7200 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 } 7201 } 7202 }, 7203 { 7204 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 7205 OWNER_BDEV, OBJECT_BDEV_IO, 0, 7206 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 7207 }, 7208 { 7209 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 7210 OWNER_BDEV, OBJECT_NONE, 1, 7211 { 7212 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 7213 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 7214 } 7215 }, 7216 { 7217 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 7218 OWNER_BDEV, OBJECT_NONE, 0, 7219 { 7220 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 7221 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 7222 } 7223 }, 7224 }; 7225 7226 7227 spdk_trace_register_owner(OWNER_BDEV, 'b'); 7228 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 7229 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 7230 } 7231
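/*
 * Usage sketch for the descriptor and QoS APIs implemented above (hypothetical
 * application code, not part of this file; error handling abbreviated):
 *
 *     static void
 *     example_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
 *                      void *event_ctx)
 *     {
 *             if (type == SPDK_BDEV_EVENT_REMOVE) {
 *                     // close the descriptor stored in event_ctx on the
 *                     // thread that opened it
 *             }
 *     }
 *
 *     static void
 *     example_open_and_limit(void)
 *     {
 *             struct spdk_bdev_desc *desc;
 *             uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {
 *                     1000,                             // rw_ios_per_sec
 *                     SPDK_BDEV_QOS_LIMIT_NOT_DEFINED,  // leave byte limits unchanged
 *                     SPDK_BDEV_QOS_LIMIT_NOT_DEFINED,
 *                     SPDK_BDEV_QOS_LIMIT_NOT_DEFINED,
 *             };
 *
 *             if (spdk_bdev_open_ext("Malloc0", true, example_event_cb, NULL, &desc) != 0) {
 *                     return;
 *             }
 *             spdk_bdev_set_qos_rate_limits(spdk_bdev_desc_get_bdev(desc), limits,
 *                                           example_qos_done_cb, NULL);  // hypothetical callback
 *             ... submit I/O through an spdk_io_channel obtained for desc ...
 *             spdk_bdev_close(desc);
 *     }
 */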