/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *   Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define NOMEM_THRESHOLD_COUNT			8
#define ZERO_BUFFER_SIZE			0x100000

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

#define SPDK_BDEV_POOL_ALIGNMENT 512

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	pthread_mutex_t mutex;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain	*domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.mutex = PTHREAD_MUTEX_INITIALIZER,
};

typedef void (*lock_range_cb)(void *ctx, int status);

struct lba_range {
	uint64_t			offset;
	uint64_t			length;
	void				*locked_ctx;
	struct spdk_bdev_channel	*owner_ch;
	TAILQ_ENTRY(lba_range)		tailq;
};

static struct spdk_bdev_opts	g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.small_buf_pool_size = BUF_SMALL_POOL_SIZE,
	.large_buf_pool_size = BUF_LARGE_POOL_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 *  For remaining bytes, allowed to run negative if an I/O is submitted when
	 *  some bytes are remaining, but the I/O is bigger than that amount. The
	 *  excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Types of structure of rate limits. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;
	uint32_t	bdev_io_cache_size;

	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * will queue here their IO that awaits retry. It makes it possible to retry sending
 * IO to one bdev after IO from other bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t		nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t		nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel	*shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t		ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t		io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t		io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t		io_locked;

	uint32_t		flags;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
	struct spdk_bdev_io_stat prev_stat;
#endif

	bdev_io_tailq_t		queued_resets;

	lba_range_tailq_t	locked_ranges;
};

struct media_event_entry {
	struct spdk_bdev_media_event	event;
	TAILQ_ENTRY(media_event_entry)	tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev		*bdev;
	struct spdk_thread		*thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	}				callback;
	bool				closed;
	bool				write;
	pthread_mutex_t			mutex;
	uint32_t			refs;
	TAILQ_HEAD(, media_event_entry)	pending_media_events;
	TAILQ_HEAD(, media_event_entry)	free_media_events;
	struct media_event_entry	*media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc)	link;

	uint64_t		timeout_in_sec;
	spdk_bdev_io_timeout_cb	cb_fn;
	void			*cb_arg;
	struct spdk_poller	*io_timeout_poller;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static void bdev_write_zero_buffer_next(void *_bdev_io);

static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i);
static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status);

static int
bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			  struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
			  uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg,
			  struct spdk_bdev_ext_io_opts *opts);
static int
bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			   struct iovec *iov, int iovcnt, void *md_buf,
			   uint64_t offset_blocks, uint64_t num_blocks,
			   spdk_bdev_io_completion_cb cb, void *cb_arg,
			   struct spdk_bdev_ext_io_opts *opts);

static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		    uint64_t offset, uint64_t length,
		    lock_range_cb cb_fn, void *cb_arg);

static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg);

static inline void bdev_io_complete(void *ctx);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort);

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	/* Do not remove this statement. You should always update it when you add a new field,
	 * and do not forget to add the SET_FIELD statement for your added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}
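/*
 * Illustrative usage sketch (an assumption about typical call order, not code used by
 * this file): an application overrides a default before the bdev layer is initialized
 * by reading the current options, modifying a field, and writing them back with
 * spdk_bdev_set_opts() below.
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_cache_size = 512;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		// handle invalid combination, e.g. pool size too small for the cache size
 *	}
 */
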
int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

	if (opts->small_buf_pool_size < BUF_SMALL_POOL_SIZE) {
		SPDK_ERRLOG("small_buf_pool_size must be at least %" PRIu32 "\n", BUF_SMALL_POOL_SIZE);
		return -1;
	}

	if (opts->large_buf_pool_size < BUF_LARGE_POOL_SIZE) {
		SPDK_ERRLOG("large_buf_pool_size must be at least %" PRIu32 "\n", BUF_LARGE_POOL_SIZE);
		return -1;
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}

static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	pthread_mutex_lock(&g_bdev_mgr.mutex);
	bdev = bdev_get_by_name(bdev_name);
	pthread_mutex_unlock(&g_bdev_mgr.mutex);

	return bdev;
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller		*poller;
	spdk_bdev_wait_for_examine_cb	cb_fn;
	void				*cb_arg;
};

static bool
bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}
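/*
 * Illustrative usage sketch (callback name is hypothetical): callers that need all
 * pending examine operations to settle, e.g. before saving configuration, register a
 * callback with spdk_bdev_wait_for_examine() below; it fires once no module has an
 * action in progress.
 *
 *	static void
 *	examine_settled(void *cb_arg)
 *	{
 *		// safe to assume no examine_config()/examine_disk() is outstanding here
 *	}
 *
 *	spdk_bdev_wait_for_examine(examine_settled, NULL);
 */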
int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	uint32_t action;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config && bdev_ok_to_examine(bdev)) {
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n",
					    module->name);
			}
		}
	}

	if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) {
		if (bdev->internal.claim_module->examine_disk) {
			bdev->internal.claim_module->internal.action_in_progress++;
			bdev->internal.claim_module->examine_disk(bdev);
		}
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_disk && bdev_ok_to_examine(bdev)) {
			module->internal.action_in_progress++;
			module->examine_disk(bdev);
		}
	}
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}
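/*
 * Illustrative sketch (the bdev name is a made-up example): manual examine is only
 * meaningful when automatic examine has been turned off via spdk_bdev_set_opts(),
 * after which individual bdevs can be added to the allowlist by name or alias.
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_auto_examine = false;
 *	spdk_bdev_set_opts(&opts);
 *	...
 *	spdk_bdev_examine("Malloc0");	// returns -EINVAL if auto examine is still on
 */
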
static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_module == NULL) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static void
_copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt)
{
	int i;
	size_t len;

	for (i = 0; i < iovcnt; i++) {
		len = spdk_min(iovs[i].iov_len, buf_len);
		memcpy(buf, iovs[i].iov_base, len);
		buf += len;
		buf_len -= len;
	}
}

static void
_copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len)
{
	int i;
	size_t len;

	for (i = 0; i < iovcnt; i++) {
		len = spdk_min(iovs[i].iov_len, buf_len);
		memcpy(iovs[i].iov_base, buf, len);
		buf += len;
		buf_len -= len;
	}
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf;
		bdev_io->internal.buf = NULL;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_set_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	/* save original md_buf */
	bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.orig_md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len);
	}
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_set_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static void
_bdev_io_set_bounce_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	/* save original iovec */
	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;
	/* if this is write path, copy data from original buffer to bounce buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt);
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf = buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_set_bounce_buf(bdev_io, aligned_buf, len);
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}
static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_mempool *pool;
	struct spdk_bdev_io *tmp;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *ch;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
	alignment = spdk_bdev_get_buf_align(bdev);
	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
	    SPDK_BDEV_POOL_ALIGNMENT) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &ch->need_buf_large;
	}

	if (STAILQ_EMPTY(stailq)) {
		spdk_mempool_put(pool, buf);
	} else {
		tmp = STAILQ_FIRST(stailq);
		STAILQ_REMOVE_HEAD(stailq, internal.buf_link);
		_bdev_io_set_buf(tmp, buf, tmp->internal.buf_len);
	}
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.buf != NULL);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
	bdev_io->internal.buf = NULL;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 *  Some drivers (such as nvme) cannot immediately take a new I/O in
		 *  the context of a completion, because the resources for the I/O are
		 *  not released until control returns to the bdev poller.  Also, we
		 *  may require several small I/O to complete before a larger I/O
		 *  (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
		bdev_io->internal.ch->io_outstanding++;
		shared_resource->io_outstanding++;
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_io->internal.error.nvme.cdw0 = 0;
		bdev_io->num_retries++;
		bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			break;
		}
	}
}

static inline void
_bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			       struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
		/*
		 * Wait for some of the outstanding I/O to complete before we
		 *  retry any of the nomem_io.  Normally we will wait for
		 *  NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
		 *  depth channels we will instead wait for half to complete.
		 */
		shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
						   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static inline void
_bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io)
{
	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
		    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
			memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
			       bdev_io->internal.orig_md_iov.iov_len);
		}

		bdev_io->u.bdev.md_buf = bdev_io->internal.orig_md_iov.iov_base;
		bdev_io->internal.orig_md_iov.iov_base = NULL;
	}

	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);
}

static void
_bdev_io_push_bounce_data_buffer_done(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;

	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
	/* disable bouncing buffer for this io */
	bdev_io->internal.orig_iovcnt = 0;
	bdev_io->internal.orig_iovs = NULL;

	_bdev_io_push_bounce_md_buffer(bdev_io);
}

static void
_bdev_io_unset_bounce_buf(struct spdk_bdev_io *bdev_io)
{
	if (spdk_likely(bdev_io->internal.orig_iovcnt == 0)) {
		assert(bdev_io->internal.orig_md_iov.iov_base == NULL);
		return;
	}

	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
	    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
				  bdev_io->internal.orig_iovcnt,
				  bdev_io->internal.bounce_iov.iov_base,
				  bdev_io->internal.bounce_iov.iov_len);
	}

	_bdev_io_push_bounce_data_buffer_done(bdev_io);
}

static void
bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_mempool *pool;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	uint64_t alignment, md_len;
	void *buf;

	alignment = spdk_bdev_get_buf_align(bdev);
	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) +
	    SPDK_BDEV_POOL_ALIGNMENT) {
		SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n",
			    len + alignment);
		bdev_io_get_buf_complete(bdev_io, false);
		return;
	}

	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	bdev_io->internal.buf_len = len;

	if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
	    SPDK_BDEV_POOL_ALIGNMENT) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &mgmt_ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &mgmt_ch->need_buf_large;
	}

	buf = spdk_mempool_get(pool);
	if (!buf) {
		STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link);
	} else {
		_bdev_io_set_buf(bdev_io, buf, len);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t alignment;

	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	alignment = spdk_bdev_get_buf_align(bdev);

	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
		/* Buffer already present and aligned */
		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
		return;
	}

	bdev_io_get_buf(bdev_io, len);
}

void
spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(cb != NULL);
	assert(bdev_io->internal.get_aux_buf_cb == NULL);
	bdev_io->internal.get_aux_buf_cb = cb;
	bdev_io_get_buf(bdev_io, len);
}
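/*
 * Illustrative sketch of how a bdev module typically uses spdk_bdev_io_get_buf() in
 * its read path (function names here are hypothetical). The callback only runs once a
 * properly aligned data buffer is attached to the I/O, which may be deferred until a
 * buffer is returned to the pool.
 *
 *	static void
 *	example_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
 *			   bool success)
 *	{
 *		if (!success) {
 *			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 *			return;
 *		}
 *		example_submit_read(ch, bdev_io);	// iovs now point at a valid buffer
 *	}
 *
 *	// in the module's submit_request() for SPDK_BDEV_IO_TYPE_READ:
 *	spdk_bdev_io_get_buf(bdev_io, example_get_buf_cb,
 *			     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 */
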
static int
bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

static void
bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	int i;
	struct spdk_bdev_qos *qos = bdev->internal.qos;
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	if (!qos) {
		return;
	}

	spdk_bdev_get_qos_rate_limits(bdev, limits);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] > 0) {
			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
		}
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

void
spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_module *bdev_module;
	struct spdk_bdev *bdev;

	assert(w != NULL);

	spdk_json_write_array_begin(w);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_options");
	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
	spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
	spdk_json_write_object_end(w);
	spdk_json_write_object_end(w);

	bdev_examine_allowlist_config_json(w);

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->config_json) {
			bdev_module->config_json(w);
		}
	}

	pthread_mutex_lock(&g_bdev_mgr.mutex);

	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
		if (bdev->fn_table->write_config_json) {
			bdev->fn_table->write_config_json(bdev, w);
		}

		bdev_qos_config_json(bdev, w);
	}

	pthread_mutex_unlock(&g_bdev_mgr.mutex);

	/* This has to be last RPC in array to make sure all bdevs finished examine */
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_wait_for_examine");
	spdk_json_write_object_end(w);

	spdk_json_write_array_end(w);
}

static int
bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;
	uint32_t i;

	STAILQ_INIT(&ch->need_buf_small);
	STAILQ_INIT(&ch->need_buf_large);

	STAILQ_INIT(&ch->per_thread_cache);
	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;

	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		assert(bdev_io != NULL);
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;

	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
	}

	if (!TAILQ_EMPTY(&ch->shared_resources)) {
		SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
	}

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static void
bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static bool
bdev_module_all_actions_completed(void)
{
	struct spdk_bdev_module *m;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return false;
		}
	}
	return true;
}

static void
bdev_module_action_complete(void)
{
	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	if (!bdev_module_all_actions_completed()) {
		return;
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
	 */
	bdev_init_complete(0);
}

static void
bdev_module_action_done(struct spdk_bdev_module *module)
{
	assert(module->internal.action_in_progress > 0);
	module->internal.action_in_progress--;
	bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
	bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	bdev_module_action_done(module);
}

/** The last initialized bdev module */
static struct spdk_bdev_module *g_resume_bdev_module = NULL;

static void
bdev_init_failed(void *cb_arg)
{
	struct spdk_bdev_module *module = cb_arg;

	module->internal.action_in_progress--;
	bdev_init_complete(-1);
}

static int
bdev_modules_init(void)
{
	struct spdk_bdev_module *module;
	int rc = 0;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		g_resume_bdev_module = module;
		if (module->async_init) {
			module->internal.action_in_progress = 1;
		}
		rc = module->module_init();
		if (rc != 0) {
			/* Bump action_in_progress to prevent other modules from completing modules_init.
			 * Send a message to defer application shutdown until resources are cleaned up. */
			module->internal.action_in_progress = 1;
			spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
			return rc;
		}
	}

	g_resume_bdev_module = NULL;
	return 0;
}

void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
{
	int cache_size;
	int rc = 0;
	char mempool_name[32];

	assert(cb_fn != NULL);

	g_init_cb_fn = cb_fn;
	g_init_cb_arg = cb_arg;

	spdk_notify_type_register("bdev_register");
	spdk_notify_type_register("bdev_unregister");

	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());

	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
				  g_bdev_opts.bdev_io_pool_size,
				  sizeof(struct spdk_bdev_io) +
				  bdev_module_get_max_ctx_size(),
				  0,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		bdev_init_complete(-1);
		return;
	}

	/**
	 * Ensure no more than half of the total buffers end up in local caches, by
	 * using spdk_env_get_core_count() to determine how many local caches we need
	 * to account for.
	 */
	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());

	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
				    g_bdev_opts.small_buf_pool_size,
				    SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
				    SPDK_BDEV_POOL_ALIGNMENT,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_small_pool) {
		SPDK_ERRLOG("create rbuf small pool failed\n");
		bdev_init_complete(-1);
		return;
	}

	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());

	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
				    g_bdev_opts.large_buf_pool_size,
				    SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) +
				    SPDK_BDEV_POOL_ALIGNMENT,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_large_pool) {
		SPDK_ERRLOG("create rbuf large pool failed\n");
		bdev_init_complete(-1);
		return;
	}

	g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
					      NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
	if (!g_bdev_mgr.zero_buffer) {
		SPDK_ERRLOG("create bdev zero buffer failed\n");
		bdev_init_complete(-1);
		return;
	}

#ifdef SPDK_CONFIG_VTUNE
	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif

	spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
				bdev_mgmt_channel_destroy,
				sizeof(struct spdk_bdev_mgmt_channel),
				"bdev_mgr");

	rc = bdev_modules_init();
	g_bdev_mgr.module_init_complete = true;
	if (rc != 0) {
		SPDK_ERRLOG("bdev modules init failed\n");
		return;
	}

	bdev_module_action_complete();
}

static void
bdev_mgr_unregister_cb(void *io_device)
{
	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;

	if (g_bdev_mgr.bdev_io_pool) {
		if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
			SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
				    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
				    g_bdev_opts.bdev_io_pool_size);
		}

		spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
	}

	if (g_bdev_mgr.buf_small_pool) {
		if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != g_bdev_opts.small_buf_pool_size) {
			SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
				    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
				    g_bdev_opts.small_buf_pool_size);
			assert(false);
		}

		spdk_mempool_free(g_bdev_mgr.buf_small_pool);
	}

	if (g_bdev_mgr.buf_large_pool) {
		if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != g_bdev_opts.large_buf_pool_size) {
			SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
				    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
				    g_bdev_opts.large_buf_pool_size);
			assert(false);
		}

		spdk_mempool_free(g_bdev_mgr.buf_large_pool);
	}

	spdk_free(g_bdev_mgr.zero_buffer);

	bdev_examine_allowlist_free();

	cb_fn(g_fini_cb_arg);
	g_fini_cb_fn = NULL;
	g_fini_cb_arg = NULL;
	g_bdev_mgr.init_complete = false;
	g_bdev_mgr.module_init_complete = false;
}
static void
bdev_module_fini_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	/* FIXME: Handling initialization failures is broken now,
	 * so we won't even try cleaning up after successfully
	 * initialized modules. If module_init_complete is false,
	 * just call bdev_mgr_unregister_cb directly.
	 */
	if (!g_bdev_mgr.module_init_complete) {
		bdev_mgr_unregister_cb(NULL);
		return;
	}

	/* Start iterating from the last touched module */
	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
	} else {
		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
					 internal.tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling module_fini()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_fini_done() and re-enter
			 * this function to continue iterating. */
			g_resume_bdev_module = bdev_module;
		}

		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}

		if (bdev_module->async_fini) {
			return;
		}

		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
					 internal.tailq);
	}

	g_resume_bdev_module = NULL;
	spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
}

void
spdk_bdev_module_fini_done(void)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL);
	} else {
		bdev_module_fini_iter(NULL);
	}
}
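/*
 * Illustrative sketch of the async_fini contract honored by bdev_module_fini_iter()
 * above (module name and helpers are hypothetical): a module that sets .async_fini
 * returns from module_fini() with teardown still pending and later calls
 * spdk_bdev_module_fini_done() so that iteration can continue.
 *
 *	static void example_fini(void);
 *
 *	static struct spdk_bdev_module example_if = {
 *		.name = "example",
 *		.module_fini = example_fini,
 *		.async_fini = true,
 *	};
 *	SPDK_BDEV_MODULE_REGISTER(example, &example_if)
 *
 *	static void
 *	example_cleanup_done(void *ctx)
 *	{
 *		spdk_bdev_module_fini_done();
 *	}
 *
 *	static void
 *	example_fini(void)
 *	{
 *		// start asynchronous teardown that later invokes example_cleanup_done()
 *	}
 */
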
static void
bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
{
	struct spdk_bdev *bdev = cb_arg;

	if (bdeverrno && bdev) {
		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
			     bdev->name);

		/*
		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
		 * bdev; try to continue by manually removing this bdev from the list and continue
		 * with the next bdev in the list.
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
		/*
		 * Bdev module finish needs to be deferred as we might be in the middle of some context
		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
		 * after returning.
		 */
		spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
		return;
	}

	/*
	 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
	 * base bdevs.
	 *
	 * Also, walk the list in the reverse order.
	 */
	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
		if (bdev->internal.claim_module != NULL) {
			SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n",
				      bdev->name, bdev->internal.claim_module->name);
			continue;
		}

		SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
		return;
	}

	/*
	 * If any bdev fails to unclaim underlying bdev properly, we may face the
	 * case of bdev list consisting of claimed bdevs only (if claims are managed
	 * correctly, this would mean there's a loop in the claims graph which is
	 * clearly impossible). Warn and unregister last bdev on the list then.
	 */
	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
		SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
		return;
	}
}

static void
bdev_module_fini_start_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
	} else {
		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini_start) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling fini_start()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_fini_start_done() and re-enter
			 * this function to continue iterating. */
			g_resume_bdev_module = bdev_module;
		}

		if (bdev_module->fini_start) {
			bdev_module->fini_start();
		}

		if (bdev_module->async_fini_start) {
			return;
		}

		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq);
	}

	g_resume_bdev_module = NULL;

	bdev_finish_unregister_bdevs_iter(NULL, 0);
}

void
spdk_bdev_module_fini_start_done(void)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL);
	} else {
		bdev_module_fini_start_iter(NULL);
	}
}

void
spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
{
	assert(cb_fn != NULL);

	g_fini_thread = spdk_get_thread();

	g_fini_cb_fn = cb_fn;
	g_fini_cb_arg = cb_arg;

	bdev_module_fini_start_iter(NULL);
}
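/*
 * Illustrative sketch of the io_wait mechanism that bdev_channel_get_io() and
 * spdk_bdev_free_io() below cooperate on (context structure and helpers are
 * hypothetical). When a submission API returns -ENOMEM because no spdk_bdev_io is
 * available, the caller queues a wait entry and retries from its callback.
 *
 *	static void
 *	example_retry(void *arg)
 *	{
 *		struct example_ctx *ctx = arg;
 *
 *		example_submit_io(ctx);		// retry the original submission
 *	}
 *
 *	// on -ENOMEM from e.g. spdk_bdev_read_blocks():
 *	ctx->wait_entry.bdev = bdev;
 *	ctx->wait_entry.cb_fn = example_retry;
 *	ctx->wait_entry.cb_arg = ctx;
 *	spdk_bdev_queue_io_wait(bdev, io_ch, &ctx->wait_entry);
 */
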
struct spdk_bdev_io *
bdev_channel_get_io(struct spdk_bdev_channel *channel)
{
	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
	struct spdk_bdev_io *bdev_io;

	if (ch->per_thread_cache_count > 0) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
	} else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
		/*
		 * Don't try to look for bdev_ios in the global pool if there are
		 * waiters on bdev_ios - we don't want this caller to jump the line.
		 */
		bdev_io = NULL;
	} else {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
	}

	return bdev_io;
}

void
spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_mgmt_channel *ch;

	assert(bdev_io != NULL);
	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	if (bdev_io->internal.buf != NULL) {
		bdev_io_put_buf(bdev_io);
	}

	if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
		while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
			struct spdk_bdev_io_wait_entry *entry;

			entry = TAILQ_FIRST(&ch->io_wait_queue);
			TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
			entry->cb_fn(entry->cb_arg);
		}
	} else {
		/* We should never have a full cache with entries on the io wait queue. */
		assert(TAILQ_EMPTY(&ch->io_wait_queue));
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}
}

static bool
bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
{
	assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);

	switch (limit) {
	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
		return true;
	case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
	case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
	case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
		return false;
	case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
	default:
		return false;
	}
}

static bool
bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return true;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		if (bdev_io->u.bdev.zcopy.start) {
			return true;
		} else {
			return false;
		}
	default:
		return false;
	}
}

static bool
bdev_is_read_io(struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		/* Bit 1 (0x2) set for read operation */
		if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
			return true;
		} else {
			return false;
		}
	case SPDK_BDEV_IO_TYPE_READ:
		return true;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Populate to read from disk */
		if (bdev_io->u.bdev.zcopy.populate) {
			return true;
		} else {
			return false;
		}
	default:
		return false;
	}
}

static uint64_t
bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return bdev_io->u.nvme_passthru.nbytes;
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (bdev_io->u.bdev.zcopy.start) {
			return bdev_io->u.bdev.num_blocks * bdev->blocklen;
		} else {
			return 0;
		}
	default:
		return 0;
	}
}
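/*
 * Worked example of the timeslice accounting used by the helpers below (numbers are
 * only illustrative): with rw_ios_per_sec = 10000 and a timeslice of
 * SPDK_BDEV_QOS_TIMESLICE_IN_USEC = 1000 usec there are 1000 timeslices per second,
 * so each slice is refilled with roughly 10000 / 1000 = 10 I/Os. Byte limits work the
 * same way, except that remaining_this_timeslice may go negative when a large I/O is
 * admitted near the end of a slice; the overdraft is then deducted from the next
 * refill, as described in struct spdk_bdev_qos_limit.
 */
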
1921 { 1922 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 1923 return true; 1924 } else { 1925 return false; 1926 } 1927 } 1928 1929 static bool 1930 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1931 { 1932 if (bdev_is_read_io(io) == false) { 1933 return false; 1934 } 1935 1936 return bdev_qos_rw_queue_io(limit, io); 1937 } 1938 1939 static bool 1940 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1941 { 1942 if (bdev_is_read_io(io) == true) { 1943 return false; 1944 } 1945 1946 return bdev_qos_rw_queue_io(limit, io); 1947 } 1948 1949 static void 1950 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1951 { 1952 limit->remaining_this_timeslice--; 1953 } 1954 1955 static void 1956 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1957 { 1958 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 1959 } 1960 1961 static void 1962 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1963 { 1964 if (bdev_is_read_io(io) == false) { 1965 return; 1966 } 1967 1968 return bdev_qos_rw_bps_update_quota(limit, io); 1969 } 1970 1971 static void 1972 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1973 { 1974 if (bdev_is_read_io(io) == true) { 1975 return; 1976 } 1977 1978 return bdev_qos_rw_bps_update_quota(limit, io); 1979 } 1980 1981 static void 1982 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 1983 { 1984 int i; 1985 1986 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1987 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 1988 qos->rate_limits[i].queue_io = NULL; 1989 qos->rate_limits[i].update_quota = NULL; 1990 continue; 1991 } 1992 1993 switch (i) { 1994 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1995 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 1996 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 1997 break; 1998 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1999 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2000 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2001 break; 2002 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2003 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2004 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2005 break; 2006 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2007 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2008 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2009 break; 2010 default: 2011 break; 2012 } 2013 } 2014 } 2015 2016 static void 2017 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2018 struct spdk_bdev_io *bdev_io, 2019 enum spdk_bdev_io_status status) 2020 { 2021 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2022 2023 bdev_io->internal.in_submit_request = true; 2024 bdev_ch->io_outstanding++; 2025 shared_resource->io_outstanding++; 2026 spdk_bdev_io_complete(bdev_io, status); 2027 bdev_io->internal.in_submit_request = false; 2028 } 2029 2030 static inline void 2031 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2032 { 2033 struct spdk_bdev *bdev = bdev_io->bdev; 2034 struct spdk_io_channel *ch = bdev_ch->channel; 2035 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2036 2037 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2038 struct spdk_bdev_mgmt_channel *mgmt_channel = 
shared_resource->mgmt_ch; 2039 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2040 2041 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2042 bdev_abort_buf_io(&mgmt_channel->need_buf_small, bio_to_abort) || 2043 bdev_abort_buf_io(&mgmt_channel->need_buf_large, bio_to_abort)) { 2044 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2045 SPDK_BDEV_IO_STATUS_SUCCESS); 2046 return; 2047 } 2048 } 2049 2050 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2051 bdev_ch->io_outstanding++; 2052 shared_resource->io_outstanding++; 2053 bdev_io->internal.in_submit_request = true; 2054 bdev->fn_table->submit_request(ch, bdev_io); 2055 bdev_io->internal.in_submit_request = false; 2056 } else { 2057 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2058 } 2059 } 2060 2061 static int 2062 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2063 { 2064 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2065 int i, submitted_ios = 0; 2066 2067 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2068 if (bdev_qos_io_to_limit(bdev_io) == true) { 2069 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2070 if (!qos->rate_limits[i].queue_io) { 2071 continue; 2072 } 2073 2074 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2075 bdev_io) == true) { 2076 return submitted_ios; 2077 } 2078 } 2079 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2080 if (!qos->rate_limits[i].update_quota) { 2081 continue; 2082 } 2083 2084 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2085 } 2086 } 2087 2088 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2089 bdev_io_do_submit(ch, bdev_io); 2090 submitted_ios++; 2091 } 2092 2093 return submitted_ios; 2094 } 2095 2096 static void 2097 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2098 { 2099 int rc; 2100 2101 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2102 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2103 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2104 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2105 &bdev_io->internal.waitq_entry); 2106 if (rc != 0) { 2107 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2108 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2109 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2110 } 2111 } 2112 2113 static bool 2114 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2115 { 2116 uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary; 2117 uint32_t max_size = bdev_io->bdev->max_segment_size; 2118 int max_segs = bdev_io->bdev->max_num_segments; 2119 2120 io_boundary = bdev_io->bdev->split_on_optimal_io_boundary ? io_boundary : 0; 2121 2122 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2123 return false; 2124 } 2125 2126 if (io_boundary) { 2127 uint64_t start_stripe, end_stripe; 2128 2129 start_stripe = bdev_io->u.bdev.offset_blocks; 2130 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2131 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
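* For example, with an optimal_io_boundary of 128 blocks (a power of two), an I/O starting at block 100 and spanning 60 blocks ends at block 159; 100 >> 7 == 0 while 159 >> 7 == 1, so the stripes differ and the I/O must be split.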
*/ 2132 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2133 start_stripe >>= spdk_u32log2(io_boundary); 2134 end_stripe >>= spdk_u32log2(io_boundary); 2135 } else { 2136 start_stripe /= io_boundary; 2137 end_stripe /= io_boundary; 2138 } 2139 2140 if (start_stripe != end_stripe) { 2141 return true; 2142 } 2143 } 2144 2145 if (max_segs) { 2146 if (bdev_io->u.bdev.iovcnt > max_segs) { 2147 return true; 2148 } 2149 } 2150 2151 if (max_size) { 2152 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2153 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2154 return true; 2155 } 2156 } 2157 } 2158 2159 return false; 2160 } 2161 2162 static bool 2163 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2164 { 2165 uint32_t num_unmap_segments; 2166 2167 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2168 return false; 2169 } 2170 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2171 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2172 return true; 2173 } 2174 2175 return false; 2176 } 2177 2178 static bool 2179 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2180 { 2181 if (!bdev_io->bdev->max_write_zeroes) { 2182 return false; 2183 } 2184 2185 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2186 return true; 2187 } 2188 2189 return false; 2190 } 2191 2192 static bool 2193 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2194 { 2195 switch (bdev_io->type) { 2196 case SPDK_BDEV_IO_TYPE_READ: 2197 case SPDK_BDEV_IO_TYPE_WRITE: 2198 return bdev_rw_should_split(bdev_io); 2199 case SPDK_BDEV_IO_TYPE_UNMAP: 2200 return bdev_unmap_should_split(bdev_io); 2201 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2202 return bdev_write_zeroes_should_split(bdev_io); 2203 default: 2204 return false; 2205 } 2206 } 2207 2208 static uint32_t 2209 _to_next_boundary(uint64_t offset, uint32_t boundary) 2210 { 2211 return (boundary - (offset % boundary)); 2212 } 2213 2214 static void 2215 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2216 2217 static void 2218 _bdev_rw_split(void *_bdev_io); 2219 2220 static void 2221 bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2222 2223 static void 2224 _bdev_unmap_split(void *_bdev_io) 2225 { 2226 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2227 } 2228 2229 static void 2230 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2231 2232 static void 2233 _bdev_write_zeroes_split(void *_bdev_io) 2234 { 2235 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2236 } 2237 2238 static int 2239 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2240 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2241 { 2242 int rc; 2243 uint64_t current_offset, current_remaining; 2244 spdk_bdev_io_wait_cb io_wait_fn; 2245 2246 current_offset = *offset; 2247 current_remaining = *remaining; 2248 2249 bdev_io->u.bdev.split_outstanding++; 2250 2251 io_wait_fn = _bdev_rw_split; 2252 switch (bdev_io->type) { 2253 case SPDK_BDEV_IO_TYPE_READ: 2254 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2255 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2256 iov, iovcnt, md_buf, current_offset, 2257 num_blocks, 2258 bdev_io_split_done, bdev_io, 2259 bdev_io->internal.ext_opts); 2260 break; 2261 case SPDK_BDEV_IO_TYPE_WRITE: 2262 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2263 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2264 iov, iovcnt, md_buf, current_offset, 2265 
num_blocks, 2266 bdev_io_split_done, bdev_io, 2267 bdev_io->internal.ext_opts); 2268 break; 2269 case SPDK_BDEV_IO_TYPE_UNMAP: 2270 io_wait_fn = _bdev_unmap_split; 2271 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2272 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2273 current_offset, num_blocks, 2274 bdev_io_split_done, bdev_io); 2275 break; 2276 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2277 io_wait_fn = _bdev_write_zeroes_split; 2278 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2279 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2280 current_offset, num_blocks, 2281 bdev_io_split_done, bdev_io); 2282 break; 2283 default: 2284 assert(false); 2285 rc = -EINVAL; 2286 break; 2287 } 2288 2289 if (rc == 0) { 2290 current_offset += num_blocks; 2291 current_remaining -= num_blocks; 2292 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2293 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2294 *offset = current_offset; 2295 *remaining = current_remaining; 2296 } else { 2297 bdev_io->u.bdev.split_outstanding--; 2298 if (rc == -ENOMEM) { 2299 if (bdev_io->u.bdev.split_outstanding == 0) { 2300 /* No I/O is outstanding. Hence we should wait here. */ 2301 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2302 } 2303 } else { 2304 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2305 if (bdev_io->u.bdev.split_outstanding == 0) { 2306 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2307 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2308 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2309 } 2310 } 2311 } 2312 2313 return rc; 2314 } 2315 2316 static void 2317 _bdev_rw_split(void *_bdev_io) 2318 { 2319 struct iovec *parent_iov, *iov; 2320 struct spdk_bdev_io *bdev_io = _bdev_io; 2321 struct spdk_bdev *bdev = bdev_io->bdev; 2322 uint64_t parent_offset, current_offset, remaining; 2323 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2324 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2325 uint32_t iovcnt, iov_len, child_iovsize; 2326 uint32_t blocklen = bdev->blocklen; 2327 uint32_t io_boundary = bdev->optimal_io_boundary; 2328 uint32_t max_segment_size = bdev->max_segment_size; 2329 uint32_t max_child_iovcnt = bdev->max_num_segments; 2330 void *md_buf = NULL; 2331 int rc; 2332 2333 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2334 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, BDEV_IO_NUM_CHILD_IOV) : 2335 BDEV_IO_NUM_CHILD_IOV; 2336 io_boundary = bdev->split_on_optimal_io_boundary ? 
io_boundary : UINT32_MAX; 2337 2338 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2339 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2340 parent_offset = bdev_io->u.bdev.offset_blocks; 2341 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2342 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2343 2344 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2345 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2346 if (parent_iov_offset < parent_iov->iov_len) { 2347 break; 2348 } 2349 parent_iov_offset -= parent_iov->iov_len; 2350 } 2351 2352 child_iovcnt = 0; 2353 while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 2354 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2355 to_next_boundary = spdk_min(remaining, to_next_boundary); 2356 to_next_boundary_bytes = to_next_boundary * blocklen; 2357 2358 iov = &bdev_io->child_iov[child_iovcnt]; 2359 iovcnt = 0; 2360 2361 if (bdev_io->u.bdev.md_buf) { 2362 md_buf = (char *)bdev_io->u.bdev.md_buf + 2363 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2364 } 2365 2366 child_iovsize = spdk_min(BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2367 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2368 iovcnt < child_iovsize) { 2369 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2370 iov_len = parent_iov->iov_len - parent_iov_offset; 2371 2372 iov_len = spdk_min(iov_len, max_segment_size); 2373 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2374 to_next_boundary_bytes -= iov_len; 2375 2376 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2377 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2378 2379 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2380 parent_iov_offset += iov_len; 2381 } else { 2382 parent_iovpos++; 2383 parent_iov_offset = 0; 2384 } 2385 child_iovcnt++; 2386 iovcnt++; 2387 } 2388 2389 if (to_next_boundary_bytes > 0) { 2390 /* We had to stop this child I/O early because we ran out of 2391 * child_iov space or were limited by max_num_segments. 2392 * Ensure the iovs to be aligned with block size and 2393 * then adjust to_next_boundary before starting the 2394 * child I/O. 2395 */ 2396 assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV || 2397 iovcnt == child_iovsize); 2398 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2399 if (to_last_block_bytes != 0) { 2400 uint32_t child_iovpos = child_iovcnt - 1; 2401 /* don't decrease child_iovcnt when it equals to BDEV_IO_NUM_CHILD_IOV 2402 * so the loop will naturally end 2403 */ 2404 2405 to_last_block_bytes = blocklen - to_last_block_bytes; 2406 to_next_boundary_bytes += to_last_block_bytes; 2407 while (to_last_block_bytes > 0 && iovcnt > 0) { 2408 iov_len = spdk_min(to_last_block_bytes, 2409 bdev_io->child_iov[child_iovpos].iov_len); 2410 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2411 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2412 child_iovpos--; 2413 if (--iovcnt == 0) { 2414 /* If the child IO is less than a block size just return. 2415 * If the first child IO of any split round is less than 2416 * a block size, an error exit. 
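* In other words: if trimming the tail leaves this round with no complete block to submit, the round is abandoned and we return; and if no child I/O of this split is currently outstanding (this would have been the first child), the parent I/O is failed outright, since even the first child could not cover a single block.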
2417 */ 2418 if (bdev_io->u.bdev.split_outstanding == 0) { 2419 SPDK_ERRLOG("The first child io was less than a block size\n"); 2420 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2421 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2422 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2423 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2424 } 2425 2426 return; 2427 } 2428 } 2429 2430 to_last_block_bytes -= iov_len; 2431 2432 if (parent_iov_offset == 0) { 2433 parent_iovpos--; 2434 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2435 } 2436 parent_iov_offset -= iov_len; 2437 } 2438 2439 assert(to_last_block_bytes == 0); 2440 } 2441 to_next_boundary -= to_next_boundary_bytes / blocklen; 2442 } 2443 2444 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 2445 &current_offset, &remaining); 2446 if (spdk_unlikely(rc)) { 2447 return; 2448 } 2449 } 2450 } 2451 2452 static void 2453 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 2454 { 2455 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 2456 uint32_t num_children_reqs = 0; 2457 int rc; 2458 2459 offset = bdev_io->u.bdev.split_current_offset_blocks; 2460 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2461 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 2462 2463 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2464 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 2465 2466 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 2467 &offset, &remaining); 2468 if (spdk_likely(rc == 0)) { 2469 num_children_reqs++; 2470 } else { 2471 return; 2472 } 2473 } 2474 } 2475 2476 static void 2477 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 2478 { 2479 uint64_t offset, write_zeroes_blocks, remaining; 2480 uint32_t num_children_reqs = 0; 2481 int rc; 2482 2483 offset = bdev_io->u.bdev.split_current_offset_blocks; 2484 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2485 2486 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2487 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 2488 2489 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 2490 &offset, &remaining); 2491 if (spdk_likely(rc == 0)) { 2492 num_children_reqs++; 2493 } else { 2494 return; 2495 } 2496 } 2497 } 2498 2499 static void 2500 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2501 { 2502 struct spdk_bdev_io *parent_io = cb_arg; 2503 2504 spdk_bdev_free_io(bdev_io); 2505 2506 if (!success) { 2507 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2508 /* If any child I/O failed, stop further splitting process. */ 2509 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2510 parent_io->u.bdev.split_remaining_num_blocks = 0; 2511 } 2512 parent_io->u.bdev.split_outstanding--; 2513 if (parent_io->u.bdev.split_outstanding != 0) { 2514 return; 2515 } 2516 2517 /* 2518 * Parent I/O finishes when all blocks are consumed.
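* For example, if a parent write was split into four children and one of them failed, split_remaining_num_blocks was forced to zero above, so the parent completes (with FAILED status) as soon as its last outstanding child reports back.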
2519 */ 2520 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2521 assert(parent_io->internal.cb != bdev_io_split_done); 2522 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2523 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2524 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2525 parent_io->internal.caller_ctx); 2526 return; 2527 } 2528 2529 /* 2530 * Continue with the splitting process. This function will complete the parent I/O if the 2531 * splitting is done. 2532 */ 2533 switch (parent_io->type) { 2534 case SPDK_BDEV_IO_TYPE_READ: 2535 case SPDK_BDEV_IO_TYPE_WRITE: 2536 _bdev_rw_split(parent_io); 2537 break; 2538 case SPDK_BDEV_IO_TYPE_UNMAP: 2539 bdev_unmap_split(parent_io); 2540 break; 2541 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2542 bdev_write_zeroes_split(parent_io); 2543 break; 2544 default: 2545 assert(false); 2546 break; 2547 } 2548 } 2549 2550 static void 2551 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success); 2552 2553 static void 2554 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2555 { 2556 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2557 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2558 bdev_io->u.bdev.split_outstanding = 0; 2559 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2560 2561 switch (bdev_io->type) { 2562 case SPDK_BDEV_IO_TYPE_READ: 2563 case SPDK_BDEV_IO_TYPE_WRITE: 2564 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2565 _bdev_rw_split(bdev_io); 2566 } else { 2567 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2568 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2569 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2570 } 2571 break; 2572 case SPDK_BDEV_IO_TYPE_UNMAP: 2573 bdev_unmap_split(bdev_io); 2574 break; 2575 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2576 bdev_write_zeroes_split(bdev_io); 2577 break; 2578 default: 2579 assert(false); 2580 break; 2581 } 2582 } 2583 2584 static void 2585 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2586 { 2587 if (!success) { 2588 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2589 return; 2590 } 2591 2592 _bdev_rw_split(bdev_io); 2593 } 2594 2595 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 2596 * be inlined, at least on some compilers. 
2597 */ 2598 static inline void 2599 _bdev_io_submit(void *ctx) 2600 { 2601 struct spdk_bdev_io *bdev_io = ctx; 2602 struct spdk_bdev *bdev = bdev_io->bdev; 2603 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2604 uint64_t tsc; 2605 2606 tsc = spdk_get_ticks(); 2607 bdev_io->internal.submit_tsc = tsc; 2608 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, bdev_io->type, 2609 bdev_io->internal.caller_ctx, bdev_io->u.bdev.offset_blocks, 2610 bdev_io->u.bdev.num_blocks); 2611 2612 if (spdk_likely(bdev_ch->flags == 0)) { 2613 bdev_io_do_submit(bdev_ch, bdev_io); 2614 return; 2615 } 2616 2617 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2618 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2619 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2620 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2621 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2622 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2623 } else { 2624 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2625 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2626 } 2627 } else { 2628 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2629 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2630 } 2631 } 2632 2633 bool 2634 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2635 2636 bool 2637 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2638 { 2639 if (range1->length == 0 || range2->length == 0) { 2640 return false; 2641 } 2642 2643 if (range1->offset + range1->length <= range2->offset) { 2644 return false; 2645 } 2646 2647 if (range2->offset + range2->length <= range1->offset) { 2648 return false; 2649 } 2650 2651 return true; 2652 } 2653 2654 static bool 2655 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2656 { 2657 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2658 struct lba_range r; 2659 2660 switch (bdev_io->type) { 2661 case SPDK_BDEV_IO_TYPE_NVME_IO: 2662 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2663 /* Don't try to decode the NVMe command - just assume worst-case and that 2664 * it overlaps a locked range. 2665 */ 2666 return true; 2667 case SPDK_BDEV_IO_TYPE_WRITE: 2668 case SPDK_BDEV_IO_TYPE_UNMAP: 2669 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2670 case SPDK_BDEV_IO_TYPE_ZCOPY: 2671 r.offset = bdev_io->u.bdev.offset_blocks; 2672 r.length = bdev_io->u.bdev.num_blocks; 2673 if (!bdev_lba_range_overlapped(range, &r)) { 2674 /* This I/O doesn't overlap the specified LBA range. */ 2675 return false; 2676 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2677 /* This I/O overlaps, but the I/O is on the same channel that locked this 2678 * range, and the caller_ctx is the same as the locked_ctx. This means 2679 * that this I/O is associated with the lock, and is allowed to execute. 
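* Any other overlapping write-type I/O is held back: bdev_io_submit() below places it on ch->io_locked until the range is released.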
2680 */ 2681 return false; 2682 } else { 2683 return true; 2684 } 2685 default: 2686 return false; 2687 } 2688 } 2689 2690 void 2691 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2692 { 2693 struct spdk_bdev *bdev = bdev_io->bdev; 2694 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2695 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2696 2697 assert(thread != NULL); 2698 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2699 2700 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2701 struct lba_range *range; 2702 2703 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2704 if (bdev_io_range_is_locked(bdev_io, range)) { 2705 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2706 return; 2707 } 2708 } 2709 } 2710 2711 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2712 2713 if (bdev_io_should_split(bdev_io)) { 2714 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2715 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2716 (uintptr_t)bdev_io, bdev_io->type, bdev_io->internal.caller_ctx, 2717 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks); 2718 bdev_io_split(NULL, bdev_io); 2719 return; 2720 } 2721 2722 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2723 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2724 _bdev_io_submit(bdev_io); 2725 } else { 2726 bdev_io->internal.io_submit_ch = ch; 2727 bdev_io->internal.ch = bdev->internal.qos->ch; 2728 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2729 } 2730 } else { 2731 _bdev_io_submit(bdev_io); 2732 } 2733 } 2734 2735 static void 2736 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 2737 { 2738 struct spdk_bdev *bdev = bdev_io->bdev; 2739 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2740 struct spdk_io_channel *ch = bdev_ch->channel; 2741 2742 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2743 2744 bdev_io->internal.in_submit_request = true; 2745 bdev->fn_table->submit_request(ch, bdev_io); 2746 bdev_io->internal.in_submit_request = false; 2747 } 2748 2749 void 2750 bdev_io_init(struct spdk_bdev_io *bdev_io, 2751 struct spdk_bdev *bdev, void *cb_arg, 2752 spdk_bdev_io_completion_cb cb) 2753 { 2754 bdev_io->bdev = bdev; 2755 bdev_io->internal.caller_ctx = cb_arg; 2756 bdev_io->internal.cb = cb; 2757 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 2758 bdev_io->internal.in_submit_request = false; 2759 bdev_io->internal.buf = NULL; 2760 bdev_io->internal.io_submit_ch = NULL; 2761 bdev_io->internal.orig_iovs = NULL; 2762 bdev_io->internal.orig_iovcnt = 0; 2763 bdev_io->internal.orig_md_iov.iov_base = NULL; 2764 bdev_io->internal.error.nvme.cdw0 = 0; 2765 bdev_io->num_retries = 0; 2766 bdev_io->internal.get_buf_cb = NULL; 2767 bdev_io->internal.get_aux_buf_cb = NULL; 2768 bdev_io->internal.ext_opts = NULL; 2769 } 2770 2771 static bool 2772 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2773 { 2774 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 2775 } 2776 2777 bool 2778 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2779 { 2780 bool supported; 2781 2782 supported = bdev_io_type_supported(bdev, io_type); 2783 2784 if (!supported) { 2785 switch (io_type) { 2786 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2787 /* The bdev layer will emulate write zeroes as long as write is supported. 
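* (The generic layer is expected to perform this emulation by issuing ordinary writes of zeroed buffers on the module's behalf.)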
*/ 2788 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 2789 break; 2790 default: 2791 break; 2792 } 2793 } 2794 2795 return supported; 2796 } 2797 2798 int 2799 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2800 { 2801 if (bdev->fn_table->dump_info_json) { 2802 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 2803 } 2804 2805 return 0; 2806 } 2807 2808 static void 2809 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 2810 { 2811 uint32_t max_per_timeslice = 0; 2812 int i; 2813 2814 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2815 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2816 qos->rate_limits[i].max_per_timeslice = 0; 2817 continue; 2818 } 2819 2820 max_per_timeslice = qos->rate_limits[i].limit * 2821 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 2822 2823 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 2824 qos->rate_limits[i].min_per_timeslice); 2825 2826 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 2827 } 2828 2829 bdev_qos_set_ops(qos); 2830 } 2831 2832 static int 2833 bdev_channel_poll_qos(void *arg) 2834 { 2835 struct spdk_bdev_qos *qos = arg; 2836 uint64_t now = spdk_get_ticks(); 2837 int i; 2838 2839 if (now < (qos->last_timeslice + qos->timeslice_size)) { 2840 /* We received our callback earlier than expected - return 2841 * immediately and wait to do accounting until at least one 2842 * timeslice has actually expired. This should never happen 2843 * with a well-behaved timer implementation. 2844 */ 2845 return SPDK_POLLER_IDLE; 2846 } 2847 2848 /* Reset for next round of rate limiting */ 2849 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2850 /* We may have allowed the IOs or bytes to slightly overrun in the last 2851 * timeslice. remaining_this_timeslice is signed, so if it's negative 2852 * here, we'll account for the overrun so that the next timeslice will 2853 * be appropriately reduced. 2854 */ 2855 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 2856 qos->rate_limits[i].remaining_this_timeslice = 0; 2857 } 2858 } 2859 2860 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 2861 qos->last_timeslice += qos->timeslice_size; 2862 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2863 qos->rate_limits[i].remaining_this_timeslice += 2864 qos->rate_limits[i].max_per_timeslice; 2865 } 2866 } 2867 2868 return bdev_qos_io_submit(qos->ch, qos); 2869 } 2870 2871 static void 2872 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 2873 { 2874 struct spdk_bdev_shared_resource *shared_resource; 2875 struct lba_range *range; 2876 2877 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 2878 range = TAILQ_FIRST(&ch->locked_ranges); 2879 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 2880 free(range); 2881 } 2882 2883 spdk_put_io_channel(ch->channel); 2884 2885 shared_resource = ch->shared_resource; 2886 2887 assert(TAILQ_EMPTY(&ch->io_locked)); 2888 assert(TAILQ_EMPTY(&ch->io_submitted)); 2889 assert(ch->io_outstanding == 0); 2890 assert(shared_resource->ref > 0); 2891 shared_resource->ref--; 2892 if (shared_resource->ref == 0) { 2893 assert(shared_resource->io_outstanding == 0); 2894 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 2895 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 2896 free(shared_resource); 2897 } 2898 } 2899 2900 /* Caller must hold bdev->internal.mutex. 
*/ 2901 static void 2902 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 2903 { 2904 struct spdk_bdev_qos *qos = bdev->internal.qos; 2905 int i; 2906 2907 /* Rate limiting on this bdev enabled */ 2908 if (qos) { 2909 if (qos->ch == NULL) { 2910 struct spdk_io_channel *io_ch; 2911 2912 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 2913 bdev->name, spdk_get_thread()); 2914 2915 /* No qos channel has been selected, so set one up */ 2916 2917 /* Take another reference to ch */ 2918 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 2919 assert(io_ch != NULL); 2920 qos->ch = ch; 2921 2922 qos->thread = spdk_io_channel_get_thread(io_ch); 2923 2924 TAILQ_INIT(&qos->queued); 2925 2926 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2927 if (bdev_qos_is_iops_rate_limit(i) == true) { 2928 qos->rate_limits[i].min_per_timeslice = 2929 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 2930 } else { 2931 qos->rate_limits[i].min_per_timeslice = 2932 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 2933 } 2934 2935 if (qos->rate_limits[i].limit == 0) { 2936 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 2937 } 2938 } 2939 bdev_qos_update_max_quota_per_timeslice(qos); 2940 qos->timeslice_size = 2941 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 2942 qos->last_timeslice = spdk_get_ticks(); 2943 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 2944 qos, 2945 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 2946 } 2947 2948 ch->flags |= BDEV_CH_QOS_ENABLED; 2949 } 2950 } 2951 2952 struct poll_timeout_ctx { 2953 struct spdk_bdev_desc *desc; 2954 uint64_t timeout_in_sec; 2955 spdk_bdev_io_timeout_cb cb_fn; 2956 void *cb_arg; 2957 }; 2958 2959 static void 2960 bdev_desc_free(struct spdk_bdev_desc *desc) 2961 { 2962 pthread_mutex_destroy(&desc->mutex); 2963 free(desc->media_events_buffer); 2964 free(desc); 2965 } 2966 2967 static void 2968 bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status) 2969 { 2970 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 2971 struct spdk_bdev_desc *desc = ctx->desc; 2972 2973 free(ctx); 2974 2975 pthread_mutex_lock(&desc->mutex); 2976 desc->refs--; 2977 if (desc->closed == true && desc->refs == 0) { 2978 pthread_mutex_unlock(&desc->mutex); 2979 bdev_desc_free(desc); 2980 return; 2981 } 2982 pthread_mutex_unlock(&desc->mutex); 2983 } 2984 2985 static void 2986 bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i) 2987 { 2988 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 2989 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 2990 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 2991 struct spdk_bdev_desc *desc = ctx->desc; 2992 struct spdk_bdev_io *bdev_io; 2993 uint64_t now; 2994 2995 pthread_mutex_lock(&desc->mutex); 2996 if (desc->closed == true) { 2997 pthread_mutex_unlock(&desc->mutex); 2998 spdk_for_each_channel_continue(i, -1); 2999 return; 3000 } 3001 pthread_mutex_unlock(&desc->mutex); 3002 3003 now = spdk_get_ticks(); 3004 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3005 /* Exclude any I/O that are generated via splitting. */ 3006 if (bdev_io->internal.cb == bdev_io_split_done) { 3007 continue; 3008 } 3009 3010 /* Once we find an I/O that has not timed out, we can immediately 3011 * exit the loop. 
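* This relies on io_submitted being kept in submission order (bdev_io_submit() appends new I/O at the tail), so every entry after this one was submitted later and cannot have timed out either.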
3012 */ 3013 if (now < (bdev_io->internal.submit_tsc + 3014 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3015 goto end; 3016 } 3017 3018 if (bdev_io->internal.desc == desc) { 3019 ctx->cb_fn(ctx->cb_arg, bdev_io); 3020 } 3021 } 3022 3023 end: 3024 spdk_for_each_channel_continue(i, 0); 3025 } 3026 3027 static int 3028 bdev_poll_timeout_io(void *arg) 3029 { 3030 struct spdk_bdev_desc *desc = arg; 3031 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3032 struct poll_timeout_ctx *ctx; 3033 3034 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3035 if (!ctx) { 3036 SPDK_ERRLOG("failed to allocate memory\n"); 3037 return SPDK_POLLER_BUSY; 3038 } 3039 ctx->desc = desc; 3040 ctx->cb_arg = desc->cb_arg; 3041 ctx->cb_fn = desc->cb_fn; 3042 ctx->timeout_in_sec = desc->timeout_in_sec; 3043 3044 /* Take a ref on the descriptor in case it gets closed while we are checking 3045 * all of the channels. 3046 */ 3047 pthread_mutex_lock(&desc->mutex); 3048 desc->refs++; 3049 pthread_mutex_unlock(&desc->mutex); 3050 3051 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3052 bdev_channel_poll_timeout_io, 3053 ctx, 3054 bdev_channel_poll_timeout_io_done); 3055 3056 return SPDK_POLLER_BUSY; 3057 } 3058 3059 int 3060 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3061 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3062 { 3063 assert(desc->thread == spdk_get_thread()); 3064 3065 spdk_poller_unregister(&desc->io_timeout_poller); 3066 3067 if (timeout_in_sec) { 3068 assert(cb_fn != NULL); 3069 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3070 desc, 3071 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3072 1000); 3073 if (desc->io_timeout_poller == NULL) { 3074 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3075 return -1; 3076 } 3077 } 3078 3079 desc->cb_fn = cb_fn; 3080 desc->cb_arg = cb_arg; 3081 desc->timeout_in_sec = timeout_in_sec; 3082 3083 return 0; 3084 } 3085 3086 static int 3087 bdev_channel_create(void *io_device, void *ctx_buf) 3088 { 3089 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3090 struct spdk_bdev_channel *ch = ctx_buf; 3091 struct spdk_io_channel *mgmt_io_ch; 3092 struct spdk_bdev_mgmt_channel *mgmt_ch; 3093 struct spdk_bdev_shared_resource *shared_resource; 3094 struct lba_range *range; 3095 3096 ch->bdev = bdev; 3097 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3098 if (!ch->channel) { 3099 return -1; 3100 } 3101 3102 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3103 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3104 3105 assert(ch->histogram == NULL); 3106 if (bdev->internal.histogram_enabled) { 3107 ch->histogram = spdk_histogram_data_alloc(); 3108 if (ch->histogram == NULL) { 3109 SPDK_ERRLOG("Could not allocate histogram\n"); 3110 } 3111 } 3112 3113 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3114 if (!mgmt_io_ch) { 3115 spdk_put_io_channel(ch->channel); 3116 return -1; 3117 } 3118 3119 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 3120 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3121 if (shared_resource->shared_ch == ch->channel) { 3122 spdk_put_io_channel(mgmt_io_ch); 3123 shared_resource->ref++; 3124 break; 3125 } 3126 } 3127 3128 if (shared_resource == NULL) { 3129 shared_resource = calloc(1, sizeof(*shared_resource)); 3130 if (shared_resource == NULL) { 3131 spdk_put_io_channel(ch->channel); 3132 spdk_put_io_channel(mgmt_io_ch); 3133 return -1; 3134 } 3135 3136 shared_resource->mgmt_ch = mgmt_ch; 3137 
shared_resource->io_outstanding = 0; 3138 TAILQ_INIT(&shared_resource->nomem_io); 3139 shared_resource->nomem_threshold = 0; 3140 shared_resource->shared_ch = ch->channel; 3141 shared_resource->ref = 1; 3142 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3143 } 3144 3145 memset(&ch->stat, 0, sizeof(ch->stat)); 3146 ch->stat.ticks_rate = spdk_get_ticks_hz(); 3147 ch->io_outstanding = 0; 3148 TAILQ_INIT(&ch->queued_resets); 3149 TAILQ_INIT(&ch->locked_ranges); 3150 ch->flags = 0; 3151 ch->shared_resource = shared_resource; 3152 3153 TAILQ_INIT(&ch->io_submitted); 3154 TAILQ_INIT(&ch->io_locked); 3155 3156 #ifdef SPDK_CONFIG_VTUNE 3157 { 3158 char *name; 3159 __itt_init_ittlib(NULL, 0); 3160 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3161 if (!name) { 3162 bdev_channel_destroy_resource(ch); 3163 return -1; 3164 } 3165 ch->handle = __itt_string_handle_create(name); 3166 free(name); 3167 ch->start_tsc = spdk_get_ticks(); 3168 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3169 memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); 3170 } 3171 #endif 3172 3173 pthread_mutex_lock(&bdev->internal.mutex); 3174 bdev_enable_qos(bdev, ch); 3175 3176 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3177 struct lba_range *new_range; 3178 3179 new_range = calloc(1, sizeof(*new_range)); 3180 if (new_range == NULL) { 3181 pthread_mutex_unlock(&bdev->internal.mutex); 3182 bdev_channel_destroy_resource(ch); 3183 return -1; 3184 } 3185 new_range->length = range->length; 3186 new_range->offset = range->offset; 3187 new_range->locked_ctx = range->locked_ctx; 3188 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3189 } 3190 3191 pthread_mutex_unlock(&bdev->internal.mutex); 3192 3193 return 0; 3194 } 3195 3196 /* 3197 * Abort I/O that are waiting on a data buffer. These types of I/O are 3198 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 3199 */ 3200 static void 3201 bdev_abort_all_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 3202 { 3203 bdev_io_stailq_t tmp; 3204 struct spdk_bdev_io *bdev_io; 3205 3206 STAILQ_INIT(&tmp); 3207 3208 while (!STAILQ_EMPTY(queue)) { 3209 bdev_io = STAILQ_FIRST(queue); 3210 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 3211 if (bdev_io->internal.ch == ch) { 3212 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3213 } else { 3214 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 3215 } 3216 } 3217 3218 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 3219 } 3220 3221 /* 3222 * Abort I/O that are queued waiting for submission. These types of I/O are 3223 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3224 */ 3225 static void 3226 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3227 { 3228 struct spdk_bdev_io *bdev_io, *tmp; 3229 3230 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3231 if (bdev_io->internal.ch == ch) { 3232 TAILQ_REMOVE(queue, bdev_io, internal.link); 3233 /* 3234 * spdk_bdev_io_complete() assumes that the completed I/O had 3235 * been submitted to the bdev module. Since in this case it 3236 * hadn't, bump io_outstanding to account for the decrement 3237 * that spdk_bdev_io_complete() will do. 
3238 */ 3239 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3240 ch->io_outstanding++; 3241 ch->shared_resource->io_outstanding++; 3242 } 3243 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3244 } 3245 } 3246 } 3247 3248 static bool 3249 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3250 { 3251 struct spdk_bdev_io *bdev_io; 3252 3253 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3254 if (bdev_io == bio_to_abort) { 3255 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3256 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3257 return true; 3258 } 3259 } 3260 3261 return false; 3262 } 3263 3264 static bool 3265 bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3266 { 3267 struct spdk_bdev_io *bdev_io; 3268 3269 STAILQ_FOREACH(bdev_io, queue, internal.buf_link) { 3270 if (bdev_io == bio_to_abort) { 3271 STAILQ_REMOVE(queue, bio_to_abort, spdk_bdev_io, internal.buf_link); 3272 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3273 return true; 3274 } 3275 } 3276 3277 return false; 3278 } 3279 3280 static void 3281 bdev_qos_channel_destroy(void *cb_arg) 3282 { 3283 struct spdk_bdev_qos *qos = cb_arg; 3284 3285 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3286 spdk_poller_unregister(&qos->poller); 3287 3288 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3289 3290 free(qos); 3291 } 3292 3293 static int 3294 bdev_qos_destroy(struct spdk_bdev *bdev) 3295 { 3296 int i; 3297 3298 /* 3299 * Cleanly shutting down the QoS poller is tricky, because 3300 * during the asynchronous operation the user could open 3301 * a new descriptor and create a new channel, spawning 3302 * a new QoS poller. 3303 * 3304 * The strategy is to create a new QoS structure here and swap it 3305 * in. The shutdown path then continues to refer to the old one 3306 * until it completes and then releases it. 3307 */ 3308 struct spdk_bdev_qos *new_qos, *old_qos; 3309 3310 old_qos = bdev->internal.qos; 3311 3312 new_qos = calloc(1, sizeof(*new_qos)); 3313 if (!new_qos) { 3314 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3315 return -ENOMEM; 3316 } 3317 3318 /* Copy the old QoS data into the newly allocated structure */ 3319 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3320 3321 /* Zero out the key parts of the QoS structure */ 3322 new_qos->ch = NULL; 3323 new_qos->thread = NULL; 3324 new_qos->poller = NULL; 3325 TAILQ_INIT(&new_qos->queued); 3326 /* 3327 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3328 * It will be used later for the new QoS structure. 3329 */ 3330 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3331 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3332 new_qos->rate_limits[i].min_per_timeslice = 0; 3333 new_qos->rate_limits[i].max_per_timeslice = 0; 3334 } 3335 3336 bdev->internal.qos = new_qos; 3337 3338 if (old_qos->thread == NULL) { 3339 free(old_qos); 3340 } else { 3341 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3342 } 3343 3344 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3345 * been destroyed yet. The destruction path will end up waiting for the final 3346 * channel to be put before it releases resources. 
*/ 3347 3348 return 0; 3349 } 3350 3351 static void 3352 bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3353 { 3354 total->bytes_read += add->bytes_read; 3355 total->num_read_ops += add->num_read_ops; 3356 total->bytes_written += add->bytes_written; 3357 total->num_write_ops += add->num_write_ops; 3358 total->bytes_unmapped += add->bytes_unmapped; 3359 total->num_unmap_ops += add->num_unmap_ops; 3360 total->read_latency_ticks += add->read_latency_ticks; 3361 total->write_latency_ticks += add->write_latency_ticks; 3362 total->unmap_latency_ticks += add->unmap_latency_ticks; 3363 } 3364 3365 static void 3366 bdev_channel_destroy(void *io_device, void *ctx_buf) 3367 { 3368 struct spdk_bdev_channel *ch = ctx_buf; 3369 struct spdk_bdev_mgmt_channel *mgmt_ch; 3370 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3371 3372 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3373 spdk_get_thread()); 3374 3375 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 3376 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3377 3378 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3379 pthread_mutex_lock(&ch->bdev->internal.mutex); 3380 bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); 3381 pthread_mutex_unlock(&ch->bdev->internal.mutex); 3382 3383 mgmt_ch = shared_resource->mgmt_ch; 3384 3385 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3386 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3387 bdev_abort_all_buf_io(&mgmt_ch->need_buf_small, ch); 3388 bdev_abort_all_buf_io(&mgmt_ch->need_buf_large, ch); 3389 3390 if (ch->histogram) { 3391 spdk_histogram_data_free(ch->histogram); 3392 } 3393 3394 bdev_channel_destroy_resource(ch); 3395 } 3396 3397 /* 3398 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 3399 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
3400 */ 3401 static int 3402 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3403 { 3404 struct spdk_bdev_name *tmp; 3405 3406 bdev_name->name = strdup(name); 3407 if (bdev_name->name == NULL) { 3408 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3409 return -ENOMEM; 3410 } 3411 3412 bdev_name->bdev = bdev; 3413 3414 pthread_mutex_lock(&g_bdev_mgr.mutex); 3415 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3416 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3417 3418 if (tmp != NULL) { 3419 SPDK_ERRLOG("Bdev name %s already exists\n", name); 3420 free(bdev_name->name); 3421 return -EEXIST; 3422 } 3423 3424 return 0; 3425 } 3426 3427 static void 3428 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 3429 { 3430 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3431 free(bdev_name->name); 3432 } 3433 3434 static void 3435 bdev_name_del(struct spdk_bdev_name *bdev_name) 3436 { 3437 pthread_mutex_lock(&g_bdev_mgr.mutex); 3438 bdev_name_del_unsafe(bdev_name); 3439 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3440 } 3441 3442 int 3443 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 3444 { 3445 struct spdk_bdev_alias *tmp; 3446 int ret; 3447 3448 if (alias == NULL) { 3449 SPDK_ERRLOG("Empty alias passed\n"); 3450 return -EINVAL; 3451 } 3452 3453 tmp = calloc(1, sizeof(*tmp)); 3454 if (tmp == NULL) { 3455 SPDK_ERRLOG("Unable to allocate alias\n"); 3456 return -ENOMEM; 3457 } 3458 3459 ret = bdev_name_add(&tmp->alias, bdev, alias); 3460 if (ret != 0) { 3461 free(tmp); 3462 return ret; 3463 } 3464 3465 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 3466 3467 return 0; 3468 } 3469 3470 static int 3471 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 3472 void (*alias_del_fn)(struct spdk_bdev_name *n)) 3473 { 3474 struct spdk_bdev_alias *tmp; 3475 3476 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 3477 if (strcmp(alias, tmp->alias.name) == 0) { 3478 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 3479 alias_del_fn(&tmp->alias); 3480 free(tmp); 3481 return 0; 3482 } 3483 } 3484 3485 return -ENOENT; 3486 } 3487 3488 int 3489 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 3490 { 3491 int rc; 3492 3493 rc = bdev_alias_del(bdev, alias, bdev_name_del); 3494 if (rc == -ENOENT) { 3495 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 3496 } 3497 3498 return rc; 3499 } 3500 3501 void 3502 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 3503 { 3504 struct spdk_bdev_alias *p, *tmp; 3505 3506 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 3507 TAILQ_REMOVE(&bdev->aliases, p, tailq); 3508 bdev_name_del(&p->alias); 3509 free(p); 3510 } 3511 } 3512 3513 struct spdk_io_channel * 3514 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 3515 { 3516 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 3517 } 3518 3519 void * 3520 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 3521 { 3522 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3523 void *ctx = NULL; 3524 3525 if (bdev->fn_table->get_module_ctx) { 3526 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 3527 } 3528 3529 return ctx; 3530 } 3531 3532 const char * 3533 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 3534 { 3535 return bdev->module->name; 3536 } 3537 3538 const char * 3539 spdk_bdev_get_name(const struct spdk_bdev *bdev) 3540 { 3541 return bdev->name; 3542 } 3543 3544 const char * 3545 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 3546 { 3547 return bdev->product_name; 3548 } 3549 
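/* Illustrative usage of the getters in this section (a sketch, not part of the upstream API surface; 'desc' is assumed to be an already opened struct spdk_bdev_desc *):
 *
 *	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
 *	uint64_t capacity_bytes = spdk_bdev_get_num_blocks(bdev) *
 *				  (uint64_t)spdk_bdev_get_block_size(bdev);
 */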
3550 const struct spdk_bdev_aliases_list * 3551 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 3552 { 3553 return &bdev->aliases; 3554 } 3555 3556 uint32_t 3557 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 3558 { 3559 return bdev->blocklen; 3560 } 3561 3562 uint32_t 3563 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 3564 { 3565 return bdev->write_unit_size; 3566 } 3567 3568 uint64_t 3569 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 3570 { 3571 return bdev->blockcnt; 3572 } 3573 3574 const char * 3575 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 3576 { 3577 return qos_rpc_type[type]; 3578 } 3579 3580 void 3581 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 3582 { 3583 int i; 3584 3585 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 3586 3587 pthread_mutex_lock(&bdev->internal.mutex); 3588 if (bdev->internal.qos) { 3589 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3590 if (bdev->internal.qos->rate_limits[i].limit != 3591 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3592 limits[i] = bdev->internal.qos->rate_limits[i].limit; 3593 if (bdev_qos_is_iops_rate_limit(i) == false) { 3594 /* Change from Byte to Megabyte which is user visible. */ 3595 limits[i] = limits[i] / 1024 / 1024; 3596 } 3597 } 3598 } 3599 } 3600 pthread_mutex_unlock(&bdev->internal.mutex); 3601 } 3602 3603 size_t 3604 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 3605 { 3606 return 1 << bdev->required_alignment; 3607 } 3608 3609 uint32_t 3610 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 3611 { 3612 return bdev->optimal_io_boundary; 3613 } 3614 3615 bool 3616 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 3617 { 3618 return bdev->write_cache; 3619 } 3620 3621 const struct spdk_uuid * 3622 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 3623 { 3624 return &bdev->uuid; 3625 } 3626 3627 uint16_t 3628 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 3629 { 3630 return bdev->acwu; 3631 } 3632 3633 uint32_t 3634 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 3635 { 3636 return bdev->md_len; 3637 } 3638 3639 bool 3640 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 3641 { 3642 return (bdev->md_len != 0) && bdev->md_interleave; 3643 } 3644 3645 bool 3646 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 3647 { 3648 return (bdev->md_len != 0) && !bdev->md_interleave; 3649 } 3650 3651 bool 3652 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 3653 { 3654 return bdev->zoned; 3655 } 3656 3657 uint32_t 3658 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 3659 { 3660 if (spdk_bdev_is_md_interleaved(bdev)) { 3661 return bdev->blocklen - bdev->md_len; 3662 } else { 3663 return bdev->blocklen; 3664 } 3665 } 3666 3667 uint32_t 3668 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 3669 { 3670 return bdev->phys_blocklen; 3671 } 3672 3673 static uint32_t 3674 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 3675 { 3676 if (!spdk_bdev_is_md_interleaved(bdev)) { 3677 return bdev->blocklen + bdev->md_len; 3678 } else { 3679 return bdev->blocklen; 3680 } 3681 } 3682 3683 enum spdk_dif_type spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 3684 { 3685 if (bdev->md_len != 0) { 3686 return bdev->dif_type; 3687 } else { 3688 return SPDK_DIF_DISABLE; 3689 } 3690 } 3691 3692 bool 3693 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 3694 { 3695 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 3696 return bdev->dif_is_head_of_md; 3697 
} else { 3698 return false; 3699 } 3700 } 3701 3702 bool 3703 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 3704 enum spdk_dif_check_type check_type) 3705 { 3706 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 3707 return false; 3708 } 3709 3710 switch (check_type) { 3711 case SPDK_DIF_CHECK_TYPE_REFTAG: 3712 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 3713 case SPDK_DIF_CHECK_TYPE_APPTAG: 3714 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 3715 case SPDK_DIF_CHECK_TYPE_GUARD: 3716 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 3717 default: 3718 return false; 3719 } 3720 } 3721 3722 uint64_t 3723 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 3724 { 3725 return bdev->internal.measured_queue_depth; 3726 } 3727 3728 uint64_t 3729 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 3730 { 3731 return bdev->internal.period; 3732 } 3733 3734 uint64_t 3735 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 3736 { 3737 return bdev->internal.weighted_io_time; 3738 } 3739 3740 uint64_t 3741 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 3742 { 3743 return bdev->internal.io_time; 3744 } 3745 3746 static void 3747 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 3748 { 3749 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3750 3751 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 3752 3753 if (bdev->internal.measured_queue_depth) { 3754 bdev->internal.io_time += bdev->internal.period; 3755 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 3756 } 3757 } 3758 3759 static void 3760 _calculate_measured_qd(struct spdk_io_channel_iter *i) 3761 { 3762 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3763 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 3764 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 3765 3766 bdev->internal.temporary_queue_depth += ch->io_outstanding; 3767 spdk_for_each_channel_continue(i, 0); 3768 } 3769 3770 static int 3771 bdev_calculate_measured_queue_depth(void *ctx) 3772 { 3773 struct spdk_bdev *bdev = ctx; 3774 bdev->internal.temporary_queue_depth = 0; 3775 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 3776 _calculate_measured_qd_cpl); 3777 return SPDK_POLLER_BUSY; 3778 } 3779 3780 void 3781 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 3782 { 3783 bdev->internal.period = period; 3784 3785 if (bdev->internal.qd_poller != NULL) { 3786 spdk_poller_unregister(&bdev->internal.qd_poller); 3787 bdev->internal.measured_queue_depth = UINT64_MAX; 3788 } 3789 3790 if (period != 0) { 3791 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, bdev, 3792 period); 3793 } 3794 } 3795 3796 static void 3797 _resize_notify(void *arg) 3798 { 3799 struct spdk_bdev_desc *desc = arg; 3800 3801 pthread_mutex_lock(&desc->mutex); 3802 desc->refs--; 3803 if (!desc->closed) { 3804 pthread_mutex_unlock(&desc->mutex); 3805 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 3806 desc->bdev, 3807 desc->callback.ctx); 3808 return; 3809 } else if (0 == desc->refs) { 3810 /* This descriptor was closed after this resize_notify message was sent. 3811 * spdk_bdev_close() could not free the descriptor since this message was 3812 * in flight, so we free it now using bdev_desc_free(). 
3813 */ 3814 pthread_mutex_unlock(&desc->mutex); 3815 bdev_desc_free(desc); 3816 return; 3817 } 3818 pthread_mutex_unlock(&desc->mutex); 3819 } 3820 3821 int 3822 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 3823 { 3824 struct spdk_bdev_desc *desc; 3825 int ret; 3826 3827 if (size == bdev->blockcnt) { 3828 return 0; 3829 } 3830 3831 pthread_mutex_lock(&bdev->internal.mutex); 3832 3833 /* bdev has open descriptors */ 3834 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 3835 bdev->blockcnt > size) { 3836 ret = -EBUSY; 3837 } else { 3838 bdev->blockcnt = size; 3839 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 3840 pthread_mutex_lock(&desc->mutex); 3841 if (!desc->closed) { 3842 desc->refs++; 3843 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 3844 } 3845 pthread_mutex_unlock(&desc->mutex); 3846 } 3847 ret = 0; 3848 } 3849 3850 pthread_mutex_unlock(&bdev->internal.mutex); 3851 3852 return ret; 3853 } 3854 3855 /* 3856 * Convert I/O offset and length from bytes to blocks. 3857 * 3858 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 3859 */ 3860 static uint64_t 3861 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 3862 uint64_t num_bytes, uint64_t *num_blocks) 3863 { 3864 uint32_t block_size = bdev->blocklen; 3865 uint8_t shift_cnt; 3866 3867 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 3868 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 3869 shift_cnt = spdk_u32log2(block_size); 3870 *offset_blocks = offset_bytes >> shift_cnt; 3871 *num_blocks = num_bytes >> shift_cnt; 3872 return (offset_bytes - (*offset_blocks << shift_cnt)) | 3873 (num_bytes - (*num_blocks << shift_cnt)); 3874 } else { 3875 *offset_blocks = offset_bytes / block_size; 3876 *num_blocks = num_bytes / block_size; 3877 return (offset_bytes % block_size) | (num_bytes % block_size); 3878 } 3879 } 3880 3881 static bool 3882 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 3883 { 3884 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 3885 * has been an overflow and hence the offset has been wrapped around */ 3886 if (offset_blocks + num_blocks < offset_blocks) { 3887 return false; 3888 } 3889 3890 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 3891 if (offset_blocks + num_blocks > bdev->blockcnt) { 3892 return false; 3893 } 3894 3895 return true; 3896 } 3897 3898 static bool 3899 _bdev_io_check_md_buf(const struct iovec *iovs, const void *md_buf) 3900 { 3901 return _is_buf_allocated(iovs) == (md_buf != NULL); 3902 } 3903 3904 static int 3905 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 3906 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3907 spdk_bdev_io_completion_cb cb, void *cb_arg) 3908 { 3909 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3910 struct spdk_bdev_io *bdev_io; 3911 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3912 3913 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3914 return -EINVAL; 3915 } 3916 3917 bdev_io = bdev_channel_get_io(channel); 3918 if (!bdev_io) { 3919 return -ENOMEM; 3920 } 3921 3922 bdev_io->internal.ch = channel; 3923 bdev_io->internal.desc = desc; 3924 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 3925 bdev_io->u.bdev.iovs = &bdev_io->iov; 3926 bdev_io->u.bdev.iovs[0].iov_base = buf; 3927 
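/* Usage sketch (illustrative only, shown as a comment): the caller side of the
 * read path assembled here. spdk_bdev_read() accepts byte offsets, but they
 * must be multiples of the block size (otherwise -EINVAL is returned);
 * spdk_bdev_read_blocks() takes block units directly. The callback name
 * example_read_done and the variables buf/io_ch/num_blocks are hypothetical.
 *
 *   static void
 *   example_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *   {
 *           if (!success) {
 *                   SPDK_ERRLOG("read failed\n");
 *           }
 *           spdk_bdev_free_io(bdev_io);
 *   }
 *
 *   rc = spdk_bdev_read_blocks(desc, io_ch, buf, 0, num_blocks,
 *                              example_read_done, NULL);
 *   if (rc == -ENOMEM) {
 *           // no spdk_bdev_io available right now; retry later, e.g. via
 *           // spdk_bdev_queue_io_wait()
 *   }
 *
 * The buffer must satisfy spdk_bdev_get_buf_align() and hold
 * num_blocks * blocklen bytes.
 */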
bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 3928 bdev_io->u.bdev.iovcnt = 1; 3929 bdev_io->u.bdev.md_buf = md_buf; 3930 bdev_io->u.bdev.num_blocks = num_blocks; 3931 bdev_io->u.bdev.offset_blocks = offset_blocks; 3932 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3933 3934 bdev_io_submit(bdev_io); 3935 return 0; 3936 } 3937 3938 int 3939 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3940 void *buf, uint64_t offset, uint64_t nbytes, 3941 spdk_bdev_io_completion_cb cb, void *cb_arg) 3942 { 3943 uint64_t offset_blocks, num_blocks; 3944 3945 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3946 nbytes, &num_blocks) != 0) { 3947 return -EINVAL; 3948 } 3949 3950 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 3951 } 3952 3953 int 3954 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3955 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 3956 spdk_bdev_io_completion_cb cb, void *cb_arg) 3957 { 3958 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 3959 } 3960 3961 int 3962 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3963 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3964 spdk_bdev_io_completion_cb cb, void *cb_arg) 3965 { 3966 struct iovec iov = { 3967 .iov_base = buf, 3968 }; 3969 3970 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3971 return -EINVAL; 3972 } 3973 3974 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 3975 return -EINVAL; 3976 } 3977 3978 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 3979 cb, cb_arg); 3980 } 3981 3982 int 3983 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3984 struct iovec *iov, int iovcnt, 3985 uint64_t offset, uint64_t nbytes, 3986 spdk_bdev_io_completion_cb cb, void *cb_arg) 3987 { 3988 uint64_t offset_blocks, num_blocks; 3989 3990 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3991 nbytes, &num_blocks) != 0) { 3992 return -EINVAL; 3993 } 3994 3995 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 3996 } 3997 3998 static int 3999 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4000 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4001 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 4002 struct spdk_bdev_ext_io_opts *opts) 4003 { 4004 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4005 struct spdk_bdev_io *bdev_io; 4006 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4007 4008 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4009 return -EINVAL; 4010 } 4011 4012 bdev_io = bdev_channel_get_io(channel); 4013 if (!bdev_io) { 4014 return -ENOMEM; 4015 } 4016 4017 bdev_io->internal.ch = channel; 4018 bdev_io->internal.desc = desc; 4019 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4020 bdev_io->u.bdev.iovs = iov; 4021 bdev_io->u.bdev.iovcnt = iovcnt; 4022 bdev_io->u.bdev.md_buf = md_buf; 4023 bdev_io->u.bdev.num_blocks = num_blocks; 4024 bdev_io->u.bdev.offset_blocks = offset_blocks; 4025 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4026 bdev_io->internal.ext_opts = opts; 4027 4028 bdev_io_submit(bdev_io); 4029 return 0; 4030 } 4031 4032 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4033 struct iovec *iov, int iovcnt, 4034 uint64_t 
offset_blocks, uint64_t num_blocks, 4035 spdk_bdev_io_completion_cb cb, void *cb_arg) 4036 { 4037 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4038 num_blocks, cb, cb_arg, NULL); 4039 } 4040 4041 int 4042 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4043 struct iovec *iov, int iovcnt, void *md_buf, 4044 uint64_t offset_blocks, uint64_t num_blocks, 4045 spdk_bdev_io_completion_cb cb, void *cb_arg) 4046 { 4047 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4048 return -EINVAL; 4049 } 4050 4051 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4052 return -EINVAL; 4053 } 4054 4055 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4056 num_blocks, cb, cb_arg, NULL); 4057 } 4058 4059 int 4060 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4061 struct iovec *iov, int iovcnt, 4062 uint64_t offset_blocks, uint64_t num_blocks, 4063 spdk_bdev_io_completion_cb cb, void *cb_arg, 4064 struct spdk_bdev_ext_io_opts *opts) 4065 { 4066 void *md = NULL; 4067 4068 if (opts) { 4069 md = opts->metadata; 4070 } 4071 4072 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4073 return -EINVAL; 4074 } 4075 4076 if (md && !_bdev_io_check_md_buf(iov, md)) { 4077 return -EINVAL; 4078 } 4079 4080 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4081 num_blocks, cb, cb_arg, opts); 4082 } 4083 4084 static int 4085 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4086 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4087 spdk_bdev_io_completion_cb cb, void *cb_arg) 4088 { 4089 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4090 struct spdk_bdev_io *bdev_io; 4091 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4092 4093 if (!desc->write) { 4094 return -EBADF; 4095 } 4096 4097 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4098 return -EINVAL; 4099 } 4100 4101 bdev_io = bdev_channel_get_io(channel); 4102 if (!bdev_io) { 4103 return -ENOMEM; 4104 } 4105 4106 bdev_io->internal.ch = channel; 4107 bdev_io->internal.desc = desc; 4108 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4109 bdev_io->u.bdev.iovs = &bdev_io->iov; 4110 bdev_io->u.bdev.iovs[0].iov_base = buf; 4111 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4112 bdev_io->u.bdev.iovcnt = 1; 4113 bdev_io->u.bdev.md_buf = md_buf; 4114 bdev_io->u.bdev.num_blocks = num_blocks; 4115 bdev_io->u.bdev.offset_blocks = offset_blocks; 4116 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4117 4118 bdev_io_submit(bdev_io); 4119 return 0; 4120 } 4121 4122 int 4123 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4124 void *buf, uint64_t offset, uint64_t nbytes, 4125 spdk_bdev_io_completion_cb cb, void *cb_arg) 4126 { 4127 uint64_t offset_blocks, num_blocks; 4128 4129 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4130 nbytes, &num_blocks) != 0) { 4131 return -EINVAL; 4132 } 4133 4134 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4135 } 4136 4137 int 4138 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4139 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4140 spdk_bdev_io_completion_cb cb, void *cb_arg) 4141 { 4142 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4143 cb, cb_arg); 4144 } 4145 4146 int 4147 
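/* Usage sketch (illustrative only, shown as a comment): the write helpers
 * mirror the read helpers. spdk_bdev_write() converts byte offsets to blocks
 * first, spdk_bdev_write_blocks() takes blocks directly, and both return
 * -EBADF if the descriptor was not opened for writing. The names
 * example_write_done, handle_write_error and ctx are hypothetical.
 *
 *   rc = spdk_bdev_write_blocks(desc, io_ch, buf, offset_blocks, num_blocks,
 *                               example_write_done, ctx);
 *   if (rc == -ENOMEM) {
 *           // resubmit later, e.g. via spdk_bdev_queue_io_wait()
 *   } else if (rc != 0) {
 *           handle_write_error(ctx, rc);    // e.g. -EBADF or -EINVAL
 *   }
 */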
spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4148 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4149 spdk_bdev_io_completion_cb cb, void *cb_arg) 4150 { 4151 struct iovec iov = { 4152 .iov_base = buf, 4153 }; 4154 4155 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4156 return -EINVAL; 4157 } 4158 4159 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4160 return -EINVAL; 4161 } 4162 4163 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4164 cb, cb_arg); 4165 } 4166 4167 static int 4168 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4169 struct iovec *iov, int iovcnt, void *md_buf, 4170 uint64_t offset_blocks, uint64_t num_blocks, 4171 spdk_bdev_io_completion_cb cb, void *cb_arg, 4172 struct spdk_bdev_ext_io_opts *opts) 4173 { 4174 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4175 struct spdk_bdev_io *bdev_io; 4176 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4177 4178 if (!desc->write) { 4179 return -EBADF; 4180 } 4181 4182 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4183 return -EINVAL; 4184 } 4185 4186 bdev_io = bdev_channel_get_io(channel); 4187 if (!bdev_io) { 4188 return -ENOMEM; 4189 } 4190 4191 bdev_io->internal.ch = channel; 4192 bdev_io->internal.desc = desc; 4193 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4194 bdev_io->u.bdev.iovs = iov; 4195 bdev_io->u.bdev.iovcnt = iovcnt; 4196 bdev_io->u.bdev.md_buf = md_buf; 4197 bdev_io->u.bdev.num_blocks = num_blocks; 4198 bdev_io->u.bdev.offset_blocks = offset_blocks; 4199 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4200 bdev_io->internal.ext_opts = opts; 4201 4202 bdev_io_submit(bdev_io); 4203 return 0; 4204 } 4205 4206 int 4207 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4208 struct iovec *iov, int iovcnt, 4209 uint64_t offset, uint64_t len, 4210 spdk_bdev_io_completion_cb cb, void *cb_arg) 4211 { 4212 uint64_t offset_blocks, num_blocks; 4213 4214 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4215 len, &num_blocks) != 0) { 4216 return -EINVAL; 4217 } 4218 4219 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4220 } 4221 4222 int 4223 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4224 struct iovec *iov, int iovcnt, 4225 uint64_t offset_blocks, uint64_t num_blocks, 4226 spdk_bdev_io_completion_cb cb, void *cb_arg) 4227 { 4228 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4229 num_blocks, cb, cb_arg, NULL); 4230 } 4231 4232 int 4233 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4234 struct iovec *iov, int iovcnt, void *md_buf, 4235 uint64_t offset_blocks, uint64_t num_blocks, 4236 spdk_bdev_io_completion_cb cb, void *cb_arg) 4237 { 4238 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4239 return -EINVAL; 4240 } 4241 4242 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4243 return -EINVAL; 4244 } 4245 4246 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4247 num_blocks, cb, cb_arg, NULL); 4248 } 4249 4250 int 4251 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4252 struct iovec *iov, int iovcnt, 4253 uint64_t offset_blocks, uint64_t num_blocks, 4254 spdk_bdev_io_completion_cb cb, void *cb_arg, 4255 struct spdk_bdev_ext_io_opts *opts) 4256 { 4257 void *md = 
NULL; 4258 4259 if (opts) { 4260 md = opts->metadata; 4261 } 4262 4263 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4264 return -EINVAL; 4265 } 4266 4267 if (md && !_bdev_io_check_md_buf(iov, md)) { 4268 return -EINVAL; 4269 } 4270 4271 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4272 num_blocks, cb, cb_arg, opts); 4273 } 4274 4275 static void 4276 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4277 { 4278 struct spdk_bdev_io *parent_io = cb_arg; 4279 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 4280 int i, rc = 0; 4281 4282 if (!success) { 4283 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4284 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4285 spdk_bdev_free_io(bdev_io); 4286 return; 4287 } 4288 4289 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 4290 rc = memcmp(read_buf, 4291 parent_io->u.bdev.iovs[i].iov_base, 4292 parent_io->u.bdev.iovs[i].iov_len); 4293 if (rc) { 4294 break; 4295 } 4296 read_buf += parent_io->u.bdev.iovs[i].iov_len; 4297 } 4298 4299 spdk_bdev_free_io(bdev_io); 4300 4301 if (rc == 0) { 4302 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4303 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 4304 } else { 4305 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 4306 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4307 } 4308 } 4309 4310 static void 4311 bdev_compare_do_read(void *_bdev_io) 4312 { 4313 struct spdk_bdev_io *bdev_io = _bdev_io; 4314 int rc; 4315 4316 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 4317 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 4318 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4319 bdev_compare_do_read_done, bdev_io); 4320 4321 if (rc == -ENOMEM) { 4322 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 4323 } else if (rc != 0) { 4324 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4325 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4326 } 4327 } 4328 4329 static int 4330 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4331 struct iovec *iov, int iovcnt, void *md_buf, 4332 uint64_t offset_blocks, uint64_t num_blocks, 4333 spdk_bdev_io_completion_cb cb, void *cb_arg) 4334 { 4335 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4336 struct spdk_bdev_io *bdev_io; 4337 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4338 4339 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4340 return -EINVAL; 4341 } 4342 4343 bdev_io = bdev_channel_get_io(channel); 4344 if (!bdev_io) { 4345 return -ENOMEM; 4346 } 4347 4348 bdev_io->internal.ch = channel; 4349 bdev_io->internal.desc = desc; 4350 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4351 bdev_io->u.bdev.iovs = iov; 4352 bdev_io->u.bdev.iovcnt = iovcnt; 4353 bdev_io->u.bdev.md_buf = md_buf; 4354 bdev_io->u.bdev.num_blocks = num_blocks; 4355 bdev_io->u.bdev.offset_blocks = offset_blocks; 4356 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4357 4358 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4359 bdev_io_submit(bdev_io); 4360 return 0; 4361 } 4362 4363 bdev_compare_do_read(bdev_io); 4364 4365 return 0; 4366 } 4367 4368 int 4369 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4370 struct iovec *iov, int iovcnt, 4371 uint64_t offset_blocks, uint64_t num_blocks, 4372 spdk_bdev_io_completion_cb cb, void *cb_arg) 
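/* Usage sketch (illustrative only, shown as a comment): a compare request
 * checks the caller's buffers against the on-disk data. When the backing
 * module does not support SPDK_BDEV_IO_TYPE_COMPARE natively, the helpers
 * above emulate it by reading the range and memcmp()-ing it against the
 * caller's iovecs; a mismatch completes the I/O with
 * SPDK_BDEV_IO_STATUS_MISCOMPARE (success == false in the callback). The
 * names expected_buf, example_compare_done and ctx are hypothetical.
 *
 *   struct iovec iov = {
 *           .iov_base = expected_buf,
 *           .iov_len = num_blocks * spdk_bdev_get_block_size(bdev),
 *   };
 *
 *   rc = spdk_bdev_comparev_blocks(desc, io_ch, &iov, 1, offset_blocks,
 *                                  num_blocks, example_compare_done, ctx);
 */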
4373 { 4374 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4375 num_blocks, cb, cb_arg); 4376 } 4377 4378 int 4379 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4380 struct iovec *iov, int iovcnt, void *md_buf, 4381 uint64_t offset_blocks, uint64_t num_blocks, 4382 spdk_bdev_io_completion_cb cb, void *cb_arg) 4383 { 4384 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4385 return -EINVAL; 4386 } 4387 4388 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4389 return -EINVAL; 4390 } 4391 4392 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4393 num_blocks, cb, cb_arg); 4394 } 4395 4396 static int 4397 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4398 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4399 spdk_bdev_io_completion_cb cb, void *cb_arg) 4400 { 4401 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4402 struct spdk_bdev_io *bdev_io; 4403 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4404 4405 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4406 return -EINVAL; 4407 } 4408 4409 bdev_io = bdev_channel_get_io(channel); 4410 if (!bdev_io) { 4411 return -ENOMEM; 4412 } 4413 4414 bdev_io->internal.ch = channel; 4415 bdev_io->internal.desc = desc; 4416 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4417 bdev_io->u.bdev.iovs = &bdev_io->iov; 4418 bdev_io->u.bdev.iovs[0].iov_base = buf; 4419 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4420 bdev_io->u.bdev.iovcnt = 1; 4421 bdev_io->u.bdev.md_buf = md_buf; 4422 bdev_io->u.bdev.num_blocks = num_blocks; 4423 bdev_io->u.bdev.offset_blocks = offset_blocks; 4424 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4425 4426 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4427 bdev_io_submit(bdev_io); 4428 return 0; 4429 } 4430 4431 bdev_compare_do_read(bdev_io); 4432 4433 return 0; 4434 } 4435 4436 int 4437 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4438 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4439 spdk_bdev_io_completion_cb cb, void *cb_arg) 4440 { 4441 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4442 cb, cb_arg); 4443 } 4444 4445 int 4446 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4447 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4448 spdk_bdev_io_completion_cb cb, void *cb_arg) 4449 { 4450 struct iovec iov = { 4451 .iov_base = buf, 4452 }; 4453 4454 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4455 return -EINVAL; 4456 } 4457 4458 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4459 return -EINVAL; 4460 } 4461 4462 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4463 cb, cb_arg); 4464 } 4465 4466 static void 4467 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 4468 { 4469 struct spdk_bdev_io *bdev_io = ctx; 4470 4471 if (unlock_status) { 4472 SPDK_ERRLOG("LBA range unlock failed\n"); 4473 } 4474 4475 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
true : 4476 false, bdev_io->internal.caller_ctx); 4477 } 4478 4479 static void 4480 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 4481 { 4482 bdev_io->internal.status = status; 4483 4484 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 4485 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4486 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 4487 } 4488 4489 static void 4490 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4491 { 4492 struct spdk_bdev_io *parent_io = cb_arg; 4493 4494 if (!success) { 4495 SPDK_ERRLOG("Compare and write operation failed\n"); 4496 } 4497 4498 spdk_bdev_free_io(bdev_io); 4499 4500 bdev_comparev_and_writev_blocks_unlock(parent_io, 4501 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 4502 } 4503 4504 static void 4505 bdev_compare_and_write_do_write(void *_bdev_io) 4506 { 4507 struct spdk_bdev_io *bdev_io = _bdev_io; 4508 int rc; 4509 4510 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 4511 spdk_io_channel_from_ctx(bdev_io->internal.ch), 4512 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 4513 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4514 bdev_compare_and_write_do_write_done, bdev_io); 4515 4516 4517 if (rc == -ENOMEM) { 4518 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 4519 } else if (rc != 0) { 4520 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 4521 } 4522 } 4523 4524 static void 4525 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4526 { 4527 struct spdk_bdev_io *parent_io = cb_arg; 4528 4529 spdk_bdev_free_io(bdev_io); 4530 4531 if (!success) { 4532 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 4533 return; 4534 } 4535 4536 bdev_compare_and_write_do_write(parent_io); 4537 } 4538 4539 static void 4540 bdev_compare_and_write_do_compare(void *_bdev_io) 4541 { 4542 struct spdk_bdev_io *bdev_io = _bdev_io; 4543 int rc; 4544 4545 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 4546 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 4547 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4548 bdev_compare_and_write_do_compare_done, bdev_io); 4549 4550 if (rc == -ENOMEM) { 4551 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 4552 } else if (rc != 0) { 4553 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 4554 } 4555 } 4556 4557 static void 4558 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 4559 { 4560 struct spdk_bdev_io *bdev_io = ctx; 4561 4562 if (status) { 4563 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 4564 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4565 return; 4566 } 4567 4568 bdev_compare_and_write_do_compare(bdev_io); 4569 } 4570 4571 int 4572 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4573 struct iovec *compare_iov, int compare_iovcnt, 4574 struct iovec *write_iov, int write_iovcnt, 4575 uint64_t offset_blocks, uint64_t num_blocks, 4576 spdk_bdev_io_completion_cb cb, void *cb_arg) 4577 { 4578 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4579 struct spdk_bdev_io *bdev_io; 4580 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4581 4582 if (!desc->write) { 4583 return 
-EBADF; 4584 } 4585 4586 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4587 return -EINVAL; 4588 } 4589 4590 if (num_blocks > bdev->acwu) { 4591 return -EINVAL; 4592 } 4593 4594 bdev_io = bdev_channel_get_io(channel); 4595 if (!bdev_io) { 4596 return -ENOMEM; 4597 } 4598 4599 bdev_io->internal.ch = channel; 4600 bdev_io->internal.desc = desc; 4601 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 4602 bdev_io->u.bdev.iovs = compare_iov; 4603 bdev_io->u.bdev.iovcnt = compare_iovcnt; 4604 bdev_io->u.bdev.fused_iovs = write_iov; 4605 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 4606 bdev_io->u.bdev.md_buf = NULL; 4607 bdev_io->u.bdev.num_blocks = num_blocks; 4608 bdev_io->u.bdev.offset_blocks = offset_blocks; 4609 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4610 4611 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 4612 bdev_io_submit(bdev_io); 4613 return 0; 4614 } 4615 4616 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 4617 bdev_comparev_and_writev_blocks_locked, bdev_io); 4618 } 4619 4620 int 4621 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4622 struct iovec *iov, int iovcnt, 4623 uint64_t offset_blocks, uint64_t num_blocks, 4624 bool populate, 4625 spdk_bdev_io_completion_cb cb, void *cb_arg) 4626 { 4627 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4628 struct spdk_bdev_io *bdev_io; 4629 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4630 4631 if (!desc->write) { 4632 return -EBADF; 4633 } 4634 4635 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4636 return -EINVAL; 4637 } 4638 4639 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 4640 return -ENOTSUP; 4641 } 4642 4643 bdev_io = bdev_channel_get_io(channel); 4644 if (!bdev_io) { 4645 return -ENOMEM; 4646 } 4647 4648 bdev_io->internal.ch = channel; 4649 bdev_io->internal.desc = desc; 4650 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 4651 bdev_io->u.bdev.num_blocks = num_blocks; 4652 bdev_io->u.bdev.offset_blocks = offset_blocks; 4653 bdev_io->u.bdev.iovs = iov; 4654 bdev_io->u.bdev.iovcnt = iovcnt; 4655 bdev_io->u.bdev.md_buf = NULL; 4656 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 4657 bdev_io->u.bdev.zcopy.commit = 0; 4658 bdev_io->u.bdev.zcopy.start = 1; 4659 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4660 4661 bdev_io_submit(bdev_io); 4662 4663 return 0; 4664 } 4665 4666 int 4667 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 4668 spdk_bdev_io_completion_cb cb, void *cb_arg) 4669 { 4670 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 4671 return -EINVAL; 4672 } 4673 4674 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 4675 bdev_io->u.bdev.zcopy.start = 0; 4676 bdev_io->internal.caller_ctx = cb_arg; 4677 bdev_io->internal.cb = cb; 4678 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 4679 4680 bdev_io_submit(bdev_io); 4681 4682 return 0; 4683 } 4684 4685 int 4686 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4687 uint64_t offset, uint64_t len, 4688 spdk_bdev_io_completion_cb cb, void *cb_arg) 4689 { 4690 uint64_t offset_blocks, num_blocks; 4691 4692 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4693 len, &num_blocks) != 0) { 4694 return -EINVAL; 4695 } 4696 4697 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4698 } 4699 4700 int 4701 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4702 uint64_t offset_blocks, uint64_t num_blocks, 4703 spdk_bdev_io_completion_cb cb, void *cb_arg) 4704 { 4705 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4706 struct spdk_bdev_io *bdev_io; 4707 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4708 4709 if (!desc->write) { 4710 return -EBADF; 4711 } 4712 4713 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4714 return -EINVAL; 4715 } 4716 4717 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 4718 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 4719 return -ENOTSUP; 4720 } 4721 4722 bdev_io = bdev_channel_get_io(channel); 4723 4724 if (!bdev_io) { 4725 return -ENOMEM; 4726 } 4727 4728 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 4729 bdev_io->internal.ch = channel; 4730 bdev_io->internal.desc = desc; 4731 bdev_io->u.bdev.offset_blocks = offset_blocks; 4732 bdev_io->u.bdev.num_blocks = num_blocks; 4733 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4734 4735 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 4736 bdev_io_submit(bdev_io); 4737 return 0; 4738 } 4739 4740 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 4741 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 4742 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 4743 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 4744 bdev_write_zero_buffer_next(bdev_io); 4745 4746 return 0; 4747 } 4748 4749 int 4750 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4751 uint64_t offset, uint64_t nbytes, 4752 spdk_bdev_io_completion_cb cb, void *cb_arg) 4753 { 4754 uint64_t offset_blocks, num_blocks; 4755 4756 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4757 nbytes, &num_blocks) != 0) { 4758 return -EINVAL; 4759 } 4760 4761 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4762 } 4763 4764 int 4765 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4766 uint64_t offset_blocks, uint64_t num_blocks, 4767 spdk_bdev_io_completion_cb cb, void *cb_arg) 4768 { 4769 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4770 struct spdk_bdev_io *bdev_io; 4771 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4772 4773 if (!desc->write) { 4774 return -EBADF; 4775 } 4776 4777 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4778 return -EINVAL; 4779 } 4780 4781 if (num_blocks == 0) { 4782 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 4783 return -EINVAL; 4784 } 4785 4786 bdev_io = bdev_channel_get_io(channel); 4787 if (!bdev_io) { 4788 return -ENOMEM; 4789 } 4790 4791 bdev_io->internal.ch = channel; 4792 
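/* Usage sketch (illustrative only, shown as a comment): the caller side of the
 * data-less commands implemented around here. spdk_bdev_write_zeroes_blocks()
 * transparently falls back to regular writes of an internal zero buffer when
 * the module lacks native WRITE ZEROES support, and spdk_bdev_unmap_blocks()
 * rejects a zero-length unmap with -EINVAL. The callback name example_done
 * and ctx are hypothetical.
 *
 *   rc = spdk_bdev_write_zeroes_blocks(desc, io_ch, offset_blocks, num_blocks,
 *                                      example_done, ctx);
 *   if (rc == -ENOTSUP) {
 *           // neither WRITE ZEROES nor plain WRITE is supported by this bdev
 *   }
 *
 *   rc = spdk_bdev_unmap_blocks(desc, io_ch, offset_blocks, num_blocks,
 *                               example_done, ctx);
 */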
bdev_io->internal.desc = desc; 4793 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 4794 4795 bdev_io->u.bdev.iovs = &bdev_io->iov; 4796 bdev_io->u.bdev.iovs[0].iov_base = NULL; 4797 bdev_io->u.bdev.iovs[0].iov_len = 0; 4798 bdev_io->u.bdev.iovcnt = 1; 4799 4800 bdev_io->u.bdev.offset_blocks = offset_blocks; 4801 bdev_io->u.bdev.num_blocks = num_blocks; 4802 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4803 4804 bdev_io_submit(bdev_io); 4805 return 0; 4806 } 4807 4808 int 4809 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4810 uint64_t offset, uint64_t length, 4811 spdk_bdev_io_completion_cb cb, void *cb_arg) 4812 { 4813 uint64_t offset_blocks, num_blocks; 4814 4815 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4816 length, &num_blocks) != 0) { 4817 return -EINVAL; 4818 } 4819 4820 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4821 } 4822 4823 int 4824 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4825 uint64_t offset_blocks, uint64_t num_blocks, 4826 spdk_bdev_io_completion_cb cb, void *cb_arg) 4827 { 4828 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4829 struct spdk_bdev_io *bdev_io; 4830 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4831 4832 if (!desc->write) { 4833 return -EBADF; 4834 } 4835 4836 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4837 return -EINVAL; 4838 } 4839 4840 bdev_io = bdev_channel_get_io(channel); 4841 if (!bdev_io) { 4842 return -ENOMEM; 4843 } 4844 4845 bdev_io->internal.ch = channel; 4846 bdev_io->internal.desc = desc; 4847 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 4848 bdev_io->u.bdev.iovs = NULL; 4849 bdev_io->u.bdev.iovcnt = 0; 4850 bdev_io->u.bdev.offset_blocks = offset_blocks; 4851 bdev_io->u.bdev.num_blocks = num_blocks; 4852 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4853 4854 bdev_io_submit(bdev_io); 4855 return 0; 4856 } 4857 4858 static void 4859 bdev_reset_dev(struct spdk_io_channel_iter *i, int status) 4860 { 4861 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 4862 struct spdk_bdev_io *bdev_io; 4863 4864 bdev_io = TAILQ_FIRST(&ch->queued_resets); 4865 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 4866 bdev_io_submit_reset(bdev_io); 4867 } 4868 4869 static void 4870 bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 4871 { 4872 struct spdk_io_channel *ch; 4873 struct spdk_bdev_channel *channel; 4874 struct spdk_bdev_mgmt_channel *mgmt_channel; 4875 struct spdk_bdev_shared_resource *shared_resource; 4876 bdev_io_tailq_t tmp_queued; 4877 4878 TAILQ_INIT(&tmp_queued); 4879 4880 ch = spdk_io_channel_iter_get_channel(i); 4881 channel = spdk_io_channel_get_ctx(ch); 4882 shared_resource = channel->shared_resource; 4883 mgmt_channel = shared_resource->mgmt_ch; 4884 4885 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 4886 4887 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 4888 /* The QoS object is always valid and readable while 4889 * the channel flag is set, so the lock here should not 4890 * be necessary. We're not in the fast path though, so 4891 * just take it anyway. 
*/ 4892 pthread_mutex_lock(&channel->bdev->internal.mutex); 4893 if (channel->bdev->internal.qos->ch == channel) { 4894 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 4895 } 4896 pthread_mutex_unlock(&channel->bdev->internal.mutex); 4897 } 4898 4899 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 4900 bdev_abort_all_buf_io(&mgmt_channel->need_buf_small, channel); 4901 bdev_abort_all_buf_io(&mgmt_channel->need_buf_large, channel); 4902 bdev_abort_all_queued_io(&tmp_queued, channel); 4903 4904 spdk_for_each_channel_continue(i, 0); 4905 } 4906 4907 static void 4908 bdev_start_reset(void *ctx) 4909 { 4910 struct spdk_bdev_channel *ch = ctx; 4911 4912 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel, 4913 ch, bdev_reset_dev); 4914 } 4915 4916 static void 4917 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 4918 { 4919 struct spdk_bdev *bdev = ch->bdev; 4920 4921 assert(!TAILQ_EMPTY(&ch->queued_resets)); 4922 4923 pthread_mutex_lock(&bdev->internal.mutex); 4924 if (bdev->internal.reset_in_progress == NULL) { 4925 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 4926 /* 4927 * Take a channel reference for the target bdev for the life of this 4928 * reset. This guards against the channel getting destroyed while 4929 * spdk_for_each_channel() calls related to this reset IO are in 4930 * progress. We will release the reference when this reset is 4931 * completed. 4932 */ 4933 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4934 bdev_start_reset(ch); 4935 } 4936 pthread_mutex_unlock(&bdev->internal.mutex); 4937 } 4938 4939 int 4940 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4941 spdk_bdev_io_completion_cb cb, void *cb_arg) 4942 { 4943 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4944 struct spdk_bdev_io *bdev_io; 4945 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4946 4947 bdev_io = bdev_channel_get_io(channel); 4948 if (!bdev_io) { 4949 return -ENOMEM; 4950 } 4951 4952 bdev_io->internal.ch = channel; 4953 bdev_io->internal.desc = desc; 4954 bdev_io->internal.submit_tsc = spdk_get_ticks(); 4955 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 4956 bdev_io->u.reset.ch_ref = NULL; 4957 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4958 4959 pthread_mutex_lock(&bdev->internal.mutex); 4960 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 4961 pthread_mutex_unlock(&bdev->internal.mutex); 4962 4963 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 4964 internal.ch_link); 4965 4966 bdev_channel_start_reset(channel); 4967 4968 return 0; 4969 } 4970 4971 void 4972 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 4973 struct spdk_bdev_io_stat *stat) 4974 { 4975 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4976 4977 *stat = channel->stat; 4978 } 4979 4980 static void 4981 bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 4982 { 4983 void *io_device = spdk_io_channel_iter_get_io_device(i); 4984 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 4985 4986 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 4987 bdev_iostat_ctx->cb_arg, 0); 4988 free(bdev_iostat_ctx); 4989 } 4990 4991 static void 4992 bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 4993 { 4994 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 4995 struct 
spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 4996 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4997 4998 bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 4999 spdk_for_each_channel_continue(i, 0); 5000 } 5001 5002 void 5003 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5004 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5005 { 5006 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5007 5008 assert(bdev != NULL); 5009 assert(stat != NULL); 5010 assert(cb != NULL); 5011 5012 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5013 if (bdev_iostat_ctx == NULL) { 5014 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5015 cb(bdev, stat, cb_arg, -ENOMEM); 5016 return; 5017 } 5018 5019 bdev_iostat_ctx->stat = stat; 5020 bdev_iostat_ctx->cb = cb; 5021 bdev_iostat_ctx->cb_arg = cb_arg; 5022 5023 /* Start with the statistics from previously deleted channels. */ 5024 pthread_mutex_lock(&bdev->internal.mutex); 5025 bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 5026 pthread_mutex_unlock(&bdev->internal.mutex); 5027 5028 /* Then iterate and add the statistics from each existing channel. */ 5029 spdk_for_each_channel(__bdev_to_io_dev(bdev), 5030 bdev_get_each_channel_stat, 5031 bdev_iostat_ctx, 5032 bdev_get_device_stat_done); 5033 } 5034 5035 int 5036 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5037 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5038 spdk_bdev_io_completion_cb cb, void *cb_arg) 5039 { 5040 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5041 struct spdk_bdev_io *bdev_io; 5042 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5043 5044 if (!desc->write) { 5045 return -EBADF; 5046 } 5047 5048 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 5049 return -ENOTSUP; 5050 } 5051 5052 bdev_io = bdev_channel_get_io(channel); 5053 if (!bdev_io) { 5054 return -ENOMEM; 5055 } 5056 5057 bdev_io->internal.ch = channel; 5058 bdev_io->internal.desc = desc; 5059 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 5060 bdev_io->u.nvme_passthru.cmd = *cmd; 5061 bdev_io->u.nvme_passthru.buf = buf; 5062 bdev_io->u.nvme_passthru.nbytes = nbytes; 5063 bdev_io->u.nvme_passthru.md_buf = NULL; 5064 bdev_io->u.nvme_passthru.md_len = 0; 5065 5066 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5067 5068 bdev_io_submit(bdev_io); 5069 return 0; 5070 } 5071 5072 int 5073 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5074 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5075 spdk_bdev_io_completion_cb cb, void *cb_arg) 5076 { 5077 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5078 struct spdk_bdev_io *bdev_io; 5079 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5080 5081 if (!desc->write) { 5082 /* 5083 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5084 * to easily determine if the command is a read or write, but for now just 5085 * do not allow io_passthru with a read-only descriptor. 
5086 */ 5087 return -EBADF; 5088 } 5089 5090 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 5091 return -ENOTSUP; 5092 } 5093 5094 bdev_io = bdev_channel_get_io(channel); 5095 if (!bdev_io) { 5096 return -ENOMEM; 5097 } 5098 5099 bdev_io->internal.ch = channel; 5100 bdev_io->internal.desc = desc; 5101 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 5102 bdev_io->u.nvme_passthru.cmd = *cmd; 5103 bdev_io->u.nvme_passthru.buf = buf; 5104 bdev_io->u.nvme_passthru.nbytes = nbytes; 5105 bdev_io->u.nvme_passthru.md_buf = NULL; 5106 bdev_io->u.nvme_passthru.md_len = 0; 5107 5108 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5109 5110 bdev_io_submit(bdev_io); 5111 return 0; 5112 } 5113 5114 int 5115 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5116 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 5117 spdk_bdev_io_completion_cb cb, void *cb_arg) 5118 { 5119 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5120 struct spdk_bdev_io *bdev_io; 5121 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5122 5123 if (!desc->write) { 5124 /* 5125 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5126 * to easily determine if the command is a read or write, but for now just 5127 * do not allow io_passthru with a read-only descriptor. 5128 */ 5129 return -EBADF; 5130 } 5131 5132 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 5133 return -ENOTSUP; 5134 } 5135 5136 bdev_io = bdev_channel_get_io(channel); 5137 if (!bdev_io) { 5138 return -ENOMEM; 5139 } 5140 5141 bdev_io->internal.ch = channel; 5142 bdev_io->internal.desc = desc; 5143 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 5144 bdev_io->u.nvme_passthru.cmd = *cmd; 5145 bdev_io->u.nvme_passthru.buf = buf; 5146 bdev_io->u.nvme_passthru.nbytes = nbytes; 5147 bdev_io->u.nvme_passthru.md_buf = md_buf; 5148 bdev_io->u.nvme_passthru.md_len = md_len; 5149 5150 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5151 5152 bdev_io_submit(bdev_io); 5153 return 0; 5154 } 5155 5156 static void bdev_abort_retry(void *ctx); 5157 static void bdev_abort(struct spdk_bdev_io *parent_io); 5158 5159 static void 5160 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5161 { 5162 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 5163 struct spdk_bdev_io *parent_io = cb_arg; 5164 struct spdk_bdev_io *bio_to_abort, *tmp_io; 5165 5166 bio_to_abort = bdev_io->u.abort.bio_to_abort; 5167 5168 spdk_bdev_free_io(bdev_io); 5169 5170 if (!success) { 5171 /* Check if the target I/O completed in the meantime. */ 5172 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 5173 if (tmp_io == bio_to_abort) { 5174 break; 5175 } 5176 } 5177 5178 /* If the target I/O still exists, set the parent to failed. 
*/ 5179 if (tmp_io != NULL) { 5180 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5181 } 5182 } 5183 5184 parent_io->u.bdev.split_outstanding--; 5185 if (parent_io->u.bdev.split_outstanding == 0) { 5186 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5187 bdev_abort_retry(parent_io); 5188 } else { 5189 bdev_io_complete(parent_io); 5190 } 5191 } 5192 } 5193 5194 static int 5195 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 5196 struct spdk_bdev_io *bio_to_abort, 5197 spdk_bdev_io_completion_cb cb, void *cb_arg) 5198 { 5199 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5200 struct spdk_bdev_io *bdev_io; 5201 5202 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 5203 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 5204 /* TODO: Abort reset or abort request. */ 5205 return -ENOTSUP; 5206 } 5207 5208 bdev_io = bdev_channel_get_io(channel); 5209 if (bdev_io == NULL) { 5210 return -ENOMEM; 5211 } 5212 5213 bdev_io->internal.ch = channel; 5214 bdev_io->internal.desc = desc; 5215 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 5216 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5217 5218 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 5219 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 5220 5221 /* Parent abort request is not submitted directly, but to manage its 5222 * execution, add it to the submitted list here. 5223 */ 5224 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5225 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5226 5227 bdev_abort(bdev_io); 5228 5229 return 0; 5230 } 5231 5232 bdev_io->u.abort.bio_to_abort = bio_to_abort; 5233 5234 /* Submit the abort request to the underlying bdev module. */ 5235 bdev_io_submit(bdev_io); 5236 5237 return 0; 5238 } 5239 5240 static uint32_t 5241 _bdev_abort(struct spdk_bdev_io *parent_io) 5242 { 5243 struct spdk_bdev_desc *desc = parent_io->internal.desc; 5244 struct spdk_bdev_channel *channel = parent_io->internal.ch; 5245 void *bio_cb_arg; 5246 struct spdk_bdev_io *bio_to_abort; 5247 uint32_t matched_ios; 5248 int rc; 5249 5250 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 5251 5252 /* matched_ios is returned and will be kept by the caller. 5253 * 5254 * This function is used for two cases: 1) the same cb_arg is used for 5255 * multiple I/Os, and 2) a single large I/O is split into smaller ones. 5256 * Incrementing split_outstanding directly here may confuse readers, especially 5257 * in the 1st case. 5258 * 5259 * Completion of I/O abort is processed after stack unwinding. Hence this trick 5260 * works as expected. 5261 */ 5262 matched_ios = 0; 5263 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5264 5265 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 5266 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 5267 continue; 5268 } 5269 5270 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 5271 /* Any I/O which was submitted after this abort command should be excluded. 
*/ 5272 continue; 5273 } 5274 5275 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 5276 if (rc != 0) { 5277 if (rc == -ENOMEM) { 5278 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 5279 } else { 5280 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5281 } 5282 break; 5283 } 5284 matched_ios++; 5285 } 5286 5287 return matched_ios; 5288 } 5289 5290 static void 5291 bdev_abort_retry(void *ctx) 5292 { 5293 struct spdk_bdev_io *parent_io = ctx; 5294 uint32_t matched_ios; 5295 5296 matched_ios = _bdev_abort(parent_io); 5297 5298 if (matched_ios == 0) { 5299 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5300 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 5301 } else { 5302 /* For a retry, the case where no target I/O was found is a success 5303 * because it means the target I/Os completed in the meantime. 5304 */ 5305 bdev_io_complete(parent_io); 5306 } 5307 return; 5308 } 5309 5310 /* Use split_outstanding to manage the progress of aborting I/Os. */ 5311 parent_io->u.bdev.split_outstanding = matched_ios; 5312 } 5313 5314 static void 5315 bdev_abort(struct spdk_bdev_io *parent_io) 5316 { 5317 uint32_t matched_ios; 5318 5319 matched_ios = _bdev_abort(parent_io); 5320 5321 if (matched_ios == 0) { 5322 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5323 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 5324 } else { 5325 /* The case where no target I/O was found is a failure. */ 5326 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5327 bdev_io_complete(parent_io); 5328 } 5329 return; 5330 } 5331 5332 /* Use split_outstanding to manage the progress of aborting I/Os. */ 5333 parent_io->u.bdev.split_outstanding = matched_ios; 5334 } 5335 5336 int 5337 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5338 void *bio_cb_arg, 5339 spdk_bdev_io_completion_cb cb, void *cb_arg) 5340 { 5341 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5342 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5343 struct spdk_bdev_io *bdev_io; 5344 5345 if (bio_cb_arg == NULL) { 5346 return -EINVAL; 5347 } 5348 5349 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 5350 return -ENOTSUP; 5351 } 5352 5353 bdev_io = bdev_channel_get_io(channel); 5354 if (bdev_io == NULL) { 5355 return -ENOMEM; 5356 } 5357 5358 bdev_io->internal.ch = channel; 5359 bdev_io->internal.desc = desc; 5360 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5361 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 5362 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5363 5364 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 5365 5366 /* Parent abort request is not submitted directly, but to manage its execution, 5367 * add it to the submitted list here. 
5368 */ 5369 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5370 5371 bdev_abort(bdev_io); 5372 5373 return 0; 5374 } 5375 5376 int 5377 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5378 struct spdk_bdev_io_wait_entry *entry) 5379 { 5380 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5381 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 5382 5383 if (bdev != entry->bdev) { 5384 SPDK_ERRLOG("bdevs do not match\n"); 5385 return -EINVAL; 5386 } 5387 5388 if (mgmt_ch->per_thread_cache_count > 0) { 5389 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 5390 return -EINVAL; 5391 } 5392 5393 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 5394 return 0; 5395 } 5396 5397 static inline void 5398 bdev_io_complete(void *ctx) 5399 { 5400 struct spdk_bdev_io *bdev_io = ctx; 5401 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5402 uint64_t tsc, tsc_diff; 5403 5404 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 5405 /* 5406 * Send the completion to the thread that originally submitted the I/O, 5407 * which may not be the current thread in the case of QoS. 5408 */ 5409 if (bdev_io->internal.io_submit_ch) { 5410 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 5411 bdev_io->internal.io_submit_ch = NULL; 5412 } 5413 5414 /* 5415 * Defer completion to avoid potential infinite recursion if the 5416 * user's completion callback issues a new I/O. 5417 */ 5418 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 5419 bdev_io_complete, bdev_io); 5420 return; 5421 } 5422 5423 tsc = spdk_get_ticks(); 5424 tsc_diff = tsc - bdev_io->internal.submit_tsc; 5425 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 5426 bdev_io->internal.caller_ctx); 5427 5428 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 5429 5430 if (bdev_io->internal.ch->histogram) { 5431 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 5432 } 5433 5434 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5435 switch (bdev_io->type) { 5436 case SPDK_BDEV_IO_TYPE_READ: 5437 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5438 bdev_io->internal.ch->stat.num_read_ops++; 5439 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5440 break; 5441 case SPDK_BDEV_IO_TYPE_WRITE: 5442 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5443 bdev_io->internal.ch->stat.num_write_ops++; 5444 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5445 break; 5446 case SPDK_BDEV_IO_TYPE_UNMAP: 5447 bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5448 bdev_io->internal.ch->stat.num_unmap_ops++; 5449 bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; 5450 break; 5451 case SPDK_BDEV_IO_TYPE_ZCOPY: 5452 /* Track the data in the start phase only */ 5453 if (bdev_io->u.bdev.zcopy.start) { 5454 if (bdev_io->u.bdev.zcopy.populate) { 5455 bdev_io->internal.ch->stat.bytes_read += 5456 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5457 bdev_io->internal.ch->stat.num_read_ops++; 5458 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5459 } else { 5460 bdev_io->internal.ch->stat.bytes_written += 5461 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5462 bdev_io->internal.ch->stat.num_write_ops++; 5463 
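/* Usage sketch (illustrative only, shown as a comment): the intended pairing
 * for spdk_bdev_queue_io_wait() above. When a submission returns -ENOMEM,
 * register a wait entry and resubmit from its callback once an spdk_bdev_io
 * frees up. The bdev, cb_fn and cb_arg members are assumed to be the public
 * fields of struct spdk_bdev_io_wait_entry from include/spdk/bdev.h (only
 * entry->bdev is visible in this file); retry_read(), submit_read() and
 * struct example_ctx are hypothetical.
 *
 *   static void
 *   retry_read(void *arg)
 *   {
 *           struct example_ctx *ctx = arg;
 *
 *           submit_read(ctx);
 *   }
 *
 *   rc = spdk_bdev_read_blocks(desc, io_ch, buf, offset_blocks, num_blocks,
 *                              read_done, ctx);
 *   if (rc == -ENOMEM) {
 *           ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(desc);
 *           ctx->wait_entry.cb_fn = retry_read;
 *           ctx->wait_entry.cb_arg = ctx;
 *           spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, io_ch, &ctx->wait_entry);
 *   }
 */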
bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5464 } 5465 } 5466 break; 5467 default: 5468 break; 5469 } 5470 } 5471 5472 #ifdef SPDK_CONFIG_VTUNE 5473 uint64_t now_tsc = spdk_get_ticks(); 5474 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 5475 uint64_t data[5]; 5476 5477 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 5478 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 5479 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 5480 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 5481 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 5482 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 5483 5484 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 5485 __itt_metadata_u64, 5, data); 5486 5487 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 5488 bdev_io->internal.ch->start_tsc = now_tsc; 5489 } 5490 #endif 5491 5492 assert(bdev_io->internal.cb != NULL); 5493 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 5494 5495 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 5496 bdev_io->internal.caller_ctx); 5497 } 5498 5499 static void 5500 bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 5501 { 5502 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 5503 5504 if (bdev_io->u.reset.ch_ref != NULL) { 5505 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 5506 bdev_io->u.reset.ch_ref = NULL; 5507 } 5508 5509 bdev_io_complete(bdev_io); 5510 } 5511 5512 static void 5513 bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 5514 { 5515 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 5516 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5517 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 5518 struct spdk_bdev_io *queued_reset; 5519 5520 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 5521 while (!TAILQ_EMPTY(&ch->queued_resets)) { 5522 queued_reset = TAILQ_FIRST(&ch->queued_resets); 5523 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 5524 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 5525 } 5526 5527 spdk_for_each_channel_continue(i, 0); 5528 } 5529 5530 void 5531 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 5532 { 5533 struct spdk_bdev *bdev = bdev_io->bdev; 5534 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5535 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 5536 5537 bdev_io->internal.status = status; 5538 5539 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 5540 bool unlock_channels = false; 5541 5542 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 5543 SPDK_ERRLOG("NOMEM returned for reset\n"); 5544 } 5545 pthread_mutex_lock(&bdev->internal.mutex); 5546 if (bdev_io == bdev->internal.reset_in_progress) { 5547 bdev->internal.reset_in_progress = NULL; 5548 unlock_channels = true; 5549 } 5550 pthread_mutex_unlock(&bdev->internal.mutex); 5551 5552 if (unlock_channels) { 5553 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel, 5554 bdev_io, bdev_reset_complete); 5555 return; 5556 } 5557 } else { 5558 _bdev_io_unset_bounce_buf(bdev_io); 5559 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 5560 if 
(spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 5561 return; 5562 } 5563 } 5564 5565 bdev_io_complete(bdev_io); 5566 } 5567 5568 void 5569 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 5570 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 5571 { 5572 if (sc == SPDK_SCSI_STATUS_GOOD) { 5573 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5574 } else { 5575 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 5576 bdev_io->internal.error.scsi.sc = sc; 5577 bdev_io->internal.error.scsi.sk = sk; 5578 bdev_io->internal.error.scsi.asc = asc; 5579 bdev_io->internal.error.scsi.ascq = ascq; 5580 } 5581 5582 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5583 } 5584 5585 void 5586 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 5587 int *sc, int *sk, int *asc, int *ascq) 5588 { 5589 assert(sc != NULL); 5590 assert(sk != NULL); 5591 assert(asc != NULL); 5592 assert(ascq != NULL); 5593 5594 switch (bdev_io->internal.status) { 5595 case SPDK_BDEV_IO_STATUS_SUCCESS: 5596 *sc = SPDK_SCSI_STATUS_GOOD; 5597 *sk = SPDK_SCSI_SENSE_NO_SENSE; 5598 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5599 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5600 break; 5601 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 5602 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 5603 break; 5604 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 5605 *sc = bdev_io->internal.error.scsi.sc; 5606 *sk = bdev_io->internal.error.scsi.sk; 5607 *asc = bdev_io->internal.error.scsi.asc; 5608 *ascq = bdev_io->internal.error.scsi.ascq; 5609 break; 5610 default: 5611 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 5612 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 5613 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5614 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5615 break; 5616 } 5617 } 5618 5619 void 5620 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 5621 { 5622 if (aio_result == 0) { 5623 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5624 } else { 5625 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 5626 } 5627 5628 bdev_io->internal.error.aio_result = aio_result; 5629 5630 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5631 } 5632 5633 void 5634 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 5635 { 5636 assert(aio_result != NULL); 5637 5638 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 5639 *aio_result = bdev_io->internal.error.aio_result; 5640 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5641 *aio_result = 0; 5642 } else { 5643 *aio_result = -EIO; 5644 } 5645 } 5646 5647 void 5648 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 5649 { 5650 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 5651 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5652 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 5653 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 5654 } else { 5655 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 5656 } 5657 5658 bdev_io->internal.error.nvme.cdw0 = cdw0; 5659 bdev_io->internal.error.nvme.sct = sct; 5660 bdev_io->internal.error.nvme.sc = sc; 5661 5662 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5663 } 5664 5665 void 5666 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 5667 { 5668 assert(sct != NULL); 5669 assert(sc != NULL); 5670 assert(cdw0 != 
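/* Usage sketch (illustrative only, shown as a comment): completion callbacks
 * that need more detail than the boolean success flag can pull the translated
 * NVMe status out of the spdk_bdev_io via this function. The callback name
 * example_done is hypothetical.
 *
 *   static void
 *   example_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *   {
 *           uint32_t cdw0;
 *           int sct, sc;
 *
 *           if (!success) {
 *                   spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc);
 *                   SPDK_ERRLOG("I/O failed: sct=%d sc=%d\n", sct, sc);
 *           }
 *           spdk_bdev_free_io(bdev_io);
 *   }
 */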
NULL); 5671 5672 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 5673 *sct = SPDK_NVME_SCT_GENERIC; 5674 *sc = SPDK_NVME_SC_SUCCESS; 5675 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5676 *cdw0 = 0; 5677 } else { 5678 *cdw0 = 1U; 5679 } 5680 return; 5681 } 5682 5683 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 5684 *sct = bdev_io->internal.error.nvme.sct; 5685 *sc = bdev_io->internal.error.nvme.sc; 5686 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5687 *sct = SPDK_NVME_SCT_GENERIC; 5688 *sc = SPDK_NVME_SC_SUCCESS; 5689 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 5690 *sct = SPDK_NVME_SCT_GENERIC; 5691 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 5692 } else { 5693 *sct = SPDK_NVME_SCT_GENERIC; 5694 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5695 } 5696 5697 *cdw0 = bdev_io->internal.error.nvme.cdw0; 5698 } 5699 5700 void 5701 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 5702 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 5703 { 5704 assert(first_sct != NULL); 5705 assert(first_sc != NULL); 5706 assert(second_sct != NULL); 5707 assert(second_sc != NULL); 5708 assert(cdw0 != NULL); 5709 5710 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 5711 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 5712 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 5713 *first_sct = bdev_io->internal.error.nvme.sct; 5714 *first_sc = bdev_io->internal.error.nvme.sc; 5715 *second_sct = SPDK_NVME_SCT_GENERIC; 5716 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5717 } else { 5718 *first_sct = SPDK_NVME_SCT_GENERIC; 5719 *first_sc = SPDK_NVME_SC_SUCCESS; 5720 *second_sct = bdev_io->internal.error.nvme.sct; 5721 *second_sc = bdev_io->internal.error.nvme.sc; 5722 } 5723 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5724 *first_sct = SPDK_NVME_SCT_GENERIC; 5725 *first_sc = SPDK_NVME_SC_SUCCESS; 5726 *second_sct = SPDK_NVME_SCT_GENERIC; 5727 *second_sc = SPDK_NVME_SC_SUCCESS; 5728 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 5729 *first_sct = SPDK_NVME_SCT_GENERIC; 5730 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5731 *second_sct = SPDK_NVME_SCT_GENERIC; 5732 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5733 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 5734 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 5735 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 5736 *second_sct = SPDK_NVME_SCT_GENERIC; 5737 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5738 } else { 5739 *first_sct = SPDK_NVME_SCT_GENERIC; 5740 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5741 *second_sct = SPDK_NVME_SCT_GENERIC; 5742 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5743 } 5744 5745 *cdw0 = bdev_io->internal.error.nvme.cdw0; 5746 } 5747 5748 struct spdk_thread * 5749 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 5750 { 5751 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 5752 } 5753 5754 struct spdk_io_channel * 5755 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 5756 { 5757 return bdev_io->internal.ch->channel; 5758 } 5759 5760 static int 5761 bdev_register(struct spdk_bdev *bdev) 5762 { 5763 char *bdev_name; 5764 char uuid[SPDK_UUID_STRING_LEN]; 5765 int ret; 5766 5767 assert(bdev->module != NULL); 5768 5769 if (!bdev->name) { 5770 SPDK_ERRLOG("Bdev name is NULL\n"); 5771 return -EINVAL; 5772 } 5773 5774 if 
(!strlen(bdev->name)) { 5775 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 5776 return -EINVAL; 5777 } 5778 5779 /* Users often register their own I/O devices using the bdev name. In 5780 * order to avoid conflicts, prepend bdev_. */ 5781 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 5782 if (!bdev_name) { 5783 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 5784 return -ENOMEM; 5785 } 5786 5787 bdev->internal.status = SPDK_BDEV_STATUS_READY; 5788 bdev->internal.measured_queue_depth = UINT64_MAX; 5789 bdev->internal.claim_module = NULL; 5790 bdev->internal.qd_poller = NULL; 5791 bdev->internal.qos = NULL; 5792 5793 TAILQ_INIT(&bdev->internal.open_descs); 5794 TAILQ_INIT(&bdev->internal.locked_ranges); 5795 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 5796 TAILQ_INIT(&bdev->aliases); 5797 5798 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 5799 if (ret != 0) { 5800 free(bdev_name); 5801 return ret; 5802 } 5803 5804 /* If the user didn't specify a uuid, generate one. */ 5805 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 5806 spdk_uuid_generate(&bdev->uuid); 5807 } 5808 5809 /* Add the UUID alias only if it's different than the name */ 5810 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 5811 if (strcmp(bdev->name, uuid) != 0) { 5812 ret = spdk_bdev_alias_add(bdev, uuid); 5813 if (ret != 0) { 5814 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 5815 bdev_name_del(&bdev->internal.bdev_name); 5816 free(bdev_name); 5817 return ret; 5818 } 5819 } 5820 5821 if (spdk_bdev_get_buf_align(bdev) > 1) { 5822 if (bdev->split_on_optimal_io_boundary) { 5823 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 5824 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 5825 } else { 5826 bdev->split_on_optimal_io_boundary = true; 5827 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 5828 } 5829 } 5830 5831 /* If the user didn't specify a write unit size, set it to one. 
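 * (The write unit size is expressed in blocks and, as far as I understand the
 * bdev API, write requests are expected to cover a multiple of that many
 * blocks, so a value of 1 places no extra grouping requirement on writes.)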
*/ 5832 if (bdev->write_unit_size == 0) { 5833 bdev->write_unit_size = 1; 5834 } 5835 5836 /* Set ACWU value to 1 if bdev module did not set it (does not support it natively) */ 5837 if (bdev->acwu == 0) { 5838 bdev->acwu = 1; 5839 } 5840 5841 if (bdev->phys_blocklen == 0) { 5842 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 5843 } 5844 5845 bdev->internal.reset_in_progress = NULL; 5846 5847 spdk_io_device_register(__bdev_to_io_dev(bdev), 5848 bdev_channel_create, bdev_channel_destroy, 5849 sizeof(struct spdk_bdev_channel), 5850 bdev_name); 5851 5852 free(bdev_name); 5853 5854 pthread_mutex_init(&bdev->internal.mutex, NULL); 5855 5856 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 5857 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 5858 5859 return 0; 5860 } 5861 5862 static void 5863 bdev_destroy_cb(void *io_device) 5864 { 5865 int rc; 5866 struct spdk_bdev *bdev; 5867 spdk_bdev_unregister_cb cb_fn; 5868 void *cb_arg; 5869 5870 bdev = __bdev_from_io_dev(io_device); 5871 cb_fn = bdev->internal.unregister_cb; 5872 cb_arg = bdev->internal.unregister_ctx; 5873 5874 pthread_mutex_destroy(&bdev->internal.mutex); 5875 free(bdev->internal.qos); 5876 5877 rc = bdev->fn_table->destruct(bdev->ctxt); 5878 if (rc < 0) { 5879 SPDK_ERRLOG("destruct failed\n"); 5880 } 5881 if (rc <= 0 && cb_fn != NULL) { 5882 cb_fn(cb_arg, rc); 5883 } 5884 } 5885 5886 static void 5887 bdev_register_finished(void *arg) 5888 { 5889 struct spdk_bdev *bdev = arg; 5890 5891 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 5892 } 5893 5894 int 5895 spdk_bdev_register(struct spdk_bdev *bdev) 5896 { 5897 int rc = bdev_register(bdev); 5898 5899 if (rc == 0) { 5900 /* Examine configuration before initializing I/O */ 5901 bdev_examine(bdev); 5902 5903 spdk_bdev_wait_for_examine(bdev_register_finished, bdev); 5904 } 5905 5906 return rc; 5907 } 5908 5909 void 5910 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 5911 { 5912 if (bdev->internal.unregister_cb != NULL) { 5913 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 5914 } 5915 } 5916 5917 static void 5918 _remove_notify(void *arg) 5919 { 5920 struct spdk_bdev_desc *desc = arg; 5921 5922 pthread_mutex_lock(&desc->mutex); 5923 desc->refs--; 5924 5925 if (!desc->closed) { 5926 pthread_mutex_unlock(&desc->mutex); 5927 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 5928 return; 5929 } else if (0 == desc->refs) { 5930 /* This descriptor was closed after this remove_notify message was sent. 5931 * spdk_bdev_close() could not free the descriptor since this message was 5932 * in flight, so we free it now using bdev_desc_free(). 5933 */ 5934 pthread_mutex_unlock(&desc->mutex); 5935 bdev_desc_free(desc); 5936 return; 5937 } 5938 pthread_mutex_unlock(&desc->mutex); 5939 } 5940 5941 /* Must be called while holding g_bdev_mgr.mutex and bdev->internal.mutex. 5942 * returns: 0 - bdev removed and ready to be destructed. 5943 * -EBUSY - bdev can't be destructed yet. */ 5944 static int 5945 bdev_unregister_unsafe(struct spdk_bdev *bdev) 5946 { 5947 struct spdk_bdev_desc *desc, *tmp; 5948 int rc = 0; 5949 char uuid[SPDK_UUID_STRING_LEN]; 5950 5951 /* Notify each descriptor about hotremoval */ 5952 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 5953 rc = -EBUSY; 5954 pthread_mutex_lock(&desc->mutex); 5955 /* 5956 * Defer invocation of the event_cb to a separate message that will 5957 * run later on its thread. 
This ensures this context unwinds and 5958 * we don't recursively unregister this bdev again if the event_cb 5959 * immediately closes its descriptor. 5960 */ 5961 desc->refs++; 5962 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 5963 pthread_mutex_unlock(&desc->mutex); 5964 } 5965 5966 /* If there are no descriptors, proceed removing the bdev */ 5967 if (rc == 0) { 5968 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 5969 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 5970 5971 /* Delete the name and the UUID alias */ 5972 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 5973 bdev_name_del_unsafe(&bdev->internal.bdev_name); 5974 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 5975 5976 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 5977 } 5978 5979 return rc; 5980 } 5981 5982 void 5983 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 5984 { 5985 struct spdk_thread *thread; 5986 int rc; 5987 5988 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 5989 5990 thread = spdk_get_thread(); 5991 if (!thread) { 5992 /* The user called this from a non-SPDK thread. */ 5993 if (cb_fn != NULL) { 5994 cb_fn(cb_arg, -ENOTSUP); 5995 } 5996 return; 5997 } 5998 5999 pthread_mutex_lock(&g_bdev_mgr.mutex); 6000 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6001 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6002 if (cb_fn) { 6003 cb_fn(cb_arg, -EBUSY); 6004 } 6005 return; 6006 } 6007 6008 pthread_mutex_lock(&bdev->internal.mutex); 6009 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 6010 bdev->internal.unregister_cb = cb_fn; 6011 bdev->internal.unregister_ctx = cb_arg; 6012 6013 /* Call under lock. */ 6014 rc = bdev_unregister_unsafe(bdev); 6015 pthread_mutex_unlock(&bdev->internal.mutex); 6016 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6017 6018 if (rc == 0) { 6019 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6020 } 6021 } 6022 6023 static int 6024 bdev_start_qos(struct spdk_bdev *bdev) 6025 { 6026 struct set_qos_limit_ctx *ctx; 6027 6028 /* Enable QoS */ 6029 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 6030 ctx = calloc(1, sizeof(*ctx)); 6031 if (ctx == NULL) { 6032 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 6033 return -ENOMEM; 6034 } 6035 ctx->bdev = bdev; 6036 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6037 bdev_enable_qos_msg, ctx, 6038 bdev_enable_qos_done); 6039 } 6040 6041 return 0; 6042 } 6043 6044 static int 6045 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 6046 { 6047 struct spdk_thread *thread; 6048 int rc = 0; 6049 6050 thread = spdk_get_thread(); 6051 if (!thread) { 6052 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 6053 return -ENOTSUP; 6054 } 6055 6056 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6057 spdk_get_thread()); 6058 6059 desc->bdev = bdev; 6060 desc->thread = thread; 6061 desc->write = write; 6062 6063 pthread_mutex_lock(&bdev->internal.mutex); 6064 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6065 pthread_mutex_unlock(&bdev->internal.mutex); 6066 return -ENODEV; 6067 } 6068 6069 if (write && bdev->internal.claim_module) { 6070 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 6071 bdev->name, bdev->internal.claim_module->name); 6072 pthread_mutex_unlock(&bdev->internal.mutex); 6073 return -EPERM; 6074 } 6075 6076 rc = bdev_start_qos(bdev); 6077 if (rc != 0) { 6078 SPDK_ERRLOG("Failed 
to start QoS on bdev %s\n", bdev->name); 6079 pthread_mutex_unlock(&bdev->internal.mutex); 6080 return rc; 6081 } 6082 6083 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 6084 6085 pthread_mutex_unlock(&bdev->internal.mutex); 6086 6087 return 0; 6088 } 6089 6090 int 6091 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 6092 void *event_ctx, struct spdk_bdev_desc **_desc) 6093 { 6094 struct spdk_bdev_desc *desc; 6095 struct spdk_bdev *bdev; 6096 unsigned int event_id; 6097 int rc; 6098 6099 if (event_cb == NULL) { 6100 SPDK_ERRLOG("Missing event callback function\n"); 6101 return -EINVAL; 6102 } 6103 6104 pthread_mutex_lock(&g_bdev_mgr.mutex); 6105 6106 bdev = bdev_get_by_name(bdev_name); 6107 6108 if (bdev == NULL) { 6109 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 6110 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6111 return -ENODEV; 6112 } 6113 6114 desc = calloc(1, sizeof(*desc)); 6115 if (desc == NULL) { 6116 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 6117 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6118 return -ENOMEM; 6119 } 6120 6121 TAILQ_INIT(&desc->pending_media_events); 6122 TAILQ_INIT(&desc->free_media_events); 6123 6124 desc->callback.event_fn = event_cb; 6125 desc->callback.ctx = event_ctx; 6126 pthread_mutex_init(&desc->mutex, NULL); 6127 6128 if (bdev->media_events) { 6129 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 6130 sizeof(*desc->media_events_buffer)); 6131 if (desc->media_events_buffer == NULL) { 6132 SPDK_ERRLOG("Failed to initialize media event pool\n"); 6133 bdev_desc_free(desc); 6134 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6135 return -ENOMEM; 6136 } 6137 6138 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 6139 TAILQ_INSERT_TAIL(&desc->free_media_events, 6140 &desc->media_events_buffer[event_id], tailq); 6141 } 6142 } 6143 6144 rc = bdev_open(bdev, write, desc); 6145 if (rc != 0) { 6146 bdev_desc_free(desc); 6147 desc = NULL; 6148 } 6149 6150 *_desc = desc; 6151 6152 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6153 6154 return rc; 6155 } 6156 6157 void 6158 spdk_bdev_close(struct spdk_bdev_desc *desc) 6159 { 6160 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6161 int rc; 6162 6163 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6164 spdk_get_thread()); 6165 6166 assert(desc->thread == spdk_get_thread()); 6167 6168 spdk_poller_unregister(&desc->io_timeout_poller); 6169 6170 pthread_mutex_lock(&g_bdev_mgr.mutex); 6171 pthread_mutex_lock(&bdev->internal.mutex); 6172 pthread_mutex_lock(&desc->mutex); 6173 6174 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 6175 6176 desc->closed = true; 6177 6178 if (0 == desc->refs) { 6179 pthread_mutex_unlock(&desc->mutex); 6180 bdev_desc_free(desc); 6181 } else { 6182 pthread_mutex_unlock(&desc->mutex); 6183 } 6184 6185 /* If no more descriptors, kill QoS channel */ 6186 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6187 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 6188 bdev->name, spdk_get_thread()); 6189 6190 if (bdev_qos_destroy(bdev)) { 6191 /* There isn't anything we can do to recover here. Just let the 6192 * old QoS poller keep running. The QoS handling won't change 6193 * cores when the user allocates a new channel, but it won't break. */ 6194 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 6195 } 6196 } 6197 6198 spdk_bdev_set_qd_sampling_period(bdev, 0); 6199 6200 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6201 rc = bdev_unregister_unsafe(bdev); 6202 pthread_mutex_unlock(&bdev->internal.mutex); 6203 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6204 6205 if (rc == 0) { 6206 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6207 } 6208 } else { 6209 pthread_mutex_unlock(&bdev->internal.mutex); 6210 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6211 } 6212 } 6213 6214 int 6215 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 6216 struct spdk_bdev_module *module) 6217 { 6218 if (bdev->internal.claim_module != NULL) { 6219 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 6220 bdev->internal.claim_module->name); 6221 return -EPERM; 6222 } 6223 6224 if (desc && !desc->write) { 6225 desc->write = true; 6226 } 6227 6228 bdev->internal.claim_module = module; 6229 return 0; 6230 } 6231 6232 void 6233 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 6234 { 6235 assert(bdev->internal.claim_module != NULL); 6236 bdev->internal.claim_module = NULL; 6237 } 6238 6239 struct spdk_bdev * 6240 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 6241 { 6242 assert(desc != NULL); 6243 return desc->bdev; 6244 } 6245 6246 void 6247 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 6248 { 6249 struct iovec *iovs; 6250 int iovcnt; 6251 6252 if (bdev_io == NULL) { 6253 return; 6254 } 6255 6256 switch (bdev_io->type) { 6257 case SPDK_BDEV_IO_TYPE_READ: 6258 case SPDK_BDEV_IO_TYPE_WRITE: 6259 case SPDK_BDEV_IO_TYPE_ZCOPY: 6260 iovs = bdev_io->u.bdev.iovs; 6261 iovcnt = bdev_io->u.bdev.iovcnt; 6262 break; 6263 default: 6264 iovs = NULL; 6265 iovcnt = 0; 6266 break; 6267 } 6268 6269 if (iovp) { 6270 *iovp = iovs; 6271 } 6272 if (iovcntp) { 6273 *iovcntp = iovcnt; 6274 } 6275 } 6276 6277 void * 6278 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 6279 { 6280 if (bdev_io == NULL) { 6281 return NULL; 6282 } 6283 6284 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 6285 return NULL; 6286 } 6287 6288 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 6289 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 6290 return bdev_io->u.bdev.md_buf; 6291 } 6292 6293 return NULL; 6294 } 6295 6296 void * 6297 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 6298 { 6299 if (bdev_io == NULL) { 6300 assert(false); 6301 return NULL; 6302 } 6303 6304 return bdev_io->internal.caller_ctx; 6305 } 6306 6307 void 6308 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 6309 { 6310 6311 if (spdk_bdev_module_list_find(bdev_module->name)) { 6312 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 6313 assert(false); 6314 } 6315 6316 /* 6317 * Modules with examine callbacks must be initialized first, so they are 6318 * ready to handle examine callbacks from later modules that will 6319 * register physical bdevs. 
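 * That ordering is enforced right below: modules that provide an examine
 * callback are inserted at the head of the module list, while all other
 * modules are appended at the tail.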
6320 */ 6321 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 6322 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 6323 } else { 6324 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 6325 } 6326 } 6327 6328 struct spdk_bdev_module * 6329 spdk_bdev_module_list_find(const char *name) 6330 { 6331 struct spdk_bdev_module *bdev_module; 6332 6333 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 6334 if (strcmp(name, bdev_module->name) == 0) { 6335 break; 6336 } 6337 } 6338 6339 return bdev_module; 6340 } 6341 6342 static void 6343 bdev_write_zero_buffer_next(void *_bdev_io) 6344 { 6345 struct spdk_bdev_io *bdev_io = _bdev_io; 6346 uint64_t num_bytes, num_blocks; 6347 void *md_buf = NULL; 6348 int rc; 6349 6350 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 6351 bdev_io->u.bdev.split_remaining_num_blocks, 6352 ZERO_BUFFER_SIZE); 6353 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 6354 6355 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 6356 md_buf = (char *)g_bdev_mgr.zero_buffer + 6357 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 6358 } 6359 6360 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 6361 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6362 g_bdev_mgr.zero_buffer, md_buf, 6363 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 6364 bdev_write_zero_buffer_done, bdev_io); 6365 if (rc == 0) { 6366 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 6367 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 6368 } else if (rc == -ENOMEM) { 6369 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 6370 } else { 6371 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6372 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6373 } 6374 } 6375 6376 static void 6377 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6378 { 6379 struct spdk_bdev_io *parent_io = cb_arg; 6380 6381 spdk_bdev_free_io(bdev_io); 6382 6383 if (!success) { 6384 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6385 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6386 return; 6387 } 6388 6389 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 6390 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6391 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 6392 return; 6393 } 6394 6395 bdev_write_zero_buffer_next(parent_io); 6396 } 6397 6398 static void 6399 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 6400 { 6401 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6402 ctx->bdev->internal.qos_mod_in_progress = false; 6403 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6404 6405 if (ctx->cb_fn) { 6406 ctx->cb_fn(ctx->cb_arg, status); 6407 } 6408 free(ctx); 6409 } 6410 6411 static void 6412 bdev_disable_qos_done(void *cb_arg) 6413 { 6414 struct set_qos_limit_ctx *ctx = cb_arg; 6415 struct spdk_bdev *bdev = ctx->bdev; 6416 struct spdk_bdev_io *bdev_io; 6417 struct spdk_bdev_qos *qos; 6418 6419 pthread_mutex_lock(&bdev->internal.mutex); 6420 qos = bdev->internal.qos; 6421 bdev->internal.qos = NULL; 6422 pthread_mutex_unlock(&bdev->internal.mutex); 6423 6424 while (!TAILQ_EMPTY(&qos->queued)) { 6425 /* Send queued I/O back to their original thread for resubmission. 
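 * The QoS object is being torn down here, so nothing throttles these I/O any
 * longer; each one is handed back to the thread that originally submitted it.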
*/ 6426 bdev_io = TAILQ_FIRST(&qos->queued); 6427 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 6428 6429 if (bdev_io->internal.io_submit_ch) { 6430 /* 6431 * Channel was changed when sending it to the QoS thread - change it back 6432 * before sending it back to the original thread. 6433 */ 6434 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 6435 bdev_io->internal.io_submit_ch = NULL; 6436 } 6437 6438 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6439 _bdev_io_submit, bdev_io); 6440 } 6441 6442 if (qos->thread != NULL) { 6443 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 6444 spdk_poller_unregister(&qos->poller); 6445 } 6446 6447 free(qos); 6448 6449 bdev_set_qos_limit_done(ctx, 0); 6450 } 6451 6452 static void 6453 bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 6454 { 6455 void *io_device = spdk_io_channel_iter_get_io_device(i); 6456 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6457 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6458 struct spdk_thread *thread; 6459 6460 pthread_mutex_lock(&bdev->internal.mutex); 6461 thread = bdev->internal.qos->thread; 6462 pthread_mutex_unlock(&bdev->internal.mutex); 6463 6464 if (thread != NULL) { 6465 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 6466 } else { 6467 bdev_disable_qos_done(ctx); 6468 } 6469 } 6470 6471 static void 6472 bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 6473 { 6474 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6475 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6476 6477 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 6478 6479 spdk_for_each_channel_continue(i, 0); 6480 } 6481 6482 static void 6483 bdev_update_qos_rate_limit_msg(void *cb_arg) 6484 { 6485 struct set_qos_limit_ctx *ctx = cb_arg; 6486 struct spdk_bdev *bdev = ctx->bdev; 6487 6488 pthread_mutex_lock(&bdev->internal.mutex); 6489 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 6490 pthread_mutex_unlock(&bdev->internal.mutex); 6491 6492 bdev_set_qos_limit_done(ctx, 0); 6493 } 6494 6495 static void 6496 bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 6497 { 6498 void *io_device = spdk_io_channel_iter_get_io_device(i); 6499 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6500 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6501 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6502 6503 pthread_mutex_lock(&bdev->internal.mutex); 6504 bdev_enable_qos(bdev, bdev_ch); 6505 pthread_mutex_unlock(&bdev->internal.mutex); 6506 spdk_for_each_channel_continue(i, 0); 6507 } 6508 6509 static void 6510 bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 6511 { 6512 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6513 6514 bdev_set_qos_limit_done(ctx, status); 6515 } 6516 6517 static void 6518 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 6519 { 6520 int i; 6521 6522 assert(bdev->internal.qos != NULL); 6523 6524 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6525 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6526 bdev->internal.qos->rate_limits[i].limit = limits[i]; 6527 6528 if (limits[i] == 0) { 6529 bdev->internal.qos->rate_limits[i].limit = 6530 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 6531 } 6532 } 6533 } 6534 } 6535 6536 void 6537 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 6538 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 6539 { 6540 struct set_qos_limit_ctx *ctx; 6541 uint32_t 
limit_set_complement; 6542 uint64_t min_limit_per_sec; 6543 int i; 6544 bool disable_rate_limit = true; 6545 6546 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6547 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6548 continue; 6549 } 6550 6551 if (limits[i] > 0) { 6552 disable_rate_limit = false; 6553 } 6554 6555 if (bdev_qos_is_iops_rate_limit(i) == true) { 6556 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 6557 } else { 6558 /* Change from megabyte to byte rate limit */ 6559 limits[i] = limits[i] * 1024 * 1024; 6560 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 6561 } 6562 6563 limit_set_complement = limits[i] % min_limit_per_sec; 6564 if (limit_set_complement) { 6565 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 6566 limits[i], min_limit_per_sec); 6567 limits[i] += min_limit_per_sec - limit_set_complement; 6568 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 6569 } 6570 } 6571 6572 ctx = calloc(1, sizeof(*ctx)); 6573 if (ctx == NULL) { 6574 cb_fn(cb_arg, -ENOMEM); 6575 return; 6576 } 6577 6578 ctx->cb_fn = cb_fn; 6579 ctx->cb_arg = cb_arg; 6580 ctx->bdev = bdev; 6581 6582 pthread_mutex_lock(&bdev->internal.mutex); 6583 if (bdev->internal.qos_mod_in_progress) { 6584 pthread_mutex_unlock(&bdev->internal.mutex); 6585 free(ctx); 6586 cb_fn(cb_arg, -EAGAIN); 6587 return; 6588 } 6589 bdev->internal.qos_mod_in_progress = true; 6590 6591 if (disable_rate_limit == true && bdev->internal.qos) { 6592 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6593 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 6594 (bdev->internal.qos->rate_limits[i].limit > 0 && 6595 bdev->internal.qos->rate_limits[i].limit != 6596 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 6597 disable_rate_limit = false; 6598 break; 6599 } 6600 } 6601 } 6602 6603 if (disable_rate_limit == false) { 6604 if (bdev->internal.qos == NULL) { 6605 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 6606 if (!bdev->internal.qos) { 6607 pthread_mutex_unlock(&bdev->internal.mutex); 6608 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 6609 bdev_set_qos_limit_done(ctx, -ENOMEM); 6610 return; 6611 } 6612 } 6613 6614 if (bdev->internal.qos->thread == NULL) { 6615 /* Enabling */ 6616 bdev_set_qos_rate_limits(bdev, limits); 6617 6618 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6619 bdev_enable_qos_msg, ctx, 6620 bdev_enable_qos_done); 6621 } else { 6622 /* Updating */ 6623 bdev_set_qos_rate_limits(bdev, limits); 6624 6625 spdk_thread_send_msg(bdev->internal.qos->thread, 6626 bdev_update_qos_rate_limit_msg, ctx); 6627 } 6628 } else { 6629 if (bdev->internal.qos != NULL) { 6630 bdev_set_qos_rate_limits(bdev, limits); 6631 6632 /* Disabling */ 6633 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6634 bdev_disable_qos_msg, ctx, 6635 bdev_disable_qos_msg_done); 6636 } else { 6637 pthread_mutex_unlock(&bdev->internal.mutex); 6638 bdev_set_qos_limit_done(ctx, 0); 6639 return; 6640 } 6641 } 6642 6643 pthread_mutex_unlock(&bdev->internal.mutex); 6644 } 6645 6646 struct spdk_bdev_histogram_ctx { 6647 spdk_bdev_histogram_status_cb cb_fn; 6648 void *cb_arg; 6649 struct spdk_bdev *bdev; 6650 int status; 6651 }; 6652 6653 static void 6654 bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status) 6655 { 6656 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6657 6658 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6659 ctx->bdev->internal.histogram_in_progress = false; 6660 
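/* The per-channel iteration has finished; release the bdev mutex and deliver the final status to the caller's callback before freeing the context. */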
pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6661 ctx->cb_fn(ctx->cb_arg, ctx->status); 6662 free(ctx); 6663 } 6664 6665 static void 6666 bdev_histogram_disable_channel(struct spdk_io_channel_iter *i) 6667 { 6668 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6669 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6670 6671 if (ch->histogram != NULL) { 6672 spdk_histogram_data_free(ch->histogram); 6673 ch->histogram = NULL; 6674 } 6675 spdk_for_each_channel_continue(i, 0); 6676 } 6677 6678 static void 6679 bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status) 6680 { 6681 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6682 6683 if (status != 0) { 6684 ctx->status = status; 6685 ctx->bdev->internal.histogram_enabled = false; 6686 spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx, 6687 bdev_histogram_disable_channel_cb); 6688 } else { 6689 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6690 ctx->bdev->internal.histogram_in_progress = false; 6691 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6692 ctx->cb_fn(ctx->cb_arg, ctx->status); 6693 free(ctx); 6694 } 6695 } 6696 6697 static void 6698 bdev_histogram_enable_channel(struct spdk_io_channel_iter *i) 6699 { 6700 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6701 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6702 int status = 0; 6703 6704 if (ch->histogram == NULL) { 6705 ch->histogram = spdk_histogram_data_alloc(); 6706 if (ch->histogram == NULL) { 6707 status = -ENOMEM; 6708 } 6709 } 6710 6711 spdk_for_each_channel_continue(i, status); 6712 } 6713 6714 void 6715 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 6716 void *cb_arg, bool enable) 6717 { 6718 struct spdk_bdev_histogram_ctx *ctx; 6719 6720 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 6721 if (ctx == NULL) { 6722 cb_fn(cb_arg, -ENOMEM); 6723 return; 6724 } 6725 6726 ctx->bdev = bdev; 6727 ctx->status = 0; 6728 ctx->cb_fn = cb_fn; 6729 ctx->cb_arg = cb_arg; 6730 6731 pthread_mutex_lock(&bdev->internal.mutex); 6732 if (bdev->internal.histogram_in_progress) { 6733 pthread_mutex_unlock(&bdev->internal.mutex); 6734 free(ctx); 6735 cb_fn(cb_arg, -EAGAIN); 6736 return; 6737 } 6738 6739 bdev->internal.histogram_in_progress = true; 6740 pthread_mutex_unlock(&bdev->internal.mutex); 6741 6742 bdev->internal.histogram_enabled = enable; 6743 6744 if (enable) { 6745 /* Allocate histogram for each channel */ 6746 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx, 6747 bdev_histogram_enable_channel_cb); 6748 } else { 6749 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx, 6750 bdev_histogram_disable_channel_cb); 6751 } 6752 } 6753 6754 struct spdk_bdev_histogram_data_ctx { 6755 spdk_bdev_histogram_data_cb cb_fn; 6756 void *cb_arg; 6757 struct spdk_bdev *bdev; 6758 /** merged histogram data from all channels */ 6759 struct spdk_histogram_data *histogram; 6760 }; 6761 6762 static void 6763 bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status) 6764 { 6765 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6766 6767 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 6768 free(ctx); 6769 } 6770 6771 static void 6772 bdev_histogram_get_channel(struct spdk_io_channel_iter *i) 6773 { 6774 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6775 struct spdk_bdev_channel *ch = 
spdk_io_channel_get_ctx(_ch); 6776 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6777 int status = 0; 6778 6779 if (ch->histogram == NULL) { 6780 status = -EFAULT; 6781 } else { 6782 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 6783 } 6784 6785 spdk_for_each_channel_continue(i, status); 6786 } 6787 6788 void 6789 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 6790 spdk_bdev_histogram_data_cb cb_fn, 6791 void *cb_arg) 6792 { 6793 struct spdk_bdev_histogram_data_ctx *ctx; 6794 6795 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 6796 if (ctx == NULL) { 6797 cb_fn(cb_arg, -ENOMEM, NULL); 6798 return; 6799 } 6800 6801 ctx->bdev = bdev; 6802 ctx->cb_fn = cb_fn; 6803 ctx->cb_arg = cb_arg; 6804 6805 ctx->histogram = histogram; 6806 6807 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx, 6808 bdev_histogram_get_channel_cb); 6809 } 6810 6811 size_t 6812 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 6813 size_t max_events) 6814 { 6815 struct media_event_entry *entry; 6816 size_t num_events = 0; 6817 6818 for (; num_events < max_events; ++num_events) { 6819 entry = TAILQ_FIRST(&desc->pending_media_events); 6820 if (entry == NULL) { 6821 break; 6822 } 6823 6824 events[num_events] = entry->event; 6825 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 6826 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 6827 } 6828 6829 return num_events; 6830 } 6831 6832 int 6833 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 6834 size_t num_events) 6835 { 6836 struct spdk_bdev_desc *desc; 6837 struct media_event_entry *entry; 6838 size_t event_id; 6839 int rc = 0; 6840 6841 assert(bdev->media_events); 6842 6843 pthread_mutex_lock(&bdev->internal.mutex); 6844 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 6845 if (desc->write) { 6846 break; 6847 } 6848 } 6849 6850 if (desc == NULL || desc->media_events_buffer == NULL) { 6851 rc = -ENODEV; 6852 goto out; 6853 } 6854 6855 for (event_id = 0; event_id < num_events; ++event_id) { 6856 entry = TAILQ_FIRST(&desc->free_media_events); 6857 if (entry == NULL) { 6858 break; 6859 } 6860 6861 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 6862 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 6863 entry->event = events[event_id]; 6864 } 6865 6866 rc = event_id; 6867 out: 6868 pthread_mutex_unlock(&bdev->internal.mutex); 6869 return rc; 6870 } 6871 6872 void 6873 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 6874 { 6875 struct spdk_bdev_desc *desc; 6876 6877 pthread_mutex_lock(&bdev->internal.mutex); 6878 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 6879 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 6880 desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, 6881 desc->callback.ctx); 6882 } 6883 } 6884 pthread_mutex_unlock(&bdev->internal.mutex); 6885 } 6886 6887 struct locked_lba_range_ctx { 6888 struct lba_range range; 6889 struct spdk_bdev *bdev; 6890 struct lba_range *current_range; 6891 struct lba_range *owner_range; 6892 struct spdk_poller *poller; 6893 lock_range_cb cb_fn; 6894 void *cb_arg; 6895 }; 6896 6897 static void 6898 bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status) 6899 { 6900 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6901 6902 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 6903 free(ctx); 6904 } 6905 6906 static void 6907 
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i); 6908 6909 static void 6910 bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 6911 { 6912 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6913 struct spdk_bdev *bdev = ctx->bdev; 6914 6915 if (status == -ENOMEM) { 6916 /* One of the channels could not allocate a range object. 6917 * So we have to go back and clean up any ranges that were 6918 * allocated successfully before we return error status to 6919 * the caller. We can reuse the unlock function to do that 6920 * clean up. 6921 */ 6922 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6923 bdev_unlock_lba_range_get_channel, ctx, 6924 bdev_lock_error_cleanup_cb); 6925 return; 6926 } 6927 6928 /* All channels have locked this range and no I/O overlapping the range 6929 * are outstanding! Set the owner_ch for the range object for the 6930 * locking channel, so that this channel will know that it is allowed 6931 * to write to this range. 6932 */ 6933 ctx->owner_range->owner_ch = ctx->range.owner_ch; 6934 ctx->cb_fn(ctx->cb_arg, status); 6935 6936 /* Don't free the ctx here. Its range is in the bdev's global list of 6937 * locked ranges still, and will be removed and freed when this range 6938 * is later unlocked. 6939 */ 6940 } 6941 6942 static int 6943 bdev_lock_lba_range_check_io(void *_i) 6944 { 6945 struct spdk_io_channel_iter *i = _i; 6946 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6947 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6948 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6949 struct lba_range *range = ctx->current_range; 6950 struct spdk_bdev_io *bdev_io; 6951 6952 spdk_poller_unregister(&ctx->poller); 6953 6954 /* The range is now in the locked_ranges, so no new IO can be submitted to this 6955 * range. But we need to wait until any outstanding IO overlapping with this range 6956 * are completed. 6957 */ 6958 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 6959 if (bdev_io_range_is_locked(bdev_io, range)) { 6960 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 6961 return SPDK_POLLER_BUSY; 6962 } 6963 } 6964 6965 spdk_for_each_channel_continue(i, 0); 6966 return SPDK_POLLER_BUSY; 6967 } 6968 6969 static void 6970 bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i) 6971 { 6972 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6973 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6974 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6975 struct lba_range *range; 6976 6977 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 6978 if (range->length == ctx->range.length && 6979 range->offset == ctx->range.offset && 6980 range->locked_ctx == ctx->range.locked_ctx) { 6981 /* This range already exists on this channel, so don't add 6982 * it again. This can happen when a new channel is created 6983 * while the for_each_channel operation is in progress. 6984 * Do not check for outstanding I/O in that case, since the 6985 * range was locked before any I/O could be submitted to the 6986 * new channel. 
6987 */ 6988 spdk_for_each_channel_continue(i, 0); 6989 return; 6990 } 6991 } 6992 6993 range = calloc(1, sizeof(*range)); 6994 if (range == NULL) { 6995 spdk_for_each_channel_continue(i, -ENOMEM); 6996 return; 6997 } 6998 6999 range->length = ctx->range.length; 7000 range->offset = ctx->range.offset; 7001 range->locked_ctx = ctx->range.locked_ctx; 7002 ctx->current_range = range; 7003 if (ctx->range.owner_ch == ch) { 7004 /* This is the range object for the channel that will hold 7005 * the lock. Store it in the ctx object so that we can easily 7006 * set its owner_ch after the lock is finally acquired. 7007 */ 7008 ctx->owner_range = range; 7009 } 7010 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 7011 bdev_lock_lba_range_check_io(i); 7012 } 7013 7014 static void 7015 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 7016 { 7017 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 7018 7019 /* We will add a copy of this range to each channel now. */ 7020 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx, 7021 bdev_lock_lba_range_cb); 7022 } 7023 7024 static bool 7025 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 7026 { 7027 struct lba_range *r; 7028 7029 TAILQ_FOREACH(r, tailq, tailq) { 7030 if (bdev_lba_range_overlapped(range, r)) { 7031 return true; 7032 } 7033 } 7034 return false; 7035 } 7036 7037 static int 7038 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 7039 uint64_t offset, uint64_t length, 7040 lock_range_cb cb_fn, void *cb_arg) 7041 { 7042 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7043 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7044 struct locked_lba_range_ctx *ctx; 7045 7046 if (cb_arg == NULL) { 7047 SPDK_ERRLOG("cb_arg must not be NULL\n"); 7048 return -EINVAL; 7049 } 7050 7051 ctx = calloc(1, sizeof(*ctx)); 7052 if (ctx == NULL) { 7053 return -ENOMEM; 7054 } 7055 7056 ctx->range.offset = offset; 7057 ctx->range.length = length; 7058 ctx->range.owner_ch = ch; 7059 ctx->range.locked_ctx = cb_arg; 7060 ctx->bdev = bdev; 7061 ctx->cb_fn = cb_fn; 7062 ctx->cb_arg = cb_arg; 7063 7064 pthread_mutex_lock(&bdev->internal.mutex); 7065 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 7066 /* There is an active lock overlapping with this range. 7067 * Put it on the pending list until this range no 7068 * longer overlaps with another. 7069 */ 7070 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 7071 } else { 7072 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 7073 bdev_lock_lba_range_ctx(bdev, ctx); 7074 } 7075 pthread_mutex_unlock(&bdev->internal.mutex); 7076 return 0; 7077 } 7078 7079 static void 7080 bdev_lock_lba_range_ctx_msg(void *_ctx) 7081 { 7082 struct locked_lba_range_ctx *ctx = _ctx; 7083 7084 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 7085 } 7086 7087 static void 7088 bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 7089 { 7090 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7091 struct locked_lba_range_ctx *pending_ctx; 7092 struct spdk_bdev_channel *ch = ctx->range.owner_ch; 7093 struct spdk_bdev *bdev = ch->bdev; 7094 struct lba_range *range, *tmp; 7095 7096 pthread_mutex_lock(&bdev->internal.mutex); 7097 /* Check if there are any pending locked ranges that overlap with this range 7098 * that was just unlocked. 
If there are, check that it doesn't overlap with any 7099 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 7100 * the lock process. 7101 */ 7102 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 7103 if (bdev_lba_range_overlapped(range, &ctx->range) && 7104 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 7105 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 7106 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 7107 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 7108 spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), 7109 bdev_lock_lba_range_ctx_msg, pending_ctx); 7110 } 7111 } 7112 pthread_mutex_unlock(&bdev->internal.mutex); 7113 7114 ctx->cb_fn(ctx->cb_arg, status); 7115 free(ctx); 7116 } 7117 7118 static void 7119 bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i) 7120 { 7121 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7122 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7123 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7124 TAILQ_HEAD(, spdk_bdev_io) io_locked; 7125 struct spdk_bdev_io *bdev_io; 7126 struct lba_range *range; 7127 7128 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 7129 if (ctx->range.offset == range->offset && 7130 ctx->range.length == range->length && 7131 ctx->range.locked_ctx == range->locked_ctx) { 7132 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 7133 free(range); 7134 break; 7135 } 7136 } 7137 7138 /* Note: we should almost always be able to assert that the range specified 7139 * was found. But there are some very rare corner cases where a new channel 7140 * gets created simultaneously with a range unlock, where this function 7141 * would execute on that new channel and wouldn't have the range. 7142 * We also use this to clean up range allocations when a later allocation 7143 * fails in the locking path. 7144 * So we can't actually assert() here. 7145 */ 7146 7147 /* Swap the locked IO into a temporary list, and then try to submit them again. 7148 * We could hyper-optimize this to only resubmit locked I/O that overlap 7149 * with the range that was just unlocked, but this isn't a performance path so 7150 * we go for simplicity here. 7151 */ 7152 TAILQ_INIT(&io_locked); 7153 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 7154 while (!TAILQ_EMPTY(&io_locked)) { 7155 bdev_io = TAILQ_FIRST(&io_locked); 7156 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 7157 bdev_io_submit(bdev_io); 7158 } 7159 7160 spdk_for_each_channel_continue(i, 0); 7161 } 7162 7163 static int 7164 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 7165 uint64_t offset, uint64_t length, 7166 lock_range_cb cb_fn, void *cb_arg) 7167 { 7168 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7169 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7170 struct locked_lba_range_ctx *ctx; 7171 struct lba_range *range; 7172 bool range_found = false; 7173 7174 /* Let's make sure the specified channel actually has a lock on 7175 * the specified range. Note that the range must match exactly. 
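 * A request that only partially overlaps an existing lock is not treated as a
 * match and is rejected with -EINVAL below.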
7176 */ 7177 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 7178 if (range->offset == offset && range->length == length && 7179 range->owner_ch == ch && range->locked_ctx == cb_arg) { 7180 range_found = true; 7181 break; 7182 } 7183 } 7184 7185 if (!range_found) { 7186 return -EINVAL; 7187 } 7188 7189 pthread_mutex_lock(&bdev->internal.mutex); 7190 /* We confirmed that this channel has locked the specified range. To 7191 * start the unlock process, we find the range in the bdev's locked_ranges 7192 * and remove it. This ensures new channels don't inherit the locked range. 7193 * Then we will send a message to each channel (including the one specified 7194 * here) to remove the range from its per-channel list. 7195 */ 7196 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 7197 if (range->offset == offset && range->length == length && 7198 range->locked_ctx == cb_arg) { 7199 break; 7200 } 7201 } 7202 if (range == NULL) { 7203 assert(false); 7204 pthread_mutex_unlock(&bdev->internal.mutex); 7205 return -EINVAL; 7206 } 7207 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 7208 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 7209 pthread_mutex_unlock(&bdev->internal.mutex); 7210 7211 ctx->cb_fn = cb_fn; 7212 ctx->cb_arg = cb_arg; 7213 7214 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx, 7215 bdev_unlock_lba_range_cb); 7216 return 0; 7217 } 7218 7219 int 7220 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 7221 int array_size) 7222 { 7223 if (!bdev) { 7224 return -EINVAL; 7225 } 7226 7227 if (bdev->fn_table->get_memory_domains) { 7228 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 7229 } 7230 7231 return 0; 7232 } 7233 7234 SPDK_LOG_REGISTER_COMPONENT(bdev) 7235 7236 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 7237 { 7238 struct spdk_trace_tpoint_opts opts[] = { 7239 { 7240 "BDEV_IO_START", TRACE_BDEV_IO_START, 7241 OWNER_BDEV, OBJECT_BDEV_IO, 1, 7242 { 7243 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 7244 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 7245 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 7246 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 } 7247 } 7248 }, 7249 { 7250 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 7251 OWNER_BDEV, OBJECT_BDEV_IO, 0, 7252 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 7253 }, 7254 { 7255 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 7256 OWNER_BDEV, OBJECT_NONE, 1, 7257 { 7258 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 7259 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 7260 } 7261 }, 7262 { 7263 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 7264 OWNER_BDEV, OBJECT_NONE, 0, 7265 { 7266 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 7267 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 7268 } 7269 }, 7270 }; 7271 7272 7273 spdk_trace_register_owner(OWNER_BDEV, 'b'); 7274 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 7275 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 7276 } 7277
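/*
 * Illustrative usage sketch (kept as a comment so it is not compiled into the
 * library): how an application running on an SPDK thread might open a bdev
 * descriptor with spdk_bdev_open_ext() and close it again when it receives a
 * remove event. The bdev name "Malloc0", the example_ctx structure and the
 * example_* function names are hypothetical; a real application typically
 * keeps the descriptor in its own context object like this.
 *
 *	struct example_ctx {
 *		struct spdk_bdev_desc *desc;
 *	};
 *
 *	static void
 *	example_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
 *			 void *event_ctx)
 *	{
 *		struct example_ctx *ctx = event_ctx;
 *
 *		// The remove notification is delivered on the thread that opened
 *		// the descriptor, so it is safe to close it here.
 *		if (type == SPDK_BDEV_EVENT_REMOVE && ctx->desc != NULL) {
 *			spdk_bdev_close(ctx->desc);
 *			ctx->desc = NULL;
 *		}
 *	}
 *
 *	static int
 *	example_open(struct example_ctx *ctx)
 *	{
 *		int rc;
 *
 *		// Open for writing; the descriptor is returned through ctx->desc.
 *		rc = spdk_bdev_open_ext("Malloc0", true, example_event_cb, ctx,
 *					&ctx->desc);
 *		if (rc != 0) {
 *			SPDK_ERRLOG("Could not open bdev: %s\n", spdk_strerror(-rc));
 *		}
 *		return rc;
 *	}
 */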