1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. All rights reserved. 5 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 6 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * * Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * * Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * * Neither the name of Intel Corporation nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 26 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 #include "spdk/stdinc.h" 36 37 #include "spdk/bdev.h" 38 39 #include "spdk/config.h" 40 #include "spdk/env.h" 41 #include "spdk/thread.h" 42 #include "spdk/likely.h" 43 #include "spdk/queue.h" 44 #include "spdk/nvme_spec.h" 45 #include "spdk/scsi_spec.h" 46 #include "spdk/notify.h" 47 #include "spdk/util.h" 48 #include "spdk/trace.h" 49 50 #include "spdk/bdev_module.h" 51 #include "spdk/log.h" 52 #include "spdk/string.h" 53 54 #include "bdev_internal.h" 55 #include "spdk_internal/trace_defs.h" 56 57 #ifdef SPDK_CONFIG_VTUNE 58 #include "ittnotify.h" 59 #include "ittnotify_types.h" 60 int __itt_init_ittlib(const char *, __itt_group_id); 61 #endif 62 63 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 64 #define SPDK_BDEV_IO_CACHE_SIZE 256 65 #define SPDK_BDEV_AUTO_EXAMINE true 66 #define BUF_SMALL_POOL_SIZE 8191 67 #define BUF_LARGE_POOL_SIZE 1023 68 #define NOMEM_THRESHOLD_COUNT 8 69 #define ZERO_BUFFER_SIZE 0x100000 70 71 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 72 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 73 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 74 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 75 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 76 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 77 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 78 79 #define SPDK_BDEV_POOL_ALIGNMENT 512 80 81 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 82 * when splitting into children requests at a time. 
83 */ 84 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 85 86 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 87 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 88 }; 89 90 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 91 92 RB_HEAD(bdev_name_tree, spdk_bdev_name); 93 94 static int 95 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 96 { 97 return strcmp(name1->name, name2->name); 98 } 99 100 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 101 102 struct spdk_bdev_mgr { 103 struct spdk_mempool *bdev_io_pool; 104 105 struct spdk_mempool *buf_small_pool; 106 struct spdk_mempool *buf_large_pool; 107 108 void *zero_buffer; 109 110 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 111 112 struct spdk_bdev_list bdevs; 113 struct bdev_name_tree bdev_names; 114 115 bool init_complete; 116 bool module_init_complete; 117 118 pthread_mutex_t mutex; 119 120 #ifdef SPDK_CONFIG_VTUNE 121 __itt_domain *domain; 122 #endif 123 }; 124 125 static struct spdk_bdev_mgr g_bdev_mgr = { 126 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 127 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 128 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 129 .init_complete = false, 130 .module_init_complete = false, 131 .mutex = PTHREAD_MUTEX_INITIALIZER, 132 }; 133 134 typedef void (*lock_range_cb)(void *ctx, int status); 135 136 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 137 138 struct lba_range { 139 uint64_t offset; 140 uint64_t length; 141 void *locked_ctx; 142 struct spdk_bdev_channel *owner_ch; 143 TAILQ_ENTRY(lba_range) tailq; 144 }; 145 146 static struct spdk_bdev_opts g_bdev_opts = { 147 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 148 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 149 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 150 .small_buf_pool_size = BUF_SMALL_POOL_SIZE, 151 .large_buf_pool_size = BUF_LARGE_POOL_SIZE, 152 }; 153 154 static spdk_bdev_init_cb g_init_cb_fn = NULL; 155 static void *g_init_cb_arg = NULL; 156 157 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 158 static void *g_fini_cb_arg = NULL; 159 static struct spdk_thread *g_fini_thread = NULL; 160 161 struct spdk_bdev_qos_limit { 162 /** IOs or bytes allowed per second (i.e., 1s). */ 163 uint64_t limit; 164 165 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 166 * For remaining bytes, allowed to run negative if an I/O is submitted when 167 * some bytes are remaining, but the I/O is bigger than that amount. The 168 * excess will be deducted from the next timeslice. 169 */ 170 int64_t remaining_this_timeslice; 171 172 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 173 uint32_t min_per_timeslice; 174 175 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 176 uint32_t max_per_timeslice; 177 178 /** Function to check whether to queue the IO. */ 179 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 180 181 /** Function to update for the submitted IO. */ 182 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 183 }; 184 185 struct spdk_bdev_qos { 186 /** Types of structure of rate limits. */ 187 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 188 189 /** The channel that all I/O are funneled through. */ 190 struct spdk_bdev_channel *ch; 191 192 /** The thread on which the poller is running. 
*/ 193 struct spdk_thread *thread; 194 195 /** Queue of I/O waiting to be issued. */ 196 bdev_io_tailq_t queued; 197 198 /** Size of a timeslice in tsc ticks. */ 199 uint64_t timeslice_size; 200 201 /** Timestamp of start of last timeslice. */ 202 uint64_t last_timeslice; 203 204 /** Poller that processes queued I/O commands each time slice. */ 205 struct spdk_poller *poller; 206 }; 207 208 struct spdk_bdev_mgmt_channel { 209 bdev_io_stailq_t need_buf_small; 210 bdev_io_stailq_t need_buf_large; 211 212 /* 213 * Each thread keeps a cache of bdev_io - this allows 214 * bdev threads which are *not* DPDK threads to still 215 * benefit from a per-thread bdev_io cache. Without 216 * this, non-DPDK threads fetching from the mempool 217 * incur a cmpxchg on get and put. 218 */ 219 bdev_io_stailq_t per_thread_cache; 220 uint32_t per_thread_cache_count; 221 uint32_t bdev_io_cache_size; 222 223 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 224 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 225 }; 226 227 /* 228 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 229 * will queue here their IO that awaits retry. It makes it possible to retry sending 230 * IO to one bdev after IO from other bdev completes. 231 */ 232 struct spdk_bdev_shared_resource { 233 /* The bdev management channel */ 234 struct spdk_bdev_mgmt_channel *mgmt_ch; 235 236 /* 237 * Count of I/O submitted to bdev module and waiting for completion. 238 * Incremented before submit_request() is called on an spdk_bdev_io. 239 */ 240 uint64_t io_outstanding; 241 242 /* 243 * Queue of IO awaiting retry because of a previous NOMEM status returned 244 * on this channel. 245 */ 246 bdev_io_tailq_t nomem_io; 247 248 /* 249 * Threshold which io_outstanding must drop to before retrying nomem_io. 250 */ 251 uint64_t nomem_threshold; 252 253 /* I/O channel allocated by a bdev module */ 254 struct spdk_io_channel *shared_ch; 255 256 /* Refcount of bdev channels using this resource */ 257 uint32_t ref; 258 259 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 260 }; 261 262 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 263 #define BDEV_CH_QOS_ENABLED (1 << 1) 264 265 struct spdk_bdev_channel { 266 struct spdk_bdev *bdev; 267 268 /* The channel for the underlying device */ 269 struct spdk_io_channel *channel; 270 271 /* Per io_device per thread data */ 272 struct spdk_bdev_shared_resource *shared_resource; 273 274 struct spdk_bdev_io_stat stat; 275 276 /* 277 * Count of I/O submitted to the underlying dev module through this channel 278 * and waiting for completion. 279 */ 280 uint64_t io_outstanding; 281 282 /* 283 * List of all submitted I/Os including I/O that are generated via splitting. 284 */ 285 bdev_io_tailq_t io_submitted; 286 287 /* 288 * List of spdk_bdev_io that are currently queued because they write to a locked 289 * LBA range. 
290 */ 291 bdev_io_tailq_t io_locked; 292 293 uint32_t flags; 294 295 struct spdk_histogram_data *histogram; 296 297 #ifdef SPDK_CONFIG_VTUNE 298 uint64_t start_tsc; 299 uint64_t interval_tsc; 300 __itt_string_handle *handle; 301 struct spdk_bdev_io_stat prev_stat; 302 #endif 303 304 bdev_io_tailq_t queued_resets; 305 306 lba_range_tailq_t locked_ranges; 307 }; 308 309 struct media_event_entry { 310 struct spdk_bdev_media_event event; 311 TAILQ_ENTRY(media_event_entry) tailq; 312 }; 313 314 #define MEDIA_EVENT_POOL_SIZE 64 315 316 struct spdk_bdev_desc { 317 struct spdk_bdev *bdev; 318 struct spdk_thread *thread; 319 struct { 320 spdk_bdev_event_cb_t event_fn; 321 void *ctx; 322 } callback; 323 bool closed; 324 bool write; 325 pthread_mutex_t mutex; 326 uint32_t refs; 327 TAILQ_HEAD(, media_event_entry) pending_media_events; 328 TAILQ_HEAD(, media_event_entry) free_media_events; 329 struct media_event_entry *media_events_buffer; 330 TAILQ_ENTRY(spdk_bdev_desc) link; 331 332 uint64_t timeout_in_sec; 333 spdk_bdev_io_timeout_cb cb_fn; 334 void *cb_arg; 335 struct spdk_poller *io_timeout_poller; 336 }; 337 338 struct spdk_bdev_iostat_ctx { 339 struct spdk_bdev_io_stat *stat; 340 spdk_bdev_get_device_stat_cb cb; 341 void *cb_arg; 342 }; 343 344 struct set_qos_limit_ctx { 345 void (*cb_fn)(void *cb_arg, int status); 346 void *cb_arg; 347 struct spdk_bdev *bdev; 348 }; 349 350 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 351 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 352 353 static inline void bdev_io_complete(void *ctx); 354 355 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 356 static void bdev_write_zero_buffer_next(void *_bdev_io); 357 358 static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i); 359 static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status); 360 361 static int 362 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 363 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 364 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 365 struct spdk_bdev_ext_io_opts *opts); 366 static int 367 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 368 struct iovec *iov, int iovcnt, void *md_buf, 369 uint64_t offset_blocks, uint64_t num_blocks, 370 spdk_bdev_io_completion_cb cb, void *cb_arg, 371 struct spdk_bdev_ext_io_opts *opts); 372 373 static int 374 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 375 uint64_t offset, uint64_t length, 376 lock_range_cb cb_fn, void *cb_arg); 377 378 static int 379 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 380 uint64_t offset, uint64_t length, 381 lock_range_cb cb_fn, void *cb_arg); 382 383 static inline void bdev_io_complete(void *ctx); 384 385 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 386 static bool bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort); 387 388 void 389 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 390 { 391 if (!opts) { 392 SPDK_ERRLOG("opts should not be NULL\n"); 393 return; 394 } 395 396 if (!opts_size) { 397 SPDK_ERRLOG("opts_size should not be zero value\n"); 398 return; 399 } 400 401 opts->opts_size = opts_size; 402 403 #define SET_FIELD(field) \ 404 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 405 opts->field = 
g_bdev_opts.field; \ 406 } \ 407 408 SET_FIELD(bdev_io_pool_size); 409 SET_FIELD(bdev_io_cache_size); 410 SET_FIELD(bdev_auto_examine); 411 SET_FIELD(small_buf_pool_size); 412 SET_FIELD(large_buf_pool_size); 413 414 /* Do not remove this statement, you should always update this statement when you adding a new field, 415 * and do not forget to add the SET_FIELD statement for your added field. */ 416 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 417 418 #undef SET_FIELD 419 } 420 421 int 422 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 423 { 424 uint32_t min_pool_size; 425 426 if (!opts) { 427 SPDK_ERRLOG("opts cannot be NULL\n"); 428 return -1; 429 } 430 431 if (!opts->opts_size) { 432 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 433 return -1; 434 } 435 436 /* 437 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 438 * initialization. A second mgmt_ch will be created on the same thread when the application starts 439 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 440 */ 441 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 442 if (opts->bdev_io_pool_size < min_pool_size) { 443 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 444 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 445 spdk_thread_get_count()); 446 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 447 return -1; 448 } 449 450 if (opts->small_buf_pool_size < BUF_SMALL_POOL_SIZE) { 451 SPDK_ERRLOG("small_buf_pool_size must be at least %" PRIu32 "\n", BUF_SMALL_POOL_SIZE); 452 return -1; 453 } 454 455 if (opts->large_buf_pool_size < BUF_LARGE_POOL_SIZE) { 456 SPDK_ERRLOG("large_buf_pool_size must be at least %" PRIu32 "\n", BUF_LARGE_POOL_SIZE); 457 return -1; 458 } 459 460 #define SET_FIELD(field) \ 461 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 462 g_bdev_opts.field = opts->field; \ 463 } \ 464 465 SET_FIELD(bdev_io_pool_size); 466 SET_FIELD(bdev_io_cache_size); 467 SET_FIELD(bdev_auto_examine); 468 SET_FIELD(small_buf_pool_size); 469 SET_FIELD(large_buf_pool_size); 470 471 g_bdev_opts.opts_size = opts->opts_size; 472 473 #undef SET_FIELD 474 475 return 0; 476 } 477 478 static struct spdk_bdev * 479 bdev_get_by_name(const char *bdev_name) 480 { 481 struct spdk_bdev_name find; 482 struct spdk_bdev_name *res; 483 484 find.name = (char *)bdev_name; 485 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 486 if (res != NULL) { 487 return res->bdev; 488 } 489 490 return NULL; 491 } 492 493 struct spdk_bdev * 494 spdk_bdev_get_by_name(const char *bdev_name) 495 { 496 struct spdk_bdev *bdev; 497 498 pthread_mutex_lock(&g_bdev_mgr.mutex); 499 bdev = bdev_get_by_name(bdev_name); 500 pthread_mutex_unlock(&g_bdev_mgr.mutex); 501 502 return bdev; 503 } 504 505 struct spdk_bdev_wait_for_examine_ctx { 506 struct spdk_poller *poller; 507 spdk_bdev_wait_for_examine_cb cb_fn; 508 void *cb_arg; 509 }; 510 511 static bool 512 bdev_module_all_actions_completed(void); 513 514 static int 515 bdev_wait_for_examine_cb(void *arg) 516 { 517 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 518 519 if (!bdev_module_all_actions_completed()) { 520 return SPDK_POLLER_IDLE; 521 } 522 523 spdk_poller_unregister(&ctx->poller); 524 ctx->cb_fn(ctx->cb_arg); 525 free(ctx); 526 527 return SPDK_POLLER_BUSY; 528 } 529 530 int 531 
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 532 { 533 struct spdk_bdev_wait_for_examine_ctx *ctx; 534 535 ctx = calloc(1, sizeof(*ctx)); 536 if (ctx == NULL) { 537 return -ENOMEM; 538 } 539 ctx->cb_fn = cb_fn; 540 ctx->cb_arg = cb_arg; 541 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 542 543 return 0; 544 } 545 546 struct spdk_bdev_examine_item { 547 char *name; 548 TAILQ_ENTRY(spdk_bdev_examine_item) link; 549 }; 550 551 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 552 553 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 554 g_bdev_examine_allowlist); 555 556 static inline bool 557 bdev_examine_allowlist_check(const char *name) 558 { 559 struct spdk_bdev_examine_item *item; 560 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 561 if (strcmp(name, item->name) == 0) { 562 return true; 563 } 564 } 565 return false; 566 } 567 568 static inline void 569 bdev_examine_allowlist_free(void) 570 { 571 struct spdk_bdev_examine_item *item; 572 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 573 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 574 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 575 free(item->name); 576 free(item); 577 } 578 } 579 580 static inline bool 581 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 582 { 583 struct spdk_bdev_alias *tmp; 584 if (bdev_examine_allowlist_check(bdev->name)) { 585 return true; 586 } 587 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 588 if (bdev_examine_allowlist_check(tmp->alias.name)) { 589 return true; 590 } 591 } 592 return false; 593 } 594 595 static inline bool 596 bdev_ok_to_examine(struct spdk_bdev *bdev) 597 { 598 if (g_bdev_opts.bdev_auto_examine) { 599 return true; 600 } else { 601 return bdev_in_examine_allowlist(bdev); 602 } 603 } 604 605 static void 606 bdev_examine(struct spdk_bdev *bdev) 607 { 608 struct spdk_bdev_module *module; 609 uint32_t action; 610 611 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 612 if (module->examine_config && bdev_ok_to_examine(bdev)) { 613 action = module->internal.action_in_progress; 614 module->internal.action_in_progress++; 615 module->examine_config(bdev); 616 if (action != module->internal.action_in_progress) { 617 SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", 618 module->name); 619 } 620 } 621 } 622 623 if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) { 624 if (bdev->internal.claim_module->examine_disk) { 625 bdev->internal.claim_module->internal.action_in_progress++; 626 bdev->internal.claim_module->examine_disk(bdev); 627 } 628 return; 629 } 630 631 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 632 if (module->examine_disk && bdev_ok_to_examine(bdev)) { 633 module->internal.action_in_progress++; 634 module->examine_disk(bdev); 635 } 636 } 637 } 638 639 int 640 spdk_bdev_examine(const char *name) 641 { 642 struct spdk_bdev *bdev; 643 struct spdk_bdev_examine_item *item; 644 645 if (g_bdev_opts.bdev_auto_examine) { 646 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 647 return -EINVAL; 648 } 649 650 if (bdev_examine_allowlist_check(name)) { 651 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 652 return -EEXIST; 653 } 654 655 item = calloc(1, sizeof(*item)); 656 if (!item) { 657 return -ENOMEM; 658 } 659 item->name = strdup(name); 660 if (!item->name) { 661 free(item); 662 return -ENOMEM; 663 } 664 
TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 665 666 bdev = spdk_bdev_get_by_name(name); 667 if (bdev) { 668 bdev_examine(bdev); 669 } 670 return 0; 671 } 672 673 static inline void 674 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 675 { 676 struct spdk_bdev_examine_item *item; 677 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 678 spdk_json_write_object_begin(w); 679 spdk_json_write_named_string(w, "method", "bdev_examine"); 680 spdk_json_write_named_object_begin(w, "params"); 681 spdk_json_write_named_string(w, "name", item->name); 682 spdk_json_write_object_end(w); 683 spdk_json_write_object_end(w); 684 } 685 } 686 687 struct spdk_bdev * 688 spdk_bdev_first(void) 689 { 690 struct spdk_bdev *bdev; 691 692 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 693 if (bdev) { 694 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 695 } 696 697 return bdev; 698 } 699 700 struct spdk_bdev * 701 spdk_bdev_next(struct spdk_bdev *prev) 702 { 703 struct spdk_bdev *bdev; 704 705 bdev = TAILQ_NEXT(prev, internal.link); 706 if (bdev) { 707 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 708 } 709 710 return bdev; 711 } 712 713 static struct spdk_bdev * 714 _bdev_next_leaf(struct spdk_bdev *bdev) 715 { 716 while (bdev != NULL) { 717 if (bdev->internal.claim_module == NULL) { 718 return bdev; 719 } else { 720 bdev = TAILQ_NEXT(bdev, internal.link); 721 } 722 } 723 724 return bdev; 725 } 726 727 struct spdk_bdev * 728 spdk_bdev_first_leaf(void) 729 { 730 struct spdk_bdev *bdev; 731 732 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 733 734 if (bdev) { 735 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 736 } 737 738 return bdev; 739 } 740 741 struct spdk_bdev * 742 spdk_bdev_next_leaf(struct spdk_bdev *prev) 743 { 744 struct spdk_bdev *bdev; 745 746 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 747 748 if (bdev) { 749 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 750 } 751 752 return bdev; 753 } 754 755 void 756 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 757 { 758 struct iovec *iovs; 759 760 if (bdev_io->u.bdev.iovs == NULL) { 761 bdev_io->u.bdev.iovs = &bdev_io->iov; 762 bdev_io->u.bdev.iovcnt = 1; 763 } 764 765 iovs = bdev_io->u.bdev.iovs; 766 767 assert(iovs != NULL); 768 assert(bdev_io->u.bdev.iovcnt >= 1); 769 770 iovs[0].iov_base = buf; 771 iovs[0].iov_len = len; 772 } 773 774 void 775 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 776 { 777 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 778 bdev_io->u.bdev.md_buf = md_buf; 779 } 780 781 static bool 782 _is_buf_allocated(const struct iovec *iovs) 783 { 784 if (iovs == NULL) { 785 return false; 786 } 787 788 return iovs[0].iov_base != NULL; 789 } 790 791 static bool 792 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 793 { 794 int i; 795 uintptr_t iov_base; 796 797 if (spdk_likely(alignment == 1)) { 798 return true; 799 } 800 801 for (i = 0; i < iovcnt; i++) { 802 iov_base = (uintptr_t)iovs[i].iov_base; 803 if ((iov_base & (alignment - 1)) != 0) { 804 return false; 805 } 806 } 807 808 return true; 809 } 810 811 static void 812 _copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt) 813 { 814 int i; 815 size_t len; 816 817 for (i = 0; i < iovcnt; i++) { 818 len = spdk_min(iovs[i].iov_len, buf_len); 819 memcpy(buf, iovs[i].iov_base, len); 820 buf += len; 821 buf_len -= len; 822 } 
823 } 824 825 static void 826 _copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len) 827 { 828 int i; 829 size_t len; 830 831 for (i = 0; i < iovcnt; i++) { 832 len = spdk_min(iovs[i].iov_len, buf_len); 833 memcpy(iovs[i].iov_base, buf, len); 834 buf += len; 835 buf_len -= len; 836 } 837 } 838 839 static void 840 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 841 { 842 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 843 void *buf; 844 845 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 846 buf = bdev_io->internal.buf; 847 bdev_io->internal.buf = NULL; 848 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 849 bdev_io->internal.get_aux_buf_cb = NULL; 850 } else { 851 assert(bdev_io->internal.get_buf_cb != NULL); 852 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 853 bdev_io->internal.get_buf_cb = NULL; 854 } 855 } 856 857 static void 858 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 859 { 860 struct spdk_bdev_io *bdev_io = ctx; 861 862 if (rc) { 863 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 864 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 865 } 866 bdev_io_get_buf_complete(bdev_io, !rc); 867 } 868 869 static void 870 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 871 { 872 /* save original md_buf */ 873 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 874 bdev_io->internal.orig_md_iov.iov_len = len; 875 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 876 bdev_io->internal.bounce_md_iov.iov_len = len; 877 /* set bounce md_buf */ 878 bdev_io->u.bdev.md_buf = md_buf; 879 880 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 881 memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); 882 } 883 884 assert(bdev_io->internal.data_transfer_cpl); 885 bdev_io->internal.data_transfer_cpl(bdev_io, 0); 886 } 887 888 static void 889 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 890 { 891 struct spdk_bdev *bdev = bdev_io->bdev; 892 uint64_t md_len; 893 void *buf; 894 895 if (spdk_bdev_is_md_separate(bdev)) { 896 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 897 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 898 899 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 900 901 if (bdev_io->u.bdev.md_buf != NULL) { 902 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 903 return; 904 } else { 905 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 906 } 907 } 908 909 bdev_io_get_buf_complete(bdev_io, true); 910 } 911 912 static void 913 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) 914 { 915 struct spdk_bdev_io *bdev_io = ctx; 916 917 if (rc) { 918 SPDK_ERRLOG("Failed to get data buffer\n"); 919 assert(bdev_io->internal.data_transfer_cpl); 920 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 921 return; 922 } 923 924 _bdev_io_set_md_buf(bdev_io); 925 } 926 927 static void 928 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 929 bdev_copy_bounce_buffer_cpl cpl_cb) 930 { 931 bdev_io->internal.data_transfer_cpl = cpl_cb; 932 /* save original iovec */ 933 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 934 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 935 /* set bounce iov */ 936 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 937 bdev_io->u.bdev.iovcnt = 1; 938 /* set bounce buffer for this operation */ 939 bdev_io->u.bdev.iovs[0].iov_base = buf; 940 bdev_io->u.bdev.iovs[0].iov_len = len; 941 /* if 
this is write path, copy data from original buffer to bounce buffer */ 942 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 943 _copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 944 } 945 946 _bdev_io_pull_bounce_data_buf_done(bdev_io, 0); 947 } 948 949 static void 950 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 951 { 952 struct spdk_bdev *bdev = bdev_io->bdev; 953 bool buf_allocated; 954 uint64_t alignment; 955 void *aligned_buf; 956 957 bdev_io->internal.buf = buf; 958 959 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 960 bdev_io_get_buf_complete(bdev_io, true); 961 return; 962 } 963 964 alignment = spdk_bdev_get_buf_align(bdev); 965 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 966 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 967 968 if (buf_allocated) { 969 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 970 /* Continue in completion callback */ 971 return; 972 } else { 973 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 974 } 975 976 _bdev_io_set_md_buf(bdev_io); 977 } 978 979 static void 980 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 981 { 982 struct spdk_bdev *bdev = bdev_io->bdev; 983 struct spdk_mempool *pool; 984 struct spdk_bdev_io *tmp; 985 bdev_io_stailq_t *stailq; 986 struct spdk_bdev_mgmt_channel *ch; 987 uint64_t md_len, alignment; 988 989 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 990 alignment = spdk_bdev_get_buf_align(bdev); 991 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 992 993 if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 994 SPDK_BDEV_POOL_ALIGNMENT) { 995 pool = g_bdev_mgr.buf_small_pool; 996 stailq = &ch->need_buf_small; 997 } else { 998 pool = g_bdev_mgr.buf_large_pool; 999 stailq = &ch->need_buf_large; 1000 } 1001 1002 if (STAILQ_EMPTY(stailq)) { 1003 spdk_mempool_put(pool, buf); 1004 } else { 1005 tmp = STAILQ_FIRST(stailq); 1006 STAILQ_REMOVE_HEAD(stailq, internal.buf_link); 1007 _bdev_io_set_buf(tmp, buf, tmp->internal.buf_len); 1008 } 1009 } 1010 1011 static void 1012 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1013 { 1014 assert(bdev_io->internal.buf != NULL); 1015 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1016 bdev_io->internal.buf = NULL; 1017 } 1018 1019 void 1020 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1021 { 1022 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1023 1024 assert(buf != NULL); 1025 _bdev_io_put_buf(bdev_io, buf, len); 1026 } 1027 1028 static void 1029 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1030 { 1031 struct spdk_bdev *bdev = bdev_ch->bdev; 1032 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1033 struct spdk_bdev_io *bdev_io; 1034 1035 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1036 /* 1037 * Allow some more I/O to complete before retrying the nomem_io queue. 1038 * Some drivers (such as nvme) cannot immediately take a new I/O in 1039 * the context of a completion, because the resources for the I/O are 1040 * not released until control returns to the bdev poller. Also, we 1041 * may require several small I/O to complete before a larger I/O 1042 * (that requires splitting) can be submitted. 
1043 */ 1044 return; 1045 } 1046 1047 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1048 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1049 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1050 bdev_io->internal.ch->io_outstanding++; 1051 shared_resource->io_outstanding++; 1052 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1053 bdev_io->internal.error.nvme.cdw0 = 0; 1054 bdev_io->num_retries++; 1055 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1056 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 1057 break; 1058 } 1059 } 1060 } 1061 1062 static inline void 1063 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1064 struct spdk_bdev_shared_resource *shared_resource) 1065 { 1066 assert(bdev_ch->io_outstanding > 0); 1067 assert(shared_resource->io_outstanding > 0); 1068 bdev_ch->io_outstanding--; 1069 shared_resource->io_outstanding--; 1070 } 1071 1072 static inline bool 1073 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io) 1074 { 1075 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1076 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1077 1078 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1079 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 1080 /* 1081 * Wait for some of the outstanding I/O to complete before we 1082 * retry any of the nomem_io. Normally we will wait for 1083 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 1084 * depth channels we will instead wait for half to complete. 1085 */ 1086 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 1087 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 1088 return true; 1089 } 1090 1091 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1092 bdev_ch_retry_io(bdev_ch); 1093 } 1094 1095 return false; 1096 } 1097 1098 static void 1099 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1100 { 1101 struct spdk_bdev_io *bdev_io = ctx; 1102 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1103 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1104 1105 if (rc) { 1106 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1107 } 1108 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1109 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 
1110 */ 1111 bdev_io_put_buf(bdev_io); 1112 1113 /* Continue with IO completion flow */ 1114 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 1115 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 1116 return; 1117 } 1118 1119 bdev_io_complete(bdev_io); 1120 } 1121 1122 static inline void 1123 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) 1124 { 1125 /* do the same for metadata buffer */ 1126 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1127 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1128 1129 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1130 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1131 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1132 bdev_io->internal.orig_md_iov.iov_len); 1133 } 1134 } 1135 1136 assert(bdev_io->internal.data_transfer_cpl); 1137 bdev_io->internal.data_transfer_cpl(bdev_io, 0); 1138 } 1139 1140 static void 1141 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) 1142 { 1143 struct spdk_bdev_io *bdev_io = ctx; 1144 1145 assert(bdev_io->internal.data_transfer_cpl); 1146 1147 if (rc) { 1148 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1149 return; 1150 } 1151 1152 /* set original buffer for this io */ 1153 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1154 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1155 /* disable bouncing buffer for this io */ 1156 bdev_io->internal.orig_iovcnt = 0; 1157 bdev_io->internal.orig_iovs = NULL; 1158 1159 _bdev_io_push_bounce_md_buffer(bdev_io); 1160 } 1161 1162 static inline void 1163 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1164 { 1165 bdev_io->internal.data_transfer_cpl = cpl_cb; 1166 1167 /* if this is read path, copy data from bounce buffer to original buffer */ 1168 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1169 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1170 _copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1171 bdev_io->internal.orig_iovcnt, 1172 bdev_io->internal.bounce_iov.iov_base, 1173 bdev_io->internal.bounce_iov.iov_len); 1174 } 1175 1176 _bdev_io_push_bounce_data_buffer_done(bdev_io, 0); 1177 } 1178 1179 static void 1180 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1181 { 1182 struct spdk_bdev *bdev = bdev_io->bdev; 1183 struct spdk_mempool *pool; 1184 bdev_io_stailq_t *stailq; 1185 struct spdk_bdev_mgmt_channel *mgmt_ch; 1186 uint64_t alignment, md_len; 1187 void *buf; 1188 1189 alignment = spdk_bdev_get_buf_align(bdev); 1190 md_len = spdk_bdev_is_md_separate(bdev) ? 
bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1191 1192 if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + 1193 SPDK_BDEV_POOL_ALIGNMENT) { 1194 SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n", 1195 len + alignment); 1196 bdev_io_get_buf_complete(bdev_io, false); 1197 return; 1198 } 1199 1200 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1201 1202 bdev_io->internal.buf_len = len; 1203 1204 if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 1205 SPDK_BDEV_POOL_ALIGNMENT) { 1206 pool = g_bdev_mgr.buf_small_pool; 1207 stailq = &mgmt_ch->need_buf_small; 1208 } else { 1209 pool = g_bdev_mgr.buf_large_pool; 1210 stailq = &mgmt_ch->need_buf_large; 1211 } 1212 1213 buf = spdk_mempool_get(pool); 1214 if (!buf) { 1215 STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link); 1216 } else { 1217 _bdev_io_set_buf(bdev_io, buf, len); 1218 } 1219 } 1220 1221 void 1222 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1223 { 1224 struct spdk_bdev *bdev = bdev_io->bdev; 1225 uint64_t alignment; 1226 1227 assert(cb != NULL); 1228 bdev_io->internal.get_buf_cb = cb; 1229 1230 alignment = spdk_bdev_get_buf_align(bdev); 1231 1232 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1233 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1234 /* Buffer already present and aligned */ 1235 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1236 return; 1237 } 1238 1239 bdev_io_get_buf(bdev_io, len); 1240 } 1241 1242 void 1243 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1244 { 1245 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1246 1247 assert(cb != NULL); 1248 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1249 bdev_io->internal.get_aux_buf_cb = cb; 1250 bdev_io_get_buf(bdev_io, len); 1251 } 1252 1253 static int 1254 bdev_module_get_max_ctx_size(void) 1255 { 1256 struct spdk_bdev_module *bdev_module; 1257 int max_bdev_module_size = 0; 1258 1259 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1260 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1261 max_bdev_module_size = bdev_module->get_ctx_size(); 1262 } 1263 } 1264 1265 return max_bdev_module_size; 1266 } 1267 1268 static void 1269 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1270 { 1271 int i; 1272 struct spdk_bdev_qos *qos = bdev->internal.qos; 1273 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1274 1275 if (!qos) { 1276 return; 1277 } 1278 1279 spdk_bdev_get_qos_rate_limits(bdev, limits); 1280 1281 spdk_json_write_object_begin(w); 1282 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1283 1284 spdk_json_write_named_object_begin(w, "params"); 1285 spdk_json_write_named_string(w, "name", bdev->name); 1286 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1287 if (limits[i] > 0) { 1288 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1289 } 1290 } 1291 spdk_json_write_object_end(w); 1292 1293 spdk_json_write_object_end(w); 1294 } 1295 1296 void 1297 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1298 { 1299 struct spdk_bdev_module *bdev_module; 1300 struct spdk_bdev *bdev; 1301 1302 assert(w != NULL); 1303 1304 spdk_json_write_array_begin(w); 1305 1306 spdk_json_write_object_begin(w); 1307 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1308 
spdk_json_write_named_object_begin(w, "params"); 1309 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1310 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1311 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1312 spdk_json_write_object_end(w); 1313 spdk_json_write_object_end(w); 1314 1315 bdev_examine_allowlist_config_json(w); 1316 1317 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1318 if (bdev_module->config_json) { 1319 bdev_module->config_json(w); 1320 } 1321 } 1322 1323 pthread_mutex_lock(&g_bdev_mgr.mutex); 1324 1325 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1326 if (bdev->fn_table->write_config_json) { 1327 bdev->fn_table->write_config_json(bdev, w); 1328 } 1329 1330 bdev_qos_config_json(bdev, w); 1331 } 1332 1333 pthread_mutex_unlock(&g_bdev_mgr.mutex); 1334 1335 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1336 spdk_json_write_object_begin(w); 1337 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1338 spdk_json_write_object_end(w); 1339 1340 spdk_json_write_array_end(w); 1341 } 1342 1343 static int 1344 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1345 { 1346 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1347 struct spdk_bdev_io *bdev_io; 1348 uint32_t i; 1349 1350 STAILQ_INIT(&ch->need_buf_small); 1351 STAILQ_INIT(&ch->need_buf_large); 1352 1353 STAILQ_INIT(&ch->per_thread_cache); 1354 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1355 1356 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1357 ch->per_thread_cache_count = 0; 1358 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1359 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1360 assert(bdev_io != NULL); 1361 ch->per_thread_cache_count++; 1362 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1363 } 1364 1365 TAILQ_INIT(&ch->shared_resources); 1366 TAILQ_INIT(&ch->io_wait_queue); 1367 1368 return 0; 1369 } 1370 1371 static void 1372 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1373 { 1374 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1375 struct spdk_bdev_io *bdev_io; 1376 1377 if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) { 1378 SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n"); 1379 } 1380 1381 if (!TAILQ_EMPTY(&ch->shared_resources)) { 1382 SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n"); 1383 } 1384 1385 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1386 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1387 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1388 ch->per_thread_cache_count--; 1389 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1390 } 1391 1392 assert(ch->per_thread_cache_count == 0); 1393 } 1394 1395 static void 1396 bdev_init_complete(int rc) 1397 { 1398 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1399 void *cb_arg = g_init_cb_arg; 1400 struct spdk_bdev_module *m; 1401 1402 g_bdev_mgr.init_complete = true; 1403 g_init_cb_fn = NULL; 1404 g_init_cb_arg = NULL; 1405 1406 /* 1407 * For modules that need to know when subsystem init is complete, 1408 * inform them now. 
1409 */ 1410 if (rc == 0) { 1411 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1412 if (m->init_complete) { 1413 m->init_complete(); 1414 } 1415 } 1416 } 1417 1418 cb_fn(cb_arg, rc); 1419 } 1420 1421 static bool 1422 bdev_module_all_actions_completed(void) 1423 { 1424 struct spdk_bdev_module *m; 1425 1426 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1427 if (m->internal.action_in_progress > 0) { 1428 return false; 1429 } 1430 } 1431 return true; 1432 } 1433 1434 static void 1435 bdev_module_action_complete(void) 1436 { 1437 /* 1438 * Don't finish bdev subsystem initialization if 1439 * module pre-initialization is still in progress, or 1440 * the subsystem been already initialized. 1441 */ 1442 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 1443 return; 1444 } 1445 1446 /* 1447 * Check all bdev modules for inits/examinations in progress. If any 1448 * exist, return immediately since we cannot finish bdev subsystem 1449 * initialization until all are completed. 1450 */ 1451 if (!bdev_module_all_actions_completed()) { 1452 return; 1453 } 1454 1455 /* 1456 * Modules already finished initialization - now that all 1457 * the bdev modules have finished their asynchronous I/O 1458 * processing, the entire bdev layer can be marked as complete. 1459 */ 1460 bdev_init_complete(0); 1461 } 1462 1463 static void 1464 bdev_module_action_done(struct spdk_bdev_module *module) 1465 { 1466 assert(module->internal.action_in_progress > 0); 1467 module->internal.action_in_progress--; 1468 bdev_module_action_complete(); 1469 } 1470 1471 void 1472 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1473 { 1474 bdev_module_action_done(module); 1475 } 1476 1477 void 1478 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1479 { 1480 bdev_module_action_done(module); 1481 } 1482 1483 /** The last initialized bdev module */ 1484 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1485 1486 static void 1487 bdev_init_failed(void *cb_arg) 1488 { 1489 struct spdk_bdev_module *module = cb_arg; 1490 1491 module->internal.action_in_progress--; 1492 bdev_init_complete(-1); 1493 } 1494 1495 static int 1496 bdev_modules_init(void) 1497 { 1498 struct spdk_bdev_module *module; 1499 int rc = 0; 1500 1501 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1502 g_resume_bdev_module = module; 1503 if (module->async_init) { 1504 module->internal.action_in_progress = 1; 1505 } 1506 rc = module->module_init(); 1507 if (rc != 0) { 1508 /* Bump action_in_progress to prevent other modules from completion of modules_init 1509 * Send message to defer application shutdown until resources are cleaned up */ 1510 module->internal.action_in_progress = 1; 1511 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1512 return rc; 1513 } 1514 } 1515 1516 g_resume_bdev_module = NULL; 1517 return 0; 1518 } 1519 1520 void 1521 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1522 { 1523 int cache_size; 1524 int rc = 0; 1525 char mempool_name[32]; 1526 1527 assert(cb_fn != NULL); 1528 1529 g_init_cb_fn = cb_fn; 1530 g_init_cb_arg = cb_arg; 1531 1532 spdk_notify_type_register("bdev_register"); 1533 spdk_notify_type_register("bdev_unregister"); 1534 1535 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1536 1537 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1538 g_bdev_opts.bdev_io_pool_size, 1539 sizeof(struct spdk_bdev_io) + 1540 bdev_module_get_max_ctx_size(), 1541 0, 1542 SPDK_ENV_SOCKET_ID_ANY); 
1543 1544 if (g_bdev_mgr.bdev_io_pool == NULL) { 1545 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 1546 bdev_init_complete(-1); 1547 return; 1548 } 1549 1550 /** 1551 * Ensure no more than half of the total buffers end up local caches, by 1552 * using spdk_env_get_core_count() to determine how many local caches we need 1553 * to account for. 1554 */ 1555 cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count()); 1556 snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid()); 1557 1558 g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name, 1559 g_bdev_opts.small_buf_pool_size, 1560 SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 1561 SPDK_BDEV_POOL_ALIGNMENT, 1562 cache_size, 1563 SPDK_ENV_SOCKET_ID_ANY); 1564 if (!g_bdev_mgr.buf_small_pool) { 1565 SPDK_ERRLOG("create rbuf small pool failed\n"); 1566 bdev_init_complete(-1); 1567 return; 1568 } 1569 1570 cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count()); 1571 snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid()); 1572 1573 g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name, 1574 g_bdev_opts.large_buf_pool_size, 1575 SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + 1576 SPDK_BDEV_POOL_ALIGNMENT, 1577 cache_size, 1578 SPDK_ENV_SOCKET_ID_ANY); 1579 if (!g_bdev_mgr.buf_large_pool) { 1580 SPDK_ERRLOG("create rbuf large pool failed\n"); 1581 bdev_init_complete(-1); 1582 return; 1583 } 1584 1585 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1586 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1587 if (!g_bdev_mgr.zero_buffer) { 1588 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1589 bdev_init_complete(-1); 1590 return; 1591 } 1592 1593 #ifdef SPDK_CONFIG_VTUNE 1594 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1595 #endif 1596 1597 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 1598 bdev_mgmt_channel_destroy, 1599 sizeof(struct spdk_bdev_mgmt_channel), 1600 "bdev_mgr"); 1601 1602 rc = bdev_modules_init(); 1603 g_bdev_mgr.module_init_complete = true; 1604 if (rc != 0) { 1605 SPDK_ERRLOG("bdev modules init failed\n"); 1606 return; 1607 } 1608 1609 bdev_module_action_complete(); 1610 } 1611 1612 static void 1613 bdev_mgr_unregister_cb(void *io_device) 1614 { 1615 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1616 1617 if (g_bdev_mgr.bdev_io_pool) { 1618 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1619 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1620 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1621 g_bdev_opts.bdev_io_pool_size); 1622 } 1623 1624 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1625 } 1626 1627 if (g_bdev_mgr.buf_small_pool) { 1628 if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != g_bdev_opts.small_buf_pool_size) { 1629 SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n", 1630 spdk_mempool_count(g_bdev_mgr.buf_small_pool), 1631 g_bdev_opts.small_buf_pool_size); 1632 assert(false); 1633 } 1634 1635 spdk_mempool_free(g_bdev_mgr.buf_small_pool); 1636 } 1637 1638 if (g_bdev_mgr.buf_large_pool) { 1639 if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != g_bdev_opts.large_buf_pool_size) { 1640 SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n", 1641 spdk_mempool_count(g_bdev_mgr.buf_large_pool), 1642 g_bdev_opts.large_buf_pool_size); 1643 assert(false); 1644 } 1645 1646 spdk_mempool_free(g_bdev_mgr.buf_large_pool); 1647 } 1648 1649 spdk_free(g_bdev_mgr.zero_buffer); 1650 1651 
bdev_examine_allowlist_free(); 1652 1653 cb_fn(g_fini_cb_arg); 1654 g_fini_cb_fn = NULL; 1655 g_fini_cb_arg = NULL; 1656 g_bdev_mgr.init_complete = false; 1657 g_bdev_mgr.module_init_complete = false; 1658 } 1659 1660 static void 1661 bdev_module_fini_iter(void *arg) 1662 { 1663 struct spdk_bdev_module *bdev_module; 1664 1665 /* FIXME: Handling initialization failures is broken now, 1666 * so we won't even try cleaning up after successfully 1667 * initialized modules. if module_init_complete is false, 1668 * just call spdk_bdev_mgr_unregister_cb 1669 */ 1670 if (!g_bdev_mgr.module_init_complete) { 1671 bdev_mgr_unregister_cb(NULL); 1672 return; 1673 } 1674 1675 /* Start iterating from the last touched module */ 1676 if (!g_resume_bdev_module) { 1677 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1678 } else { 1679 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 1680 internal.tailq); 1681 } 1682 1683 while (bdev_module) { 1684 if (bdev_module->async_fini) { 1685 /* Save our place so we can resume later. We must 1686 * save the variable here, before calling module_fini() 1687 * below, because in some cases the module may immediately 1688 * call spdk_bdev_module_fini_done() and re-enter 1689 * this function to continue iterating. */ 1690 g_resume_bdev_module = bdev_module; 1691 } 1692 1693 if (bdev_module->module_fini) { 1694 bdev_module->module_fini(); 1695 } 1696 1697 if (bdev_module->async_fini) { 1698 return; 1699 } 1700 1701 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 1702 internal.tailq); 1703 } 1704 1705 g_resume_bdev_module = NULL; 1706 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 1707 } 1708 1709 void 1710 spdk_bdev_module_fini_done(void) 1711 { 1712 if (spdk_get_thread() != g_fini_thread) { 1713 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 1714 } else { 1715 bdev_module_fini_iter(NULL); 1716 } 1717 } 1718 1719 static void 1720 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 1721 { 1722 struct spdk_bdev *bdev = cb_arg; 1723 1724 if (bdeverrno && bdev) { 1725 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 1726 bdev->name); 1727 1728 /* 1729 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 1730 * bdev; try to continue by manually removing this bdev from the list and continue 1731 * with the next bdev in the list. 1732 */ 1733 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 1734 } 1735 1736 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 1737 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 1738 /* 1739 * Bdev module finish need to be deferred as we might be in the middle of some context 1740 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 1741 * after returning. 1742 */ 1743 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 1744 return; 1745 } 1746 1747 /* 1748 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 1749 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 1750 * to detect clean shutdown as opposed to run-time hot removal of the underlying 1751 * base bdevs. 1752 * 1753 * Also, walk the list in the reverse order. 
1754 */ 1755 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1756 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1757 if (bdev->internal.claim_module != NULL) { 1758 SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n", 1759 bdev->name, bdev->internal.claim_module->name); 1760 continue; 1761 } 1762 1763 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 1764 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1765 return; 1766 } 1767 1768 /* 1769 * If any bdev fails to unclaim underlying bdev properly, we may face the 1770 * case of bdev list consisting of claimed bdevs only (if claims are managed 1771 * correctly, this would mean there's a loop in the claims graph which is 1772 * clearly impossible). Warn and unregister last bdev on the list then. 1773 */ 1774 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1775 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1776 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1777 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1778 return; 1779 } 1780 } 1781 1782 static void 1783 bdev_module_fini_start_iter(void *arg) 1784 { 1785 struct spdk_bdev_module *bdev_module; 1786 1787 if (!g_resume_bdev_module) { 1788 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1789 } else { 1790 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 1791 } 1792 1793 while (bdev_module) { 1794 if (bdev_module->async_fini_start) { 1795 /* Save our place so we can resume later. We must 1796 * save the variable here, before calling fini_start() 1797 * below, because in some cases the module may immediately 1798 * call spdk_bdev_module_fini_start_done() and re-enter 1799 * this function to continue iterating. */ 1800 g_resume_bdev_module = bdev_module; 1801 } 1802 1803 if (bdev_module->fini_start) { 1804 bdev_module->fini_start(); 1805 } 1806 1807 if (bdev_module->async_fini_start) { 1808 return; 1809 } 1810 1811 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1812 } 1813 1814 g_resume_bdev_module = NULL; 1815 1816 bdev_finish_unregister_bdevs_iter(NULL, 0); 1817 } 1818 1819 void 1820 spdk_bdev_module_fini_start_done(void) 1821 { 1822 if (spdk_get_thread() != g_fini_thread) { 1823 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1824 } else { 1825 bdev_module_fini_start_iter(NULL); 1826 } 1827 } 1828 1829 void 1830 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1831 { 1832 assert(cb_fn != NULL); 1833 1834 g_fini_thread = spdk_get_thread(); 1835 1836 g_fini_cb_fn = cb_fn; 1837 g_fini_cb_arg = cb_arg; 1838 1839 bdev_module_fini_start_iter(NULL); 1840 } 1841 1842 struct spdk_bdev_io * 1843 bdev_channel_get_io(struct spdk_bdev_channel *channel) 1844 { 1845 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 1846 struct spdk_bdev_io *bdev_io; 1847 1848 if (ch->per_thread_cache_count > 0) { 1849 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1850 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1851 ch->per_thread_cache_count--; 1852 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 1853 /* 1854 * Don't try to look for bdev_ios in the global pool if there are 1855 * waiters on bdev_ios - we don't want this caller to jump the line. 
1856 */ 1857 bdev_io = NULL; 1858 } else { 1859 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1860 } 1861 1862 return bdev_io; 1863 } 1864 1865 void 1866 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 1867 { 1868 struct spdk_bdev_mgmt_channel *ch; 1869 1870 assert(bdev_io != NULL); 1871 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 1872 1873 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1874 1875 if (bdev_io->internal.buf != NULL) { 1876 bdev_io_put_buf(bdev_io); 1877 } 1878 1879 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 1880 ch->per_thread_cache_count++; 1881 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1882 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 1883 struct spdk_bdev_io_wait_entry *entry; 1884 1885 entry = TAILQ_FIRST(&ch->io_wait_queue); 1886 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 1887 entry->cb_fn(entry->cb_arg); 1888 } 1889 } else { 1890 /* We should never have a full cache with entries on the io wait queue. */ 1891 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 1892 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1893 } 1894 } 1895 1896 static bool 1897 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 1898 { 1899 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 1900 1901 switch (limit) { 1902 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1903 return true; 1904 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1905 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1906 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1907 return false; 1908 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 1909 default: 1910 return false; 1911 } 1912 } 1913 1914 static bool 1915 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 1916 { 1917 switch (bdev_io->type) { 1918 case SPDK_BDEV_IO_TYPE_NVME_IO: 1919 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1920 case SPDK_BDEV_IO_TYPE_READ: 1921 case SPDK_BDEV_IO_TYPE_WRITE: 1922 return true; 1923 case SPDK_BDEV_IO_TYPE_ZCOPY: 1924 if (bdev_io->u.bdev.zcopy.start) { 1925 return true; 1926 } else { 1927 return false; 1928 } 1929 default: 1930 return false; 1931 } 1932 } 1933 1934 static bool 1935 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 1936 { 1937 switch (bdev_io->type) { 1938 case SPDK_BDEV_IO_TYPE_NVME_IO: 1939 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1940 /* Bit 1 (0x2) set for read operation */ 1941 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 1942 return true; 1943 } else { 1944 return false; 1945 } 1946 case SPDK_BDEV_IO_TYPE_READ: 1947 return true; 1948 case SPDK_BDEV_IO_TYPE_ZCOPY: 1949 /* Populate to read from disk */ 1950 if (bdev_io->u.bdev.zcopy.populate) { 1951 return true; 1952 } else { 1953 return false; 1954 } 1955 default: 1956 return false; 1957 } 1958 } 1959 1960 static uint64_t 1961 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 1962 { 1963 struct spdk_bdev *bdev = bdev_io->bdev; 1964 1965 switch (bdev_io->type) { 1966 case SPDK_BDEV_IO_TYPE_NVME_IO: 1967 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1968 return bdev_io->u.nvme_passthru.nbytes; 1969 case SPDK_BDEV_IO_TYPE_READ: 1970 case SPDK_BDEV_IO_TYPE_WRITE: 1971 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 1972 case SPDK_BDEV_IO_TYPE_ZCOPY: 1973 /* Track the data in the start phase only */ 1974 if (bdev_io->u.bdev.zcopy.start) { 1975 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 1976 } else { 1977 return 0; 1978 } 1979 default: 1980 return 0; 1981 } 1982 } 1983 1984 static bool 1985 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 
1986 { 1987 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 1988 return true; 1989 } else { 1990 return false; 1991 } 1992 } 1993 1994 static bool 1995 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1996 { 1997 if (bdev_is_read_io(io) == false) { 1998 return false; 1999 } 2000 2001 return bdev_qos_rw_queue_io(limit, io); 2002 } 2003 2004 static bool 2005 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2006 { 2007 if (bdev_is_read_io(io) == true) { 2008 return false; 2009 } 2010 2011 return bdev_qos_rw_queue_io(limit, io); 2012 } 2013 2014 static void 2015 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2016 { 2017 limit->remaining_this_timeslice--; 2018 } 2019 2020 static void 2021 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2022 { 2023 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2024 } 2025 2026 static void 2027 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2028 { 2029 if (bdev_is_read_io(io) == false) { 2030 return; 2031 } 2032 2033 return bdev_qos_rw_bps_update_quota(limit, io); 2034 } 2035 2036 static void 2037 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2038 { 2039 if (bdev_is_read_io(io) == true) { 2040 return; 2041 } 2042 2043 return bdev_qos_rw_bps_update_quota(limit, io); 2044 } 2045 2046 static void 2047 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2048 { 2049 int i; 2050 2051 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2052 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2053 qos->rate_limits[i].queue_io = NULL; 2054 qos->rate_limits[i].update_quota = NULL; 2055 continue; 2056 } 2057 2058 switch (i) { 2059 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2060 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2061 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2062 break; 2063 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2064 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2065 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2066 break; 2067 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2068 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2069 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2070 break; 2071 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2072 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2073 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2074 break; 2075 default: 2076 break; 2077 } 2078 } 2079 } 2080 2081 static void 2082 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2083 struct spdk_bdev_io *bdev_io, 2084 enum spdk_bdev_io_status status) 2085 { 2086 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2087 2088 bdev_io->internal.in_submit_request = true; 2089 bdev_ch->io_outstanding++; 2090 shared_resource->io_outstanding++; 2091 spdk_bdev_io_complete(bdev_io, status); 2092 bdev_io->internal.in_submit_request = false; 2093 } 2094 2095 static inline void 2096 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2097 { 2098 struct spdk_bdev *bdev = bdev_io->bdev; 2099 struct spdk_io_channel *ch = bdev_ch->channel; 2100 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2101 2102 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2103 struct spdk_bdev_mgmt_channel *mgmt_channel = 
shared_resource->mgmt_ch; 2104 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2105 2106 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2107 bdev_abort_buf_io(&mgmt_channel->need_buf_small, bio_to_abort) || 2108 bdev_abort_buf_io(&mgmt_channel->need_buf_large, bio_to_abort)) { 2109 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2110 SPDK_BDEV_IO_STATUS_SUCCESS); 2111 return; 2112 } 2113 } 2114 2115 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2116 bdev_ch->io_outstanding++; 2117 shared_resource->io_outstanding++; 2118 bdev_io->internal.in_submit_request = true; 2119 bdev->fn_table->submit_request(ch, bdev_io); 2120 bdev_io->internal.in_submit_request = false; 2121 } else { 2122 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2123 } 2124 } 2125 2126 static int 2127 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2128 { 2129 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2130 int i, submitted_ios = 0; 2131 2132 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2133 if (bdev_qos_io_to_limit(bdev_io) == true) { 2134 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2135 if (!qos->rate_limits[i].queue_io) { 2136 continue; 2137 } 2138 2139 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2140 bdev_io) == true) { 2141 return submitted_ios; 2142 } 2143 } 2144 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2145 if (!qos->rate_limits[i].update_quota) { 2146 continue; 2147 } 2148 2149 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2150 } 2151 } 2152 2153 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2154 bdev_io_do_submit(ch, bdev_io); 2155 submitted_ios++; 2156 } 2157 2158 return submitted_ios; 2159 } 2160 2161 static void 2162 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2163 { 2164 int rc; 2165 2166 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2167 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2168 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2169 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2170 &bdev_io->internal.waitq_entry); 2171 if (rc != 0) { 2172 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2173 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2174 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2175 } 2176 } 2177 2178 static bool 2179 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2180 { 2181 uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary; 2182 uint32_t max_size = bdev_io->bdev->max_segment_size; 2183 int max_segs = bdev_io->bdev->max_num_segments; 2184 2185 io_boundary = bdev_io->bdev->split_on_optimal_io_boundary ? io_boundary : 0; 2186 2187 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2188 return false; 2189 } 2190 2191 if (io_boundary) { 2192 uint64_t start_stripe, end_stripe; 2193 2194 start_stripe = bdev_io->u.bdev.offset_blocks; 2195 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2196 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
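		 * For example, with split_on_optimal_io_boundary set and io_boundary = 8,
		 * an I/O at offset_blocks 6 spanning 4 blocks covers blocks 6..9, i.e.
		 * start_stripe = 0 and end_stripe = 1, so it crosses a boundary and is split.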
*/ 2197 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2198 start_stripe >>= spdk_u32log2(io_boundary); 2199 end_stripe >>= spdk_u32log2(io_boundary); 2200 } else { 2201 start_stripe /= io_boundary; 2202 end_stripe /= io_boundary; 2203 } 2204 2205 if (start_stripe != end_stripe) { 2206 return true; 2207 } 2208 } 2209 2210 if (max_segs) { 2211 if (bdev_io->u.bdev.iovcnt > max_segs) { 2212 return true; 2213 } 2214 } 2215 2216 if (max_size) { 2217 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2218 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2219 return true; 2220 } 2221 } 2222 } 2223 2224 return false; 2225 } 2226 2227 static bool 2228 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2229 { 2230 uint32_t num_unmap_segments; 2231 2232 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2233 return false; 2234 } 2235 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2236 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2237 return true; 2238 } 2239 2240 return false; 2241 } 2242 2243 static bool 2244 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2245 { 2246 if (!bdev_io->bdev->max_write_zeroes) { 2247 return false; 2248 } 2249 2250 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2251 return true; 2252 } 2253 2254 return false; 2255 } 2256 2257 static bool 2258 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2259 { 2260 switch (bdev_io->type) { 2261 case SPDK_BDEV_IO_TYPE_READ: 2262 case SPDK_BDEV_IO_TYPE_WRITE: 2263 return bdev_rw_should_split(bdev_io); 2264 case SPDK_BDEV_IO_TYPE_UNMAP: 2265 return bdev_unmap_should_split(bdev_io); 2266 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2267 return bdev_write_zeroes_should_split(bdev_io); 2268 default: 2269 return false; 2270 } 2271 } 2272 2273 static uint32_t 2274 _to_next_boundary(uint64_t offset, uint32_t boundary) 2275 { 2276 return (boundary - (offset % boundary)); 2277 } 2278 2279 static void 2280 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2281 2282 static void 2283 _bdev_rw_split(void *_bdev_io); 2284 2285 static void 2286 bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2287 2288 static void 2289 _bdev_unmap_split(void *_bdev_io) 2290 { 2291 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2292 } 2293 2294 static void 2295 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2296 2297 static void 2298 _bdev_write_zeroes_split(void *_bdev_io) 2299 { 2300 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2301 } 2302 2303 static int 2304 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2305 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2306 { 2307 int rc; 2308 uint64_t current_offset, current_remaining; 2309 spdk_bdev_io_wait_cb io_wait_fn; 2310 2311 current_offset = *offset; 2312 current_remaining = *remaining; 2313 2314 bdev_io->u.bdev.split_outstanding++; 2315 2316 io_wait_fn = _bdev_rw_split; 2317 switch (bdev_io->type) { 2318 case SPDK_BDEV_IO_TYPE_READ: 2319 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2320 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2321 iov, iovcnt, md_buf, current_offset, 2322 num_blocks, 2323 bdev_io_split_done, bdev_io, 2324 bdev_io->internal.ext_opts); 2325 break; 2326 case SPDK_BDEV_IO_TYPE_WRITE: 2327 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2328 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2329 iov, iovcnt, md_buf, current_offset, 2330 
num_blocks, 2331 bdev_io_split_done, bdev_io, 2332 bdev_io->internal.ext_opts); 2333 break; 2334 case SPDK_BDEV_IO_TYPE_UNMAP: 2335 io_wait_fn = _bdev_unmap_split; 2336 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2337 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2338 current_offset, num_blocks, 2339 bdev_io_split_done, bdev_io); 2340 break; 2341 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2342 io_wait_fn = _bdev_write_zeroes_split; 2343 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2344 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2345 current_offset, num_blocks, 2346 bdev_io_split_done, bdev_io); 2347 break; 2348 default: 2349 assert(false); 2350 rc = -EINVAL; 2351 break; 2352 } 2353 2354 if (rc == 0) { 2355 current_offset += num_blocks; 2356 current_remaining -= num_blocks; 2357 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2358 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2359 *offset = current_offset; 2360 *remaining = current_remaining; 2361 } else { 2362 bdev_io->u.bdev.split_outstanding--; 2363 if (rc == -ENOMEM) { 2364 if (bdev_io->u.bdev.split_outstanding == 0) { 2365 /* No I/O is outstanding. Hence we should wait here. */ 2366 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2367 } 2368 } else { 2369 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2370 if (bdev_io->u.bdev.split_outstanding == 0) { 2371 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2372 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2373 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2374 } 2375 } 2376 } 2377 2378 return rc; 2379 } 2380 2381 static void 2382 _bdev_rw_split(void *_bdev_io) 2383 { 2384 struct iovec *parent_iov, *iov; 2385 struct spdk_bdev_io *bdev_io = _bdev_io; 2386 struct spdk_bdev *bdev = bdev_io->bdev; 2387 uint64_t parent_offset, current_offset, remaining; 2388 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2389 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2390 uint32_t iovcnt, iov_len, child_iovsize; 2391 uint32_t blocklen = bdev->blocklen; 2392 uint32_t io_boundary = bdev->optimal_io_boundary; 2393 uint32_t max_segment_size = bdev->max_segment_size; 2394 uint32_t max_child_iovcnt = bdev->max_num_segments; 2395 void *md_buf = NULL; 2396 int rc; 2397 2398 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2399 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, BDEV_IO_NUM_CHILD_IOV) : 2400 BDEV_IO_NUM_CHILD_IOV; 2401 io_boundary = bdev->split_on_optimal_io_boundary ? 
io_boundary : UINT32_MAX; 2402 2403 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2404 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2405 parent_offset = bdev_io->u.bdev.offset_blocks; 2406 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2407 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2408 2409 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2410 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2411 if (parent_iov_offset < parent_iov->iov_len) { 2412 break; 2413 } 2414 parent_iov_offset -= parent_iov->iov_len; 2415 } 2416 2417 child_iovcnt = 0; 2418 while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 2419 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2420 to_next_boundary = spdk_min(remaining, to_next_boundary); 2421 to_next_boundary_bytes = to_next_boundary * blocklen; 2422 2423 iov = &bdev_io->child_iov[child_iovcnt]; 2424 iovcnt = 0; 2425 2426 if (bdev_io->u.bdev.md_buf) { 2427 md_buf = (char *)bdev_io->u.bdev.md_buf + 2428 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2429 } 2430 2431 child_iovsize = spdk_min(BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2432 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2433 iovcnt < child_iovsize) { 2434 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2435 iov_len = parent_iov->iov_len - parent_iov_offset; 2436 2437 iov_len = spdk_min(iov_len, max_segment_size); 2438 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2439 to_next_boundary_bytes -= iov_len; 2440 2441 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2442 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2443 2444 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2445 parent_iov_offset += iov_len; 2446 } else { 2447 parent_iovpos++; 2448 parent_iov_offset = 0; 2449 } 2450 child_iovcnt++; 2451 iovcnt++; 2452 } 2453 2454 if (to_next_boundary_bytes > 0) { 2455 /* We had to stop this child I/O early because we ran out of 2456 * child_iov space or were limited by max_num_segments. 2457 * Ensure the iovs to be aligned with block size and 2458 * then adjust to_next_boundary before starting the 2459 * child I/O. 2460 */ 2461 assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV || 2462 iovcnt == child_iovsize); 2463 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2464 if (to_last_block_bytes != 0) { 2465 uint32_t child_iovpos = child_iovcnt - 1; 2466 /* don't decrease child_iovcnt when it equals to BDEV_IO_NUM_CHILD_IOV 2467 * so the loop will naturally end 2468 */ 2469 2470 to_last_block_bytes = blocklen - to_last_block_bytes; 2471 to_next_boundary_bytes += to_last_block_bytes; 2472 while (to_last_block_bytes > 0 && iovcnt > 0) { 2473 iov_len = spdk_min(to_last_block_bytes, 2474 bdev_io->child_iov[child_iovpos].iov_len); 2475 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2476 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2477 child_iovpos--; 2478 if (--iovcnt == 0) { 2479 /* If the child IO is less than a block size just return. 2480 * If the first child IO of any split round is less than 2481 * a block size, an error exit. 
							 */
							if (bdev_io->u.bdev.split_outstanding == 0) {
								SPDK_ERRLOG("The first child io was less than a block size\n");
								bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
								spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
								TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
								bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
							}

							return;
						}
					}

					to_last_block_bytes -= iov_len;

					if (parent_iov_offset == 0) {
						parent_iovpos--;
						parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
					}
					parent_iov_offset -= iov_len;
				}

				assert(to_last_block_bytes == 0);
			}
			to_next_boundary -= to_next_boundary_bytes / blocklen;
		}

		rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
					  &current_offset, &remaining);
		if (spdk_unlikely(rc)) {
			return;
		}
	}
}

static void
bdev_unmap_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
	uint32_t num_children_reqs = 0;
	int rc;

	offset = bdev_io->u.bdev.split_current_offset_blocks;
	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
	max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;

	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
		unmap_blocks = spdk_min(remaining, max_unmap_blocks);

		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
					  &offset, &remaining);
		if (spdk_likely(rc == 0)) {
			num_children_reqs++;
		} else {
			return;
		}
	}
}

static void
bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset, write_zeroes_blocks, remaining;
	uint32_t num_children_reqs = 0;
	int rc;

	offset = bdev_io->u.bdev.split_current_offset_blocks;
	remaining = bdev_io->u.bdev.split_remaining_num_blocks;

	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
		write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);

		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
					  &offset, &remaining);
		if (spdk_likely(rc == 0)) {
			num_children_reqs++;
		} else {
			return;
		}
	}
}

static void
parent_bdev_io_complete(void *ctx, int rc)
{
	struct spdk_bdev_io *parent_io = ctx;

	if (rc) {
		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
			       parent_io->internal.caller_ctx);
}

static void
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (!success) {
		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		/* If any child I/O failed, stop further splitting process. */
		parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks;
		parent_io->u.bdev.split_remaining_num_blocks = 0;
	}
	parent_io->u.bdev.split_outstanding--;
	if (parent_io->u.bdev.split_outstanding != 0) {
		return;
	}

	/*
	 * Parent I/O finishes when all blocks are consumed.
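	 * That is, split_remaining_num_blocks has reached zero and the last
	 * outstanding child has completed. If the parent used a bounce buffer
	 * (orig_iovcnt != 0), the data is pushed back to the caller's buffers
	 * first and the parent completes from that callback.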
2597 */ 2598 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2599 assert(parent_io->internal.cb != bdev_io_split_done); 2600 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2601 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2602 2603 if (parent_io->internal.orig_iovcnt != 0) { 2604 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 2605 /* bdev IO will be completed in the callback */ 2606 } else { 2607 parent_bdev_io_complete(parent_io, 0); 2608 } 2609 return; 2610 } 2611 2612 /* 2613 * Continue with the splitting process. This function will complete the parent I/O if the 2614 * splitting is done. 2615 */ 2616 switch (parent_io->type) { 2617 case SPDK_BDEV_IO_TYPE_READ: 2618 case SPDK_BDEV_IO_TYPE_WRITE: 2619 _bdev_rw_split(parent_io); 2620 break; 2621 case SPDK_BDEV_IO_TYPE_UNMAP: 2622 bdev_unmap_split(parent_io); 2623 break; 2624 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2625 bdev_write_zeroes_split(parent_io); 2626 break; 2627 default: 2628 assert(false); 2629 break; 2630 } 2631 } 2632 2633 static void 2634 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success); 2635 2636 static void 2637 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2638 { 2639 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2640 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2641 bdev_io->u.bdev.split_outstanding = 0; 2642 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2643 2644 switch (bdev_io->type) { 2645 case SPDK_BDEV_IO_TYPE_READ: 2646 case SPDK_BDEV_IO_TYPE_WRITE: 2647 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2648 _bdev_rw_split(bdev_io); 2649 } else { 2650 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2651 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2652 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2653 } 2654 break; 2655 case SPDK_BDEV_IO_TYPE_UNMAP: 2656 bdev_unmap_split(bdev_io); 2657 break; 2658 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2659 bdev_write_zeroes_split(bdev_io); 2660 break; 2661 default: 2662 assert(false); 2663 break; 2664 } 2665 } 2666 2667 static void 2668 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2669 { 2670 if (!success) { 2671 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2672 return; 2673 } 2674 2675 _bdev_rw_split(bdev_io); 2676 } 2677 2678 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 2679 * be inlined, at least on some compilers. 
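 * bdev_io_submit() takes its address when forwarding QoS-managed I/O to the
 * QoS thread via spdk_thread_send_msg(), which is where the function-pointer
 * use comes from.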
2680 */ 2681 static inline void 2682 _bdev_io_submit(void *ctx) 2683 { 2684 struct spdk_bdev_io *bdev_io = ctx; 2685 struct spdk_bdev *bdev = bdev_io->bdev; 2686 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2687 uint64_t tsc; 2688 2689 tsc = spdk_get_ticks(); 2690 bdev_io->internal.submit_tsc = tsc; 2691 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, bdev_io->type, 2692 bdev_io->internal.caller_ctx, bdev_io->u.bdev.offset_blocks, 2693 bdev_io->u.bdev.num_blocks); 2694 2695 if (spdk_likely(bdev_ch->flags == 0)) { 2696 bdev_io_do_submit(bdev_ch, bdev_io); 2697 return; 2698 } 2699 2700 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2701 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2702 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2703 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2704 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2705 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2706 } else { 2707 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2708 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2709 } 2710 } else { 2711 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2712 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2713 } 2714 } 2715 2716 bool 2717 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2718 2719 bool 2720 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2721 { 2722 if (range1->length == 0 || range2->length == 0) { 2723 return false; 2724 } 2725 2726 if (range1->offset + range1->length <= range2->offset) { 2727 return false; 2728 } 2729 2730 if (range2->offset + range2->length <= range1->offset) { 2731 return false; 2732 } 2733 2734 return true; 2735 } 2736 2737 static bool 2738 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2739 { 2740 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2741 struct lba_range r; 2742 2743 switch (bdev_io->type) { 2744 case SPDK_BDEV_IO_TYPE_NVME_IO: 2745 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2746 /* Don't try to decode the NVMe command - just assume worst-case and that 2747 * it overlaps a locked range. 2748 */ 2749 return true; 2750 case SPDK_BDEV_IO_TYPE_WRITE: 2751 case SPDK_BDEV_IO_TYPE_UNMAP: 2752 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2753 case SPDK_BDEV_IO_TYPE_ZCOPY: 2754 r.offset = bdev_io->u.bdev.offset_blocks; 2755 r.length = bdev_io->u.bdev.num_blocks; 2756 if (!bdev_lba_range_overlapped(range, &r)) { 2757 /* This I/O doesn't overlap the specified LBA range. */ 2758 return false; 2759 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2760 /* This I/O overlaps, but the I/O is on the same channel that locked this 2761 * range, and the caller_ctx is the same as the locked_ctx. This means 2762 * that this I/O is associated with the lock, and is allowed to execute. 
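		 * Overlapping I/O from any other channel or context is instead held on
		 * the channel's io_locked list by bdev_io_submit() until the range is
		 * unlocked.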
2763 */ 2764 return false; 2765 } else { 2766 return true; 2767 } 2768 default: 2769 return false; 2770 } 2771 } 2772 2773 void 2774 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2775 { 2776 struct spdk_bdev *bdev = bdev_io->bdev; 2777 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2778 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2779 2780 assert(thread != NULL); 2781 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2782 2783 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2784 struct lba_range *range; 2785 2786 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2787 if (bdev_io_range_is_locked(bdev_io, range)) { 2788 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2789 return; 2790 } 2791 } 2792 } 2793 2794 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2795 2796 if (bdev_io_should_split(bdev_io)) { 2797 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2798 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2799 (uintptr_t)bdev_io, bdev_io->type, bdev_io->internal.caller_ctx, 2800 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks); 2801 bdev_io_split(NULL, bdev_io); 2802 return; 2803 } 2804 2805 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2806 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2807 _bdev_io_submit(bdev_io); 2808 } else { 2809 bdev_io->internal.io_submit_ch = ch; 2810 bdev_io->internal.ch = bdev->internal.qos->ch; 2811 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2812 } 2813 } else { 2814 _bdev_io_submit(bdev_io); 2815 } 2816 } 2817 2818 static void 2819 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 2820 { 2821 struct spdk_bdev *bdev = bdev_io->bdev; 2822 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2823 struct spdk_io_channel *ch = bdev_ch->channel; 2824 2825 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2826 2827 bdev_io->internal.in_submit_request = true; 2828 bdev->fn_table->submit_request(ch, bdev_io); 2829 bdev_io->internal.in_submit_request = false; 2830 } 2831 2832 void 2833 bdev_io_init(struct spdk_bdev_io *bdev_io, 2834 struct spdk_bdev *bdev, void *cb_arg, 2835 spdk_bdev_io_completion_cb cb) 2836 { 2837 bdev_io->bdev = bdev; 2838 bdev_io->internal.caller_ctx = cb_arg; 2839 bdev_io->internal.cb = cb; 2840 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 2841 bdev_io->internal.in_submit_request = false; 2842 bdev_io->internal.buf = NULL; 2843 bdev_io->internal.io_submit_ch = NULL; 2844 bdev_io->internal.orig_iovs = NULL; 2845 bdev_io->internal.orig_iovcnt = 0; 2846 bdev_io->internal.orig_md_iov.iov_base = NULL; 2847 bdev_io->internal.error.nvme.cdw0 = 0; 2848 bdev_io->num_retries = 0; 2849 bdev_io->internal.get_buf_cb = NULL; 2850 bdev_io->internal.get_aux_buf_cb = NULL; 2851 bdev_io->internal.ext_opts = NULL; 2852 bdev_io->internal.data_transfer_cpl = NULL; 2853 } 2854 2855 static bool 2856 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2857 { 2858 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 2859 } 2860 2861 bool 2862 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2863 { 2864 bool supported; 2865 2866 supported = bdev_io_type_supported(bdev, io_type); 2867 2868 if (!supported) { 2869 switch (io_type) { 2870 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2871 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ 2872 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 2873 break; 2874 default: 2875 break; 2876 } 2877 } 2878 2879 return supported; 2880 } 2881 2882 int 2883 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2884 { 2885 if (bdev->fn_table->dump_info_json) { 2886 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 2887 } 2888 2889 return 0; 2890 } 2891 2892 static void 2893 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 2894 { 2895 uint32_t max_per_timeslice = 0; 2896 int i; 2897 2898 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2899 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2900 qos->rate_limits[i].max_per_timeslice = 0; 2901 continue; 2902 } 2903 2904 max_per_timeslice = qos->rate_limits[i].limit * 2905 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 2906 2907 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 2908 qos->rate_limits[i].min_per_timeslice); 2909 2910 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 2911 } 2912 2913 bdev_qos_set_ops(qos); 2914 } 2915 2916 static int 2917 bdev_channel_poll_qos(void *arg) 2918 { 2919 struct spdk_bdev_qos *qos = arg; 2920 uint64_t now = spdk_get_ticks(); 2921 int i; 2922 2923 if (now < (qos->last_timeslice + qos->timeslice_size)) { 2924 /* We received our callback earlier than expected - return 2925 * immediately and wait to do accounting until at least one 2926 * timeslice has actually expired. This should never happen 2927 * with a well-behaved timer implementation. 2928 */ 2929 return SPDK_POLLER_IDLE; 2930 } 2931 2932 /* Reset for next round of rate limiting */ 2933 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2934 /* We may have allowed the IOs or bytes to slightly overrun in the last 2935 * timeslice. remaining_this_timeslice is signed, so if it's negative 2936 * here, we'll account for the overrun so that the next timeslice will 2937 * be appropriately reduced. 2938 */ 2939 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 2940 qos->rate_limits[i].remaining_this_timeslice = 0; 2941 } 2942 } 2943 2944 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 2945 qos->last_timeslice += qos->timeslice_size; 2946 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2947 qos->rate_limits[i].remaining_this_timeslice += 2948 qos->rate_limits[i].max_per_timeslice; 2949 } 2950 } 2951 2952 return bdev_qos_io_submit(qos->ch, qos); 2953 } 2954 2955 static void 2956 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 2957 { 2958 struct spdk_bdev_shared_resource *shared_resource; 2959 struct lba_range *range; 2960 2961 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 2962 range = TAILQ_FIRST(&ch->locked_ranges); 2963 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 2964 free(range); 2965 } 2966 2967 spdk_put_io_channel(ch->channel); 2968 2969 shared_resource = ch->shared_resource; 2970 2971 assert(TAILQ_EMPTY(&ch->io_locked)); 2972 assert(TAILQ_EMPTY(&ch->io_submitted)); 2973 assert(ch->io_outstanding == 0); 2974 assert(shared_resource->ref > 0); 2975 shared_resource->ref--; 2976 if (shared_resource->ref == 0) { 2977 assert(shared_resource->io_outstanding == 0); 2978 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 2979 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 2980 free(shared_resource); 2981 } 2982 } 2983 2984 /* Caller must hold bdev->internal.mutex. 
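 * bdev_enable_qos() makes the first channel created while QoS is configured the
 * QoS channel: it takes an extra reference on the bdev's io_device, registers
 * the QoS poller on that channel's thread and, from then on, bdev_io_submit()
 * funnels all QoS-managed I/O to that thread.
 * The per-timeslice quota is derived from the configured limit; for example,
 * with rw_ios_per_sec = 10000 and a 1000 usec timeslice:
 *   max_per_timeslice = 10000 * 1000 / 1000000 = 10 I/O per timeslice
 * (clamped from below by min_per_timeslice).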
*/ 2985 static void 2986 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 2987 { 2988 struct spdk_bdev_qos *qos = bdev->internal.qos; 2989 int i; 2990 2991 /* Rate limiting on this bdev enabled */ 2992 if (qos) { 2993 if (qos->ch == NULL) { 2994 struct spdk_io_channel *io_ch; 2995 2996 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 2997 bdev->name, spdk_get_thread()); 2998 2999 /* No qos channel has been selected, so set one up */ 3000 3001 /* Take another reference to ch */ 3002 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3003 assert(io_ch != NULL); 3004 qos->ch = ch; 3005 3006 qos->thread = spdk_io_channel_get_thread(io_ch); 3007 3008 TAILQ_INIT(&qos->queued); 3009 3010 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3011 if (bdev_qos_is_iops_rate_limit(i) == true) { 3012 qos->rate_limits[i].min_per_timeslice = 3013 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3014 } else { 3015 qos->rate_limits[i].min_per_timeslice = 3016 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3017 } 3018 3019 if (qos->rate_limits[i].limit == 0) { 3020 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3021 } 3022 } 3023 bdev_qos_update_max_quota_per_timeslice(qos); 3024 qos->timeslice_size = 3025 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3026 qos->last_timeslice = spdk_get_ticks(); 3027 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3028 qos, 3029 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3030 } 3031 3032 ch->flags |= BDEV_CH_QOS_ENABLED; 3033 } 3034 } 3035 3036 struct poll_timeout_ctx { 3037 struct spdk_bdev_desc *desc; 3038 uint64_t timeout_in_sec; 3039 spdk_bdev_io_timeout_cb cb_fn; 3040 void *cb_arg; 3041 }; 3042 3043 static void 3044 bdev_desc_free(struct spdk_bdev_desc *desc) 3045 { 3046 pthread_mutex_destroy(&desc->mutex); 3047 free(desc->media_events_buffer); 3048 free(desc); 3049 } 3050 3051 static void 3052 bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status) 3053 { 3054 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3055 struct spdk_bdev_desc *desc = ctx->desc; 3056 3057 free(ctx); 3058 3059 pthread_mutex_lock(&desc->mutex); 3060 desc->refs--; 3061 if (desc->closed == true && desc->refs == 0) { 3062 pthread_mutex_unlock(&desc->mutex); 3063 bdev_desc_free(desc); 3064 return; 3065 } 3066 pthread_mutex_unlock(&desc->mutex); 3067 } 3068 3069 static void 3070 bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i) 3071 { 3072 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3073 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 3074 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 3075 struct spdk_bdev_desc *desc = ctx->desc; 3076 struct spdk_bdev_io *bdev_io; 3077 uint64_t now; 3078 3079 pthread_mutex_lock(&desc->mutex); 3080 if (desc->closed == true) { 3081 pthread_mutex_unlock(&desc->mutex); 3082 spdk_for_each_channel_continue(i, -1); 3083 return; 3084 } 3085 pthread_mutex_unlock(&desc->mutex); 3086 3087 now = spdk_get_ticks(); 3088 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3089 /* Exclude any I/O that are generated via splitting. */ 3090 if (bdev_io->internal.cb == bdev_io_split_done) { 3091 continue; 3092 } 3093 3094 /* Once we find an I/O that has not timed out, we can immediately 3095 * exit the loop. 
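		 * io_submitted is kept in submission order (bdev_io_submit() appends at
		 * the tail), so the remaining entries were submitted no earlier than this
		 * one and are not expected to have timed out yet.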
3096 */ 3097 if (now < (bdev_io->internal.submit_tsc + 3098 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3099 goto end; 3100 } 3101 3102 if (bdev_io->internal.desc == desc) { 3103 ctx->cb_fn(ctx->cb_arg, bdev_io); 3104 } 3105 } 3106 3107 end: 3108 spdk_for_each_channel_continue(i, 0); 3109 } 3110 3111 static int 3112 bdev_poll_timeout_io(void *arg) 3113 { 3114 struct spdk_bdev_desc *desc = arg; 3115 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3116 struct poll_timeout_ctx *ctx; 3117 3118 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3119 if (!ctx) { 3120 SPDK_ERRLOG("failed to allocate memory\n"); 3121 return SPDK_POLLER_BUSY; 3122 } 3123 ctx->desc = desc; 3124 ctx->cb_arg = desc->cb_arg; 3125 ctx->cb_fn = desc->cb_fn; 3126 ctx->timeout_in_sec = desc->timeout_in_sec; 3127 3128 /* Take a ref on the descriptor in case it gets closed while we are checking 3129 * all of the channels. 3130 */ 3131 pthread_mutex_lock(&desc->mutex); 3132 desc->refs++; 3133 pthread_mutex_unlock(&desc->mutex); 3134 3135 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3136 bdev_channel_poll_timeout_io, 3137 ctx, 3138 bdev_channel_poll_timeout_io_done); 3139 3140 return SPDK_POLLER_BUSY; 3141 } 3142 3143 int 3144 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3145 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3146 { 3147 assert(desc->thread == spdk_get_thread()); 3148 3149 spdk_poller_unregister(&desc->io_timeout_poller); 3150 3151 if (timeout_in_sec) { 3152 assert(cb_fn != NULL); 3153 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3154 desc, 3155 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3156 1000); 3157 if (desc->io_timeout_poller == NULL) { 3158 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3159 return -1; 3160 } 3161 } 3162 3163 desc->cb_fn = cb_fn; 3164 desc->cb_arg = cb_arg; 3165 desc->timeout_in_sec = timeout_in_sec; 3166 3167 return 0; 3168 } 3169 3170 static int 3171 bdev_channel_create(void *io_device, void *ctx_buf) 3172 { 3173 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3174 struct spdk_bdev_channel *ch = ctx_buf; 3175 struct spdk_io_channel *mgmt_io_ch; 3176 struct spdk_bdev_mgmt_channel *mgmt_ch; 3177 struct spdk_bdev_shared_resource *shared_resource; 3178 struct lba_range *range; 3179 3180 ch->bdev = bdev; 3181 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3182 if (!ch->channel) { 3183 return -1; 3184 } 3185 3186 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3187 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3188 3189 assert(ch->histogram == NULL); 3190 if (bdev->internal.histogram_enabled) { 3191 ch->histogram = spdk_histogram_data_alloc(); 3192 if (ch->histogram == NULL) { 3193 SPDK_ERRLOG("Could not allocate histogram\n"); 3194 } 3195 } 3196 3197 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3198 if (!mgmt_io_ch) { 3199 spdk_put_io_channel(ch->channel); 3200 return -1; 3201 } 3202 3203 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 3204 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3205 if (shared_resource->shared_ch == ch->channel) { 3206 spdk_put_io_channel(mgmt_io_ch); 3207 shared_resource->ref++; 3208 break; 3209 } 3210 } 3211 3212 if (shared_resource == NULL) { 3213 shared_resource = calloc(1, sizeof(*shared_resource)); 3214 if (shared_resource == NULL) { 3215 spdk_put_io_channel(ch->channel); 3216 spdk_put_io_channel(mgmt_io_ch); 3217 return -1; 3218 } 3219 3220 shared_resource->mgmt_ch = mgmt_ch; 3221 
shared_resource->io_outstanding = 0; 3222 TAILQ_INIT(&shared_resource->nomem_io); 3223 shared_resource->nomem_threshold = 0; 3224 shared_resource->shared_ch = ch->channel; 3225 shared_resource->ref = 1; 3226 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3227 } 3228 3229 memset(&ch->stat, 0, sizeof(ch->stat)); 3230 ch->stat.ticks_rate = spdk_get_ticks_hz(); 3231 ch->io_outstanding = 0; 3232 TAILQ_INIT(&ch->queued_resets); 3233 TAILQ_INIT(&ch->locked_ranges); 3234 ch->flags = 0; 3235 ch->shared_resource = shared_resource; 3236 3237 TAILQ_INIT(&ch->io_submitted); 3238 TAILQ_INIT(&ch->io_locked); 3239 3240 #ifdef SPDK_CONFIG_VTUNE 3241 { 3242 char *name; 3243 __itt_init_ittlib(NULL, 0); 3244 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3245 if (!name) { 3246 bdev_channel_destroy_resource(ch); 3247 return -1; 3248 } 3249 ch->handle = __itt_string_handle_create(name); 3250 free(name); 3251 ch->start_tsc = spdk_get_ticks(); 3252 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3253 memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); 3254 } 3255 #endif 3256 3257 pthread_mutex_lock(&bdev->internal.mutex); 3258 bdev_enable_qos(bdev, ch); 3259 3260 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3261 struct lba_range *new_range; 3262 3263 new_range = calloc(1, sizeof(*new_range)); 3264 if (new_range == NULL) { 3265 pthread_mutex_unlock(&bdev->internal.mutex); 3266 bdev_channel_destroy_resource(ch); 3267 return -1; 3268 } 3269 new_range->length = range->length; 3270 new_range->offset = range->offset; 3271 new_range->locked_ctx = range->locked_ctx; 3272 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3273 } 3274 3275 pthread_mutex_unlock(&bdev->internal.mutex); 3276 3277 return 0; 3278 } 3279 3280 /* 3281 * Abort I/O that are waiting on a data buffer. These types of I/O are 3282 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 3283 */ 3284 static void 3285 bdev_abort_all_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 3286 { 3287 bdev_io_stailq_t tmp; 3288 struct spdk_bdev_io *bdev_io; 3289 3290 STAILQ_INIT(&tmp); 3291 3292 while (!STAILQ_EMPTY(queue)) { 3293 bdev_io = STAILQ_FIRST(queue); 3294 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 3295 if (bdev_io->internal.ch == ch) { 3296 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3297 } else { 3298 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 3299 } 3300 } 3301 3302 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 3303 } 3304 3305 /* 3306 * Abort I/O that are queued waiting for submission. These types of I/O are 3307 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3308 */ 3309 static void 3310 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3311 { 3312 struct spdk_bdev_io *bdev_io, *tmp; 3313 3314 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3315 if (bdev_io->internal.ch == ch) { 3316 TAILQ_REMOVE(queue, bdev_io, internal.link); 3317 /* 3318 * spdk_bdev_io_complete() assumes that the completed I/O had 3319 * been submitted to the bdev module. Since in this case it 3320 * hadn't, bump io_outstanding to account for the decrement 3321 * that spdk_bdev_io_complete() will do. 
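			 * Reset I/O is excluded from this adjustment: it is submitted via
			 * bdev_io_submit_reset() without bumping io_outstanding, so there is
			 * nothing for spdk_bdev_io_complete() to decrement.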
3322 */ 3323 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3324 ch->io_outstanding++; 3325 ch->shared_resource->io_outstanding++; 3326 } 3327 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3328 } 3329 } 3330 } 3331 3332 static bool 3333 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3334 { 3335 struct spdk_bdev_io *bdev_io; 3336 3337 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3338 if (bdev_io == bio_to_abort) { 3339 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3340 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3341 return true; 3342 } 3343 } 3344 3345 return false; 3346 } 3347 3348 static bool 3349 bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3350 { 3351 struct spdk_bdev_io *bdev_io; 3352 3353 STAILQ_FOREACH(bdev_io, queue, internal.buf_link) { 3354 if (bdev_io == bio_to_abort) { 3355 STAILQ_REMOVE(queue, bio_to_abort, spdk_bdev_io, internal.buf_link); 3356 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3357 return true; 3358 } 3359 } 3360 3361 return false; 3362 } 3363 3364 static void 3365 bdev_qos_channel_destroy(void *cb_arg) 3366 { 3367 struct spdk_bdev_qos *qos = cb_arg; 3368 3369 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3370 spdk_poller_unregister(&qos->poller); 3371 3372 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3373 3374 free(qos); 3375 } 3376 3377 static int 3378 bdev_qos_destroy(struct spdk_bdev *bdev) 3379 { 3380 int i; 3381 3382 /* 3383 * Cleanly shutting down the QoS poller is tricky, because 3384 * during the asynchronous operation the user could open 3385 * a new descriptor and create a new channel, spawning 3386 * a new QoS poller. 3387 * 3388 * The strategy is to create a new QoS structure here and swap it 3389 * in. The shutdown path then continues to refer to the old one 3390 * until it completes and then releases it. 3391 */ 3392 struct spdk_bdev_qos *new_qos, *old_qos; 3393 3394 old_qos = bdev->internal.qos; 3395 3396 new_qos = calloc(1, sizeof(*new_qos)); 3397 if (!new_qos) { 3398 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3399 return -ENOMEM; 3400 } 3401 3402 /* Copy the old QoS data into the newly allocated structure */ 3403 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3404 3405 /* Zero out the key parts of the QoS structure */ 3406 new_qos->ch = NULL; 3407 new_qos->thread = NULL; 3408 new_qos->poller = NULL; 3409 TAILQ_INIT(&new_qos->queued); 3410 /* 3411 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3412 * It will be used later for the new QoS structure. 3413 */ 3414 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3415 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3416 new_qos->rate_limits[i].min_per_timeslice = 0; 3417 new_qos->rate_limits[i].max_per_timeslice = 0; 3418 } 3419 3420 bdev->internal.qos = new_qos; 3421 3422 if (old_qos->thread == NULL) { 3423 free(old_qos); 3424 } else { 3425 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3426 } 3427 3428 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3429 * been destroyed yet. The destruction path will end up waiting for the final 3430 * channel to be put before it releases resources. 
*/ 3431 3432 return 0; 3433 } 3434 3435 static void 3436 bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3437 { 3438 total->bytes_read += add->bytes_read; 3439 total->num_read_ops += add->num_read_ops; 3440 total->bytes_written += add->bytes_written; 3441 total->num_write_ops += add->num_write_ops; 3442 total->bytes_unmapped += add->bytes_unmapped; 3443 total->num_unmap_ops += add->num_unmap_ops; 3444 total->read_latency_ticks += add->read_latency_ticks; 3445 total->write_latency_ticks += add->write_latency_ticks; 3446 total->unmap_latency_ticks += add->unmap_latency_ticks; 3447 } 3448 3449 static void 3450 bdev_channel_destroy(void *io_device, void *ctx_buf) 3451 { 3452 struct spdk_bdev_channel *ch = ctx_buf; 3453 struct spdk_bdev_mgmt_channel *mgmt_ch; 3454 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3455 3456 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3457 spdk_get_thread()); 3458 3459 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 3460 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3461 3462 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3463 pthread_mutex_lock(&ch->bdev->internal.mutex); 3464 bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); 3465 pthread_mutex_unlock(&ch->bdev->internal.mutex); 3466 3467 mgmt_ch = shared_resource->mgmt_ch; 3468 3469 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3470 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3471 bdev_abort_all_buf_io(&mgmt_ch->need_buf_small, ch); 3472 bdev_abort_all_buf_io(&mgmt_ch->need_buf_large, ch); 3473 3474 if (ch->histogram) { 3475 spdk_histogram_data_free(ch->histogram); 3476 } 3477 3478 bdev_channel_destroy_resource(ch); 3479 } 3480 3481 /* 3482 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 3483 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
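 * On a collision the duplicated name string is freed here and -EEXIST is
 * returned; callers such as spdk_bdev_alias_add() then free their own
 * containing structure.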
3484 */ 3485 static int 3486 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3487 { 3488 struct spdk_bdev_name *tmp; 3489 3490 bdev_name->name = strdup(name); 3491 if (bdev_name->name == NULL) { 3492 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3493 return -ENOMEM; 3494 } 3495 3496 bdev_name->bdev = bdev; 3497 3498 pthread_mutex_lock(&g_bdev_mgr.mutex); 3499 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3500 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3501 3502 if (tmp != NULL) { 3503 SPDK_ERRLOG("Bdev name %s already exists\n", name); 3504 free(bdev_name->name); 3505 return -EEXIST; 3506 } 3507 3508 return 0; 3509 } 3510 3511 static void 3512 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 3513 { 3514 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3515 free(bdev_name->name); 3516 } 3517 3518 static void 3519 bdev_name_del(struct spdk_bdev_name *bdev_name) 3520 { 3521 pthread_mutex_lock(&g_bdev_mgr.mutex); 3522 bdev_name_del_unsafe(bdev_name); 3523 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3524 } 3525 3526 int 3527 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 3528 { 3529 struct spdk_bdev_alias *tmp; 3530 int ret; 3531 3532 if (alias == NULL) { 3533 SPDK_ERRLOG("Empty alias passed\n"); 3534 return -EINVAL; 3535 } 3536 3537 tmp = calloc(1, sizeof(*tmp)); 3538 if (tmp == NULL) { 3539 SPDK_ERRLOG("Unable to allocate alias\n"); 3540 return -ENOMEM; 3541 } 3542 3543 ret = bdev_name_add(&tmp->alias, bdev, alias); 3544 if (ret != 0) { 3545 free(tmp); 3546 return ret; 3547 } 3548 3549 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 3550 3551 return 0; 3552 } 3553 3554 static int 3555 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 3556 void (*alias_del_fn)(struct spdk_bdev_name *n)) 3557 { 3558 struct spdk_bdev_alias *tmp; 3559 3560 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 3561 if (strcmp(alias, tmp->alias.name) == 0) { 3562 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 3563 alias_del_fn(&tmp->alias); 3564 free(tmp); 3565 return 0; 3566 } 3567 } 3568 3569 return -ENOENT; 3570 } 3571 3572 int 3573 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 3574 { 3575 int rc; 3576 3577 rc = bdev_alias_del(bdev, alias, bdev_name_del); 3578 if (rc == -ENOENT) { 3579 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 3580 } 3581 3582 return rc; 3583 } 3584 3585 void 3586 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 3587 { 3588 struct spdk_bdev_alias *p, *tmp; 3589 3590 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 3591 TAILQ_REMOVE(&bdev->aliases, p, tailq); 3592 bdev_name_del(&p->alias); 3593 free(p); 3594 } 3595 } 3596 3597 struct spdk_io_channel * 3598 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 3599 { 3600 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 3601 } 3602 3603 void * 3604 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 3605 { 3606 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3607 void *ctx = NULL; 3608 3609 if (bdev->fn_table->get_module_ctx) { 3610 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 3611 } 3612 3613 return ctx; 3614 } 3615 3616 const char * 3617 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 3618 { 3619 return bdev->module->name; 3620 } 3621 3622 const char * 3623 spdk_bdev_get_name(const struct spdk_bdev *bdev) 3624 { 3625 return bdev->name; 3626 } 3627 3628 const char * 3629 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 3630 { 3631 return bdev->product_name; 3632 } 3633 
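/*
 * Illustrative usage sketch (not part of this file): a bdev consumer can combine
 * the simple accessors below to size and align its data buffers. The descriptor
 * "desc" is assumed to have been obtained from spdk_bdev_open_ext().
 *
 *	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
 *	uint64_t size_bytes = spdk_bdev_get_num_blocks(bdev) *
 *			      (uint64_t)spdk_bdev_get_block_size(bdev);
 *	size_t align = spdk_bdev_get_buf_align(bdev);
 *	void *buf = spdk_dma_zmalloc(spdk_bdev_get_block_size(bdev), align, NULL);
 */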
3634 const struct spdk_bdev_aliases_list * 3635 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 3636 { 3637 return &bdev->aliases; 3638 } 3639 3640 uint32_t 3641 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 3642 { 3643 return bdev->blocklen; 3644 } 3645 3646 uint32_t 3647 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 3648 { 3649 return bdev->write_unit_size; 3650 } 3651 3652 uint64_t 3653 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 3654 { 3655 return bdev->blockcnt; 3656 } 3657 3658 const char * 3659 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 3660 { 3661 return qos_rpc_type[type]; 3662 } 3663 3664 void 3665 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 3666 { 3667 int i; 3668 3669 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 3670 3671 pthread_mutex_lock(&bdev->internal.mutex); 3672 if (bdev->internal.qos) { 3673 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3674 if (bdev->internal.qos->rate_limits[i].limit != 3675 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3676 limits[i] = bdev->internal.qos->rate_limits[i].limit; 3677 if (bdev_qos_is_iops_rate_limit(i) == false) { 3678 /* Change from Byte to Megabyte which is user visible. */ 3679 limits[i] = limits[i] / 1024 / 1024; 3680 } 3681 } 3682 } 3683 } 3684 pthread_mutex_unlock(&bdev->internal.mutex); 3685 } 3686 3687 size_t 3688 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 3689 { 3690 return 1 << bdev->required_alignment; 3691 } 3692 3693 uint32_t 3694 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 3695 { 3696 return bdev->optimal_io_boundary; 3697 } 3698 3699 bool 3700 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 3701 { 3702 return bdev->write_cache; 3703 } 3704 3705 const struct spdk_uuid * 3706 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 3707 { 3708 return &bdev->uuid; 3709 } 3710 3711 uint16_t 3712 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 3713 { 3714 return bdev->acwu; 3715 } 3716 3717 uint32_t 3718 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 3719 { 3720 return bdev->md_len; 3721 } 3722 3723 bool 3724 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 3725 { 3726 return (bdev->md_len != 0) && bdev->md_interleave; 3727 } 3728 3729 bool 3730 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 3731 { 3732 return (bdev->md_len != 0) && !bdev->md_interleave; 3733 } 3734 3735 bool 3736 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 3737 { 3738 return bdev->zoned; 3739 } 3740 3741 uint32_t 3742 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 3743 { 3744 if (spdk_bdev_is_md_interleaved(bdev)) { 3745 return bdev->blocklen - bdev->md_len; 3746 } else { 3747 return bdev->blocklen; 3748 } 3749 } 3750 3751 uint32_t 3752 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 3753 { 3754 return bdev->phys_blocklen; 3755 } 3756 3757 static uint32_t 3758 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 3759 { 3760 if (!spdk_bdev_is_md_interleaved(bdev)) { 3761 return bdev->blocklen + bdev->md_len; 3762 } else { 3763 return bdev->blocklen; 3764 } 3765 } 3766 3767 enum spdk_dif_type spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 3768 { 3769 if (bdev->md_len != 0) { 3770 return bdev->dif_type; 3771 } else { 3772 return SPDK_DIF_DISABLE; 3773 } 3774 } 3775 3776 bool 3777 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 3778 { 3779 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 3780 return bdev->dif_is_head_of_md; 3781 
} else { 3782 return false; 3783 } 3784 } 3785 3786 bool 3787 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 3788 enum spdk_dif_check_type check_type) 3789 { 3790 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 3791 return false; 3792 } 3793 3794 switch (check_type) { 3795 case SPDK_DIF_CHECK_TYPE_REFTAG: 3796 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 3797 case SPDK_DIF_CHECK_TYPE_APPTAG: 3798 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 3799 case SPDK_DIF_CHECK_TYPE_GUARD: 3800 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 3801 default: 3802 return false; 3803 } 3804 } 3805 3806 uint64_t 3807 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 3808 { 3809 return bdev->internal.measured_queue_depth; 3810 } 3811 3812 uint64_t 3813 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 3814 { 3815 return bdev->internal.period; 3816 } 3817 3818 uint64_t 3819 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 3820 { 3821 return bdev->internal.weighted_io_time; 3822 } 3823 3824 uint64_t 3825 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 3826 { 3827 return bdev->internal.io_time; 3828 } 3829 3830 static void 3831 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 3832 { 3833 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3834 3835 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 3836 3837 if (bdev->internal.measured_queue_depth) { 3838 bdev->internal.io_time += bdev->internal.period; 3839 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 3840 } 3841 } 3842 3843 static void 3844 _calculate_measured_qd(struct spdk_io_channel_iter *i) 3845 { 3846 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3847 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 3848 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 3849 3850 bdev->internal.temporary_queue_depth += ch->io_outstanding; 3851 spdk_for_each_channel_continue(i, 0); 3852 } 3853 3854 static int 3855 bdev_calculate_measured_queue_depth(void *ctx) 3856 { 3857 struct spdk_bdev *bdev = ctx; 3858 bdev->internal.temporary_queue_depth = 0; 3859 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 3860 _calculate_measured_qd_cpl); 3861 return SPDK_POLLER_BUSY; 3862 } 3863 3864 void 3865 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 3866 { 3867 bdev->internal.period = period; 3868 3869 if (bdev->internal.qd_poller != NULL) { 3870 spdk_poller_unregister(&bdev->internal.qd_poller); 3871 bdev->internal.measured_queue_depth = UINT64_MAX; 3872 } 3873 3874 if (period != 0) { 3875 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, bdev, 3876 period); 3877 } 3878 } 3879 3880 static void 3881 _resize_notify(void *arg) 3882 { 3883 struct spdk_bdev_desc *desc = arg; 3884 3885 pthread_mutex_lock(&desc->mutex); 3886 desc->refs--; 3887 if (!desc->closed) { 3888 pthread_mutex_unlock(&desc->mutex); 3889 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 3890 desc->bdev, 3891 desc->callback.ctx); 3892 return; 3893 } else if (0 == desc->refs) { 3894 /* This descriptor was closed after this resize_notify message was sent. 3895 * spdk_bdev_close() could not free the descriptor since this message was 3896 * in flight, so we free it now using bdev_desc_free(). 
3897 */ 3898 pthread_mutex_unlock(&desc->mutex); 3899 bdev_desc_free(desc); 3900 return; 3901 } 3902 pthread_mutex_unlock(&desc->mutex); 3903 } 3904 3905 int 3906 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 3907 { 3908 struct spdk_bdev_desc *desc; 3909 int ret; 3910 3911 if (size == bdev->blockcnt) { 3912 return 0; 3913 } 3914 3915 pthread_mutex_lock(&bdev->internal.mutex); 3916 3917 /* bdev has open descriptors */ 3918 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 3919 bdev->blockcnt > size) { 3920 ret = -EBUSY; 3921 } else { 3922 bdev->blockcnt = size; 3923 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 3924 pthread_mutex_lock(&desc->mutex); 3925 if (!desc->closed) { 3926 desc->refs++; 3927 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 3928 } 3929 pthread_mutex_unlock(&desc->mutex); 3930 } 3931 ret = 0; 3932 } 3933 3934 pthread_mutex_unlock(&bdev->internal.mutex); 3935 3936 return ret; 3937 } 3938 3939 /* 3940 * Convert I/O offset and length from bytes to blocks. 3941 * 3942 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 3943 */ 3944 static uint64_t 3945 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 3946 uint64_t num_bytes, uint64_t *num_blocks) 3947 { 3948 uint32_t block_size = bdev->blocklen; 3949 uint8_t shift_cnt; 3950 3951 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 3952 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 3953 shift_cnt = spdk_u32log2(block_size); 3954 *offset_blocks = offset_bytes >> shift_cnt; 3955 *num_blocks = num_bytes >> shift_cnt; 3956 return (offset_bytes - (*offset_blocks << shift_cnt)) | 3957 (num_bytes - (*num_blocks << shift_cnt)); 3958 } else { 3959 *offset_blocks = offset_bytes / block_size; 3960 *num_blocks = num_bytes / block_size; 3961 return (offset_bytes % block_size) | (num_bytes % block_size); 3962 } 3963 } 3964 3965 static bool 3966 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 3967 { 3968 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 3969 * has been an overflow and hence the offset has been wrapped around */ 3970 if (offset_blocks + num_blocks < offset_blocks) { 3971 return false; 3972 } 3973 3974 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 3975 if (offset_blocks + num_blocks > bdev->blockcnt) { 3976 return false; 3977 } 3978 3979 return true; 3980 } 3981 3982 static bool 3983 _bdev_io_check_md_buf(const struct iovec *iovs, const void *md_buf) 3984 { 3985 return _is_buf_allocated(iovs) == (md_buf != NULL); 3986 } 3987 3988 static int 3989 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 3990 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3991 spdk_bdev_io_completion_cb cb, void *cb_arg) 3992 { 3993 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3994 struct spdk_bdev_io *bdev_io; 3995 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3996 3997 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3998 return -EINVAL; 3999 } 4000 4001 bdev_io = bdev_channel_get_io(channel); 4002 if (!bdev_io) { 4003 return -ENOMEM; 4004 } 4005 4006 bdev_io->internal.ch = channel; 4007 bdev_io->internal.desc = desc; 4008 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4009 bdev_io->u.bdev.iovs = &bdev_io->iov; 4010 bdev_io->u.bdev.iovs[0].iov_base = buf; 4011 
bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4012 bdev_io->u.bdev.iovcnt = 1; 4013 bdev_io->u.bdev.md_buf = md_buf; 4014 bdev_io->u.bdev.num_blocks = num_blocks; 4015 bdev_io->u.bdev.offset_blocks = offset_blocks; 4016 bdev_io->u.bdev.ext_opts = NULL; 4017 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4018 4019 bdev_io_submit(bdev_io); 4020 return 0; 4021 } 4022 4023 int 4024 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4025 void *buf, uint64_t offset, uint64_t nbytes, 4026 spdk_bdev_io_completion_cb cb, void *cb_arg) 4027 { 4028 uint64_t offset_blocks, num_blocks; 4029 4030 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4031 nbytes, &num_blocks) != 0) { 4032 return -EINVAL; 4033 } 4034 4035 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4036 } 4037 4038 int 4039 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4040 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4041 spdk_bdev_io_completion_cb cb, void *cb_arg) 4042 { 4043 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4044 } 4045 4046 int 4047 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4048 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4049 spdk_bdev_io_completion_cb cb, void *cb_arg) 4050 { 4051 struct iovec iov = { 4052 .iov_base = buf, 4053 }; 4054 4055 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4056 return -EINVAL; 4057 } 4058 4059 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4060 return -EINVAL; 4061 } 4062 4063 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4064 cb, cb_arg); 4065 } 4066 4067 int 4068 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4069 struct iovec *iov, int iovcnt, 4070 uint64_t offset, uint64_t nbytes, 4071 spdk_bdev_io_completion_cb cb, void *cb_arg) 4072 { 4073 uint64_t offset_blocks, num_blocks; 4074 4075 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4076 nbytes, &num_blocks) != 0) { 4077 return -EINVAL; 4078 } 4079 4080 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4081 } 4082 4083 static int 4084 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4085 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4086 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 4087 struct spdk_bdev_ext_io_opts *opts) 4088 { 4089 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4090 struct spdk_bdev_io *bdev_io; 4091 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4092 4093 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4094 return -EINVAL; 4095 } 4096 4097 bdev_io = bdev_channel_get_io(channel); 4098 if (!bdev_io) { 4099 return -ENOMEM; 4100 } 4101 4102 bdev_io->internal.ch = channel; 4103 bdev_io->internal.desc = desc; 4104 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4105 bdev_io->u.bdev.iovs = iov; 4106 bdev_io->u.bdev.iovcnt = iovcnt; 4107 bdev_io->u.bdev.md_buf = md_buf; 4108 bdev_io->u.bdev.num_blocks = num_blocks; 4109 bdev_io->u.bdev.offset_blocks = offset_blocks; 4110 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4111 bdev_io->internal.ext_opts = opts; 4112 bdev_io->u.bdev.ext_opts = opts; 4113 4114 bdev_io_submit(bdev_io); 4115 return 0; 4116 } 4117 4118 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, 
struct spdk_io_channel *ch, 4119 struct iovec *iov, int iovcnt, 4120 uint64_t offset_blocks, uint64_t num_blocks, 4121 spdk_bdev_io_completion_cb cb, void *cb_arg) 4122 { 4123 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4124 num_blocks, cb, cb_arg, NULL); 4125 } 4126 4127 int 4128 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4129 struct iovec *iov, int iovcnt, void *md_buf, 4130 uint64_t offset_blocks, uint64_t num_blocks, 4131 spdk_bdev_io_completion_cb cb, void *cb_arg) 4132 { 4133 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4134 return -EINVAL; 4135 } 4136 4137 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4138 return -EINVAL; 4139 } 4140 4141 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4142 num_blocks, cb, cb_arg, NULL); 4143 } 4144 4145 int 4146 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4147 struct iovec *iov, int iovcnt, 4148 uint64_t offset_blocks, uint64_t num_blocks, 4149 spdk_bdev_io_completion_cb cb, void *cb_arg, 4150 struct spdk_bdev_ext_io_opts *opts) 4151 { 4152 void *md = NULL; 4153 4154 if (opts) { 4155 md = opts->metadata; 4156 } 4157 4158 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4159 return -EINVAL; 4160 } 4161 4162 if (md && !_bdev_io_check_md_buf(iov, md)) { 4163 return -EINVAL; 4164 } 4165 4166 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4167 num_blocks, cb, cb_arg, opts); 4168 } 4169 4170 static int 4171 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4172 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4173 spdk_bdev_io_completion_cb cb, void *cb_arg) 4174 { 4175 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4176 struct spdk_bdev_io *bdev_io; 4177 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4178 4179 if (!desc->write) { 4180 return -EBADF; 4181 } 4182 4183 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4184 return -EINVAL; 4185 } 4186 4187 bdev_io = bdev_channel_get_io(channel); 4188 if (!bdev_io) { 4189 return -ENOMEM; 4190 } 4191 4192 bdev_io->internal.ch = channel; 4193 bdev_io->internal.desc = desc; 4194 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4195 bdev_io->u.bdev.iovs = &bdev_io->iov; 4196 bdev_io->u.bdev.iovs[0].iov_base = buf; 4197 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4198 bdev_io->u.bdev.iovcnt = 1; 4199 bdev_io->u.bdev.md_buf = md_buf; 4200 bdev_io->u.bdev.num_blocks = num_blocks; 4201 bdev_io->u.bdev.offset_blocks = offset_blocks; 4202 bdev_io->u.bdev.ext_opts = NULL; 4203 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4204 4205 bdev_io_submit(bdev_io); 4206 return 0; 4207 } 4208 4209 int 4210 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4211 void *buf, uint64_t offset, uint64_t nbytes, 4212 spdk_bdev_io_completion_cb cb, void *cb_arg) 4213 { 4214 uint64_t offset_blocks, num_blocks; 4215 4216 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4217 nbytes, &num_blocks) != 0) { 4218 return -EINVAL; 4219 } 4220 4221 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4222 } 4223 4224 int 4225 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4226 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4227 spdk_bdev_io_completion_cb cb, void *cb_arg) 4228 { 4229 return 
bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4230 cb, cb_arg); 4231 } 4232 4233 int 4234 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4235 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4236 spdk_bdev_io_completion_cb cb, void *cb_arg) 4237 { 4238 struct iovec iov = { 4239 .iov_base = buf, 4240 }; 4241 4242 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4243 return -EINVAL; 4244 } 4245 4246 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4247 return -EINVAL; 4248 } 4249 4250 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4251 cb, cb_arg); 4252 } 4253 4254 static int 4255 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4256 struct iovec *iov, int iovcnt, void *md_buf, 4257 uint64_t offset_blocks, uint64_t num_blocks, 4258 spdk_bdev_io_completion_cb cb, void *cb_arg, 4259 struct spdk_bdev_ext_io_opts *opts) 4260 { 4261 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4262 struct spdk_bdev_io *bdev_io; 4263 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4264 4265 if (!desc->write) { 4266 return -EBADF; 4267 } 4268 4269 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4270 return -EINVAL; 4271 } 4272 4273 bdev_io = bdev_channel_get_io(channel); 4274 if (!bdev_io) { 4275 return -ENOMEM; 4276 } 4277 4278 bdev_io->internal.ch = channel; 4279 bdev_io->internal.desc = desc; 4280 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4281 bdev_io->u.bdev.iovs = iov; 4282 bdev_io->u.bdev.iovcnt = iovcnt; 4283 bdev_io->u.bdev.md_buf = md_buf; 4284 bdev_io->u.bdev.num_blocks = num_blocks; 4285 bdev_io->u.bdev.offset_blocks = offset_blocks; 4286 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4287 bdev_io->internal.ext_opts = opts; 4288 bdev_io->u.bdev.ext_opts = opts; 4289 4290 bdev_io_submit(bdev_io); 4291 return 0; 4292 } 4293 4294 int 4295 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4296 struct iovec *iov, int iovcnt, 4297 uint64_t offset, uint64_t len, 4298 spdk_bdev_io_completion_cb cb, void *cb_arg) 4299 { 4300 uint64_t offset_blocks, num_blocks; 4301 4302 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4303 len, &num_blocks) != 0) { 4304 return -EINVAL; 4305 } 4306 4307 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4308 } 4309 4310 int 4311 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4312 struct iovec *iov, int iovcnt, 4313 uint64_t offset_blocks, uint64_t num_blocks, 4314 spdk_bdev_io_completion_cb cb, void *cb_arg) 4315 { 4316 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4317 num_blocks, cb, cb_arg, NULL); 4318 } 4319 4320 int 4321 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4322 struct iovec *iov, int iovcnt, void *md_buf, 4323 uint64_t offset_blocks, uint64_t num_blocks, 4324 spdk_bdev_io_completion_cb cb, void *cb_arg) 4325 { 4326 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4327 return -EINVAL; 4328 } 4329 4330 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4331 return -EINVAL; 4332 } 4333 4334 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4335 num_blocks, cb, cb_arg, NULL); 4336 } 4337 4338 int 4339 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4340 struct iovec *iov, int iovcnt, 4341 
uint64_t offset_blocks, uint64_t num_blocks, 4342 spdk_bdev_io_completion_cb cb, void *cb_arg, 4343 struct spdk_bdev_ext_io_opts *opts) 4344 { 4345 void *md = NULL; 4346 4347 if (opts) { 4348 md = opts->metadata; 4349 } 4350 4351 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4352 return -EINVAL; 4353 } 4354 4355 if (md && !_bdev_io_check_md_buf(iov, md)) { 4356 return -EINVAL; 4357 } 4358 4359 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4360 num_blocks, cb, cb_arg, opts); 4361 } 4362 4363 static void 4364 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4365 { 4366 struct spdk_bdev_io *parent_io = cb_arg; 4367 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 4368 int i, rc = 0; 4369 4370 if (!success) { 4371 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4372 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4373 spdk_bdev_free_io(bdev_io); 4374 return; 4375 } 4376 4377 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 4378 rc = memcmp(read_buf, 4379 parent_io->u.bdev.iovs[i].iov_base, 4380 parent_io->u.bdev.iovs[i].iov_len); 4381 if (rc) { 4382 break; 4383 } 4384 read_buf += parent_io->u.bdev.iovs[i].iov_len; 4385 } 4386 4387 spdk_bdev_free_io(bdev_io); 4388 4389 if (rc == 0) { 4390 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4391 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 4392 } else { 4393 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 4394 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4395 } 4396 } 4397 4398 static void 4399 bdev_compare_do_read(void *_bdev_io) 4400 { 4401 struct spdk_bdev_io *bdev_io = _bdev_io; 4402 int rc; 4403 4404 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 4405 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 4406 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4407 bdev_compare_do_read_done, bdev_io); 4408 4409 if (rc == -ENOMEM) { 4410 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 4411 } else if (rc != 0) { 4412 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4413 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4414 } 4415 } 4416 4417 static int 4418 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4419 struct iovec *iov, int iovcnt, void *md_buf, 4420 uint64_t offset_blocks, uint64_t num_blocks, 4421 spdk_bdev_io_completion_cb cb, void *cb_arg) 4422 { 4423 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4424 struct spdk_bdev_io *bdev_io; 4425 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4426 4427 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4428 return -EINVAL; 4429 } 4430 4431 bdev_io = bdev_channel_get_io(channel); 4432 if (!bdev_io) { 4433 return -ENOMEM; 4434 } 4435 4436 bdev_io->internal.ch = channel; 4437 bdev_io->internal.desc = desc; 4438 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4439 bdev_io->u.bdev.iovs = iov; 4440 bdev_io->u.bdev.iovcnt = iovcnt; 4441 bdev_io->u.bdev.md_buf = md_buf; 4442 bdev_io->u.bdev.num_blocks = num_blocks; 4443 bdev_io->u.bdev.offset_blocks = offset_blocks; 4444 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4445 4446 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4447 bdev_io_submit(bdev_io); 4448 return 0; 4449 } 4450 4451 bdev_compare_do_read(bdev_io); 4452 4453 return 0; 4454 } 4455 4456 int 4457 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct 
spdk_io_channel *ch, 4458 struct iovec *iov, int iovcnt, 4459 uint64_t offset_blocks, uint64_t num_blocks, 4460 spdk_bdev_io_completion_cb cb, void *cb_arg) 4461 { 4462 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4463 num_blocks, cb, cb_arg); 4464 } 4465 4466 int 4467 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4468 struct iovec *iov, int iovcnt, void *md_buf, 4469 uint64_t offset_blocks, uint64_t num_blocks, 4470 spdk_bdev_io_completion_cb cb, void *cb_arg) 4471 { 4472 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4473 return -EINVAL; 4474 } 4475 4476 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4477 return -EINVAL; 4478 } 4479 4480 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4481 num_blocks, cb, cb_arg); 4482 } 4483 4484 static int 4485 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4486 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4487 spdk_bdev_io_completion_cb cb, void *cb_arg) 4488 { 4489 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4490 struct spdk_bdev_io *bdev_io; 4491 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4492 4493 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4494 return -EINVAL; 4495 } 4496 4497 bdev_io = bdev_channel_get_io(channel); 4498 if (!bdev_io) { 4499 return -ENOMEM; 4500 } 4501 4502 bdev_io->internal.ch = channel; 4503 bdev_io->internal.desc = desc; 4504 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4505 bdev_io->u.bdev.iovs = &bdev_io->iov; 4506 bdev_io->u.bdev.iovs[0].iov_base = buf; 4507 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4508 bdev_io->u.bdev.iovcnt = 1; 4509 bdev_io->u.bdev.md_buf = md_buf; 4510 bdev_io->u.bdev.num_blocks = num_blocks; 4511 bdev_io->u.bdev.offset_blocks = offset_blocks; 4512 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4513 4514 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4515 bdev_io_submit(bdev_io); 4516 return 0; 4517 } 4518 4519 bdev_compare_do_read(bdev_io); 4520 4521 return 0; 4522 } 4523 4524 int 4525 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4526 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4527 spdk_bdev_io_completion_cb cb, void *cb_arg) 4528 { 4529 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4530 cb, cb_arg); 4531 } 4532 4533 int 4534 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4535 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4536 spdk_bdev_io_completion_cb cb, void *cb_arg) 4537 { 4538 struct iovec iov = { 4539 .iov_base = buf, 4540 }; 4541 4542 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4543 return -EINVAL; 4544 } 4545 4546 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4547 return -EINVAL; 4548 } 4549 4550 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4551 cb, cb_arg); 4552 } 4553 4554 static void 4555 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 4556 { 4557 struct spdk_bdev_io *bdev_io = ctx; 4558 4559 if (unlock_status) { 4560 SPDK_ERRLOG("LBA range unlock failed\n"); 4561 } 4562 4563 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
true : 4564 false, bdev_io->internal.caller_ctx); 4565 } 4566 4567 static void 4568 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 4569 { 4570 bdev_io->internal.status = status; 4571 4572 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 4573 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4574 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 4575 } 4576 4577 static void 4578 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4579 { 4580 struct spdk_bdev_io *parent_io = cb_arg; 4581 4582 if (!success) { 4583 SPDK_ERRLOG("Compare and write operation failed\n"); 4584 } 4585 4586 spdk_bdev_free_io(bdev_io); 4587 4588 bdev_comparev_and_writev_blocks_unlock(parent_io, 4589 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 4590 } 4591 4592 static void 4593 bdev_compare_and_write_do_write(void *_bdev_io) 4594 { 4595 struct spdk_bdev_io *bdev_io = _bdev_io; 4596 int rc; 4597 4598 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 4599 spdk_io_channel_from_ctx(bdev_io->internal.ch), 4600 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 4601 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4602 bdev_compare_and_write_do_write_done, bdev_io); 4603 4604 4605 if (rc == -ENOMEM) { 4606 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 4607 } else if (rc != 0) { 4608 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 4609 } 4610 } 4611 4612 static void 4613 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4614 { 4615 struct spdk_bdev_io *parent_io = cb_arg; 4616 4617 spdk_bdev_free_io(bdev_io); 4618 4619 if (!success) { 4620 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 4621 return; 4622 } 4623 4624 bdev_compare_and_write_do_write(parent_io); 4625 } 4626 4627 static void 4628 bdev_compare_and_write_do_compare(void *_bdev_io) 4629 { 4630 struct spdk_bdev_io *bdev_io = _bdev_io; 4631 int rc; 4632 4633 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 4634 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 4635 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4636 bdev_compare_and_write_do_compare_done, bdev_io); 4637 4638 if (rc == -ENOMEM) { 4639 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 4640 } else if (rc != 0) { 4641 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 4642 } 4643 } 4644 4645 static void 4646 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 4647 { 4648 struct spdk_bdev_io *bdev_io = ctx; 4649 4650 if (status) { 4651 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 4652 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4653 return; 4654 } 4655 4656 bdev_compare_and_write_do_compare(bdev_io); 4657 } 4658 4659 int 4660 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4661 struct iovec *compare_iov, int compare_iovcnt, 4662 struct iovec *write_iov, int write_iovcnt, 4663 uint64_t offset_blocks, uint64_t num_blocks, 4664 spdk_bdev_io_completion_cb cb, void *cb_arg) 4665 { 4666 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4667 struct spdk_bdev_io *bdev_io; 4668 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4669 4670 if (!desc->write) { 4671 return 
-EBADF; 4672 } 4673 4674 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4675 return -EINVAL; 4676 } 4677 4678 if (num_blocks > bdev->acwu) { 4679 return -EINVAL; 4680 } 4681 4682 bdev_io = bdev_channel_get_io(channel); 4683 if (!bdev_io) { 4684 return -ENOMEM; 4685 } 4686 4687 bdev_io->internal.ch = channel; 4688 bdev_io->internal.desc = desc; 4689 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 4690 bdev_io->u.bdev.iovs = compare_iov; 4691 bdev_io->u.bdev.iovcnt = compare_iovcnt; 4692 bdev_io->u.bdev.fused_iovs = write_iov; 4693 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 4694 bdev_io->u.bdev.md_buf = NULL; 4695 bdev_io->u.bdev.num_blocks = num_blocks; 4696 bdev_io->u.bdev.offset_blocks = offset_blocks; 4697 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4698 4699 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 4700 bdev_io_submit(bdev_io); 4701 return 0; 4702 } 4703 4704 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 4705 bdev_comparev_and_writev_blocks_locked, bdev_io); 4706 } 4707 4708 int 4709 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4710 struct iovec *iov, int iovcnt, 4711 uint64_t offset_blocks, uint64_t num_blocks, 4712 bool populate, 4713 spdk_bdev_io_completion_cb cb, void *cb_arg) 4714 { 4715 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4716 struct spdk_bdev_io *bdev_io; 4717 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4718 4719 if (!desc->write) { 4720 return -EBADF; 4721 } 4722 4723 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4724 return -EINVAL; 4725 } 4726 4727 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 4728 return -ENOTSUP; 4729 } 4730 4731 bdev_io = bdev_channel_get_io(channel); 4732 if (!bdev_io) { 4733 return -ENOMEM; 4734 } 4735 4736 bdev_io->internal.ch = channel; 4737 bdev_io->internal.desc = desc; 4738 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 4739 bdev_io->u.bdev.num_blocks = num_blocks; 4740 bdev_io->u.bdev.offset_blocks = offset_blocks; 4741 bdev_io->u.bdev.iovs = iov; 4742 bdev_io->u.bdev.iovcnt = iovcnt; 4743 bdev_io->u.bdev.md_buf = NULL; 4744 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 4745 bdev_io->u.bdev.zcopy.commit = 0; 4746 bdev_io->u.bdev.zcopy.start = 1; 4747 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4748 4749 bdev_io_submit(bdev_io); 4750 4751 return 0; 4752 } 4753 4754 int 4755 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 4756 spdk_bdev_io_completion_cb cb, void *cb_arg) 4757 { 4758 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 4759 return -EINVAL; 4760 } 4761 4762 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 4763 bdev_io->u.bdev.zcopy.start = 0; 4764 bdev_io->internal.caller_ctx = cb_arg; 4765 bdev_io->internal.cb = cb; 4766 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 4767 4768 bdev_io_submit(bdev_io); 4769 4770 return 0; 4771 } 4772 4773 int 4774 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4775 uint64_t offset, uint64_t len, 4776 spdk_bdev_io_completion_cb cb, void *cb_arg) 4777 { 4778 uint64_t offset_blocks, num_blocks; 4779 4780 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4781 len, &num_blocks) != 0) { 4782 return -EINVAL; 4783 } 4784 4785 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4786 } 4787 4788 int 4789 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4790 uint64_t offset_blocks, uint64_t num_blocks, 4791 spdk_bdev_io_completion_cb cb, void *cb_arg) 4792 { 4793 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4794 struct spdk_bdev_io *bdev_io; 4795 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4796 4797 if (!desc->write) { 4798 return -EBADF; 4799 } 4800 4801 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4802 return -EINVAL; 4803 } 4804 4805 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 4806 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 4807 return -ENOTSUP; 4808 } 4809 4810 bdev_io = bdev_channel_get_io(channel); 4811 4812 if (!bdev_io) { 4813 return -ENOMEM; 4814 } 4815 4816 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 4817 bdev_io->internal.ch = channel; 4818 bdev_io->internal.desc = desc; 4819 bdev_io->u.bdev.offset_blocks = offset_blocks; 4820 bdev_io->u.bdev.num_blocks = num_blocks; 4821 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4822 4823 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 4824 bdev_io_submit(bdev_io); 4825 return 0; 4826 } 4827 4828 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 4829 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 4830 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 4831 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 4832 bdev_write_zero_buffer_next(bdev_io); 4833 4834 return 0; 4835 } 4836 4837 int 4838 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4839 uint64_t offset, uint64_t nbytes, 4840 spdk_bdev_io_completion_cb cb, void *cb_arg) 4841 { 4842 uint64_t offset_blocks, num_blocks; 4843 4844 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4845 nbytes, &num_blocks) != 0) { 4846 return -EINVAL; 4847 } 4848 4849 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4850 } 4851 4852 int 4853 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4854 uint64_t offset_blocks, uint64_t num_blocks, 4855 spdk_bdev_io_completion_cb cb, void *cb_arg) 4856 { 4857 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4858 struct spdk_bdev_io *bdev_io; 4859 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4860 4861 if (!desc->write) { 4862 return -EBADF; 4863 } 4864 4865 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4866 return -EINVAL; 4867 } 4868 4869 if (num_blocks == 0) { 4870 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 4871 return -EINVAL; 4872 } 4873 4874 bdev_io = bdev_channel_get_io(channel); 4875 if (!bdev_io) { 4876 return -ENOMEM; 4877 } 4878 4879 bdev_io->internal.ch = channel; 4880 
bdev_io->internal.desc = desc; 4881 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 4882 4883 bdev_io->u.bdev.iovs = &bdev_io->iov; 4884 bdev_io->u.bdev.iovs[0].iov_base = NULL; 4885 bdev_io->u.bdev.iovs[0].iov_len = 0; 4886 bdev_io->u.bdev.iovcnt = 1; 4887 4888 bdev_io->u.bdev.offset_blocks = offset_blocks; 4889 bdev_io->u.bdev.num_blocks = num_blocks; 4890 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4891 4892 bdev_io_submit(bdev_io); 4893 return 0; 4894 } 4895 4896 int 4897 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4898 uint64_t offset, uint64_t length, 4899 spdk_bdev_io_completion_cb cb, void *cb_arg) 4900 { 4901 uint64_t offset_blocks, num_blocks; 4902 4903 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4904 length, &num_blocks) != 0) { 4905 return -EINVAL; 4906 } 4907 4908 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4909 } 4910 4911 int 4912 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4913 uint64_t offset_blocks, uint64_t num_blocks, 4914 spdk_bdev_io_completion_cb cb, void *cb_arg) 4915 { 4916 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4917 struct spdk_bdev_io *bdev_io; 4918 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4919 4920 if (!desc->write) { 4921 return -EBADF; 4922 } 4923 4924 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4925 return -EINVAL; 4926 } 4927 4928 bdev_io = bdev_channel_get_io(channel); 4929 if (!bdev_io) { 4930 return -ENOMEM; 4931 } 4932 4933 bdev_io->internal.ch = channel; 4934 bdev_io->internal.desc = desc; 4935 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 4936 bdev_io->u.bdev.iovs = NULL; 4937 bdev_io->u.bdev.iovcnt = 0; 4938 bdev_io->u.bdev.offset_blocks = offset_blocks; 4939 bdev_io->u.bdev.num_blocks = num_blocks; 4940 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4941 4942 bdev_io_submit(bdev_io); 4943 return 0; 4944 } 4945 4946 static void 4947 bdev_reset_dev(struct spdk_io_channel_iter *i, int status) 4948 { 4949 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 4950 struct spdk_bdev_io *bdev_io; 4951 4952 bdev_io = TAILQ_FIRST(&ch->queued_resets); 4953 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 4954 bdev_io_submit_reset(bdev_io); 4955 } 4956 4957 static void 4958 bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 4959 { 4960 struct spdk_io_channel *ch; 4961 struct spdk_bdev_channel *channel; 4962 struct spdk_bdev_mgmt_channel *mgmt_channel; 4963 struct spdk_bdev_shared_resource *shared_resource; 4964 bdev_io_tailq_t tmp_queued; 4965 4966 TAILQ_INIT(&tmp_queued); 4967 4968 ch = spdk_io_channel_iter_get_channel(i); 4969 channel = spdk_io_channel_get_ctx(ch); 4970 shared_resource = channel->shared_resource; 4971 mgmt_channel = shared_resource->mgmt_ch; 4972 4973 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 4974 4975 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 4976 /* The QoS object is always valid and readable while 4977 * the channel flag is set, so the lock here should not 4978 * be necessary. We're not in the fast path though, so 4979 * just take it anyway. 
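* Swapping the QoS queue into tmp_queued under the lock lets the queued
 * I/O be aborted below, together with the nomem_io and buf-wait queues,
 * after the mutex has been released.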
*/ 4980 pthread_mutex_lock(&channel->bdev->internal.mutex); 4981 if (channel->bdev->internal.qos->ch == channel) { 4982 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 4983 } 4984 pthread_mutex_unlock(&channel->bdev->internal.mutex); 4985 } 4986 4987 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 4988 bdev_abort_all_buf_io(&mgmt_channel->need_buf_small, channel); 4989 bdev_abort_all_buf_io(&mgmt_channel->need_buf_large, channel); 4990 bdev_abort_all_queued_io(&tmp_queued, channel); 4991 4992 spdk_for_each_channel_continue(i, 0); 4993 } 4994 4995 static void 4996 bdev_start_reset(void *ctx) 4997 { 4998 struct spdk_bdev_channel *ch = ctx; 4999 5000 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel, 5001 ch, bdev_reset_dev); 5002 } 5003 5004 static void 5005 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 5006 { 5007 struct spdk_bdev *bdev = ch->bdev; 5008 5009 assert(!TAILQ_EMPTY(&ch->queued_resets)); 5010 5011 pthread_mutex_lock(&bdev->internal.mutex); 5012 if (bdev->internal.reset_in_progress == NULL) { 5013 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 5014 /* 5015 * Take a channel reference for the target bdev for the life of this 5016 * reset. This guards against the channel getting destroyed while 5017 * spdk_for_each_channel() calls related to this reset IO are in 5018 * progress. We will release the reference when this reset is 5019 * completed. 5020 */ 5021 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 5022 bdev_start_reset(ch); 5023 } 5024 pthread_mutex_unlock(&bdev->internal.mutex); 5025 } 5026 5027 int 5028 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5029 spdk_bdev_io_completion_cb cb, void *cb_arg) 5030 { 5031 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5032 struct spdk_bdev_io *bdev_io; 5033 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5034 5035 bdev_io = bdev_channel_get_io(channel); 5036 if (!bdev_io) { 5037 return -ENOMEM; 5038 } 5039 5040 bdev_io->internal.ch = channel; 5041 bdev_io->internal.desc = desc; 5042 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5043 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 5044 bdev_io->u.reset.ch_ref = NULL; 5045 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5046 5047 pthread_mutex_lock(&bdev->internal.mutex); 5048 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 5049 pthread_mutex_unlock(&bdev->internal.mutex); 5050 5051 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 5052 internal.ch_link); 5053 5054 bdev_channel_start_reset(channel); 5055 5056 return 0; 5057 } 5058 5059 void 5060 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5061 struct spdk_bdev_io_stat *stat) 5062 { 5063 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5064 5065 *stat = channel->stat; 5066 } 5067 5068 static void 5069 bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 5070 { 5071 void *io_device = spdk_io_channel_iter_get_io_device(i); 5072 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 5073 5074 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 5075 bdev_iostat_ctx->cb_arg, 0); 5076 free(bdev_iostat_ctx); 5077 } 5078 5079 static void 5080 bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 5081 { 5082 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 5083 struct 
spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 5084 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5085 5086 bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 5087 spdk_for_each_channel_continue(i, 0); 5088 } 5089 5090 void 5091 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5092 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5093 { 5094 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5095 5096 assert(bdev != NULL); 5097 assert(stat != NULL); 5098 assert(cb != NULL); 5099 5100 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5101 if (bdev_iostat_ctx == NULL) { 5102 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5103 cb(bdev, stat, cb_arg, -ENOMEM); 5104 return; 5105 } 5106 5107 bdev_iostat_ctx->stat = stat; 5108 bdev_iostat_ctx->cb = cb; 5109 bdev_iostat_ctx->cb_arg = cb_arg; 5110 5111 /* Start with the statistics from previously deleted channels. */ 5112 pthread_mutex_lock(&bdev->internal.mutex); 5113 bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 5114 pthread_mutex_unlock(&bdev->internal.mutex); 5115 5116 /* Then iterate and add the statistics from each existing channel. */ 5117 spdk_for_each_channel(__bdev_to_io_dev(bdev), 5118 bdev_get_each_channel_stat, 5119 bdev_iostat_ctx, 5120 bdev_get_device_stat_done); 5121 } 5122 5123 int 5124 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5125 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5126 spdk_bdev_io_completion_cb cb, void *cb_arg) 5127 { 5128 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5129 struct spdk_bdev_io *bdev_io; 5130 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5131 5132 if (!desc->write) { 5133 return -EBADF; 5134 } 5135 5136 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 5137 return -ENOTSUP; 5138 } 5139 5140 bdev_io = bdev_channel_get_io(channel); 5141 if (!bdev_io) { 5142 return -ENOMEM; 5143 } 5144 5145 bdev_io->internal.ch = channel; 5146 bdev_io->internal.desc = desc; 5147 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 5148 bdev_io->u.nvme_passthru.cmd = *cmd; 5149 bdev_io->u.nvme_passthru.buf = buf; 5150 bdev_io->u.nvme_passthru.nbytes = nbytes; 5151 bdev_io->u.nvme_passthru.md_buf = NULL; 5152 bdev_io->u.nvme_passthru.md_len = 0; 5153 5154 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5155 5156 bdev_io_submit(bdev_io); 5157 return 0; 5158 } 5159 5160 int 5161 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5162 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5163 spdk_bdev_io_completion_cb cb, void *cb_arg) 5164 { 5165 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5166 struct spdk_bdev_io *bdev_io; 5167 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5168 5169 if (!desc->write) { 5170 /* 5171 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5172 * to easily determine if the command is a read or write, but for now just 5173 * do not allow io_passthru with a read-only descriptor. 
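*
 * Illustrative sketch (not part of this file; the bdev name, event_cb and
 * io_done_cb are assumed names): a caller that needs passthru access must
 * open the bdev writable first, e.g.:
 *
 *   struct spdk_bdev_desc *desc;
 *   struct spdk_io_channel *ch;
 *   struct spdk_nvme_cmd cmd = { .opc = SPDK_NVME_OPC_FLUSH, .nsid = 1 };
 *
 *   if (spdk_bdev_open_ext("Nvme0n1", true, event_cb, NULL, &desc) == 0) {
 *       ch = spdk_bdev_get_io_channel(desc);
 *       spdk_bdev_nvme_io_passthru(desc, ch, &cmd, NULL, 0, io_done_cb, NULL);
 *   }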
5174 */ 5175 return -EBADF; 5176 } 5177 5178 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 5179 return -ENOTSUP; 5180 } 5181 5182 bdev_io = bdev_channel_get_io(channel); 5183 if (!bdev_io) { 5184 return -ENOMEM; 5185 } 5186 5187 bdev_io->internal.ch = channel; 5188 bdev_io->internal.desc = desc; 5189 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 5190 bdev_io->u.nvme_passthru.cmd = *cmd; 5191 bdev_io->u.nvme_passthru.buf = buf; 5192 bdev_io->u.nvme_passthru.nbytes = nbytes; 5193 bdev_io->u.nvme_passthru.md_buf = NULL; 5194 bdev_io->u.nvme_passthru.md_len = 0; 5195 5196 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5197 5198 bdev_io_submit(bdev_io); 5199 return 0; 5200 } 5201 5202 int 5203 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5204 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 5205 spdk_bdev_io_completion_cb cb, void *cb_arg) 5206 { 5207 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5208 struct spdk_bdev_io *bdev_io; 5209 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5210 5211 if (!desc->write) { 5212 /* 5213 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5214 * to easily determine if the command is a read or write, but for now just 5215 * do not allow io_passthru with a read-only descriptor. 5216 */ 5217 return -EBADF; 5218 } 5219 5220 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 5221 return -ENOTSUP; 5222 } 5223 5224 bdev_io = bdev_channel_get_io(channel); 5225 if (!bdev_io) { 5226 return -ENOMEM; 5227 } 5228 5229 bdev_io->internal.ch = channel; 5230 bdev_io->internal.desc = desc; 5231 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 5232 bdev_io->u.nvme_passthru.cmd = *cmd; 5233 bdev_io->u.nvme_passthru.buf = buf; 5234 bdev_io->u.nvme_passthru.nbytes = nbytes; 5235 bdev_io->u.nvme_passthru.md_buf = md_buf; 5236 bdev_io->u.nvme_passthru.md_len = md_len; 5237 5238 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5239 5240 bdev_io_submit(bdev_io); 5241 return 0; 5242 } 5243 5244 static void bdev_abort_retry(void *ctx); 5245 static void bdev_abort(struct spdk_bdev_io *parent_io); 5246 5247 static void 5248 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5249 { 5250 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 5251 struct spdk_bdev_io *parent_io = cb_arg; 5252 struct spdk_bdev_io *bio_to_abort, *tmp_io; 5253 5254 bio_to_abort = bdev_io->u.abort.bio_to_abort; 5255 5256 spdk_bdev_free_io(bdev_io); 5257 5258 if (!success) { 5259 /* Check if the target I/O completed in the meantime. */ 5260 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 5261 if (tmp_io == bio_to_abort) { 5262 break; 5263 } 5264 } 5265 5266 /* If the target I/O still exists, set the parent to failed. 
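Completed I/O are removed from the channel's io_submitted list in bdev_io_complete(), so finding bio_to_abort there means it is still outstanding and the abort genuinely failed.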
*/ 5267 if (tmp_io != NULL) { 5268 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5269 } 5270 } 5271 5272 parent_io->u.bdev.split_outstanding--; 5273 if (parent_io->u.bdev.split_outstanding == 0) { 5274 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5275 bdev_abort_retry(parent_io); 5276 } else { 5277 bdev_io_complete(parent_io); 5278 } 5279 } 5280 } 5281 5282 static int 5283 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 5284 struct spdk_bdev_io *bio_to_abort, 5285 spdk_bdev_io_completion_cb cb, void *cb_arg) 5286 { 5287 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5288 struct spdk_bdev_io *bdev_io; 5289 5290 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 5291 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 5292 /* TODO: Abort reset or abort request. */ 5293 return -ENOTSUP; 5294 } 5295 5296 bdev_io = bdev_channel_get_io(channel); 5297 if (bdev_io == NULL) { 5298 return -ENOMEM; 5299 } 5300 5301 bdev_io->internal.ch = channel; 5302 bdev_io->internal.desc = desc; 5303 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 5304 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5305 5306 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 5307 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 5308 5309 /* Parent abort request is not submitted directly, but to manage its 5310 * execution, add it to the submitted list here. 5311 */ 5312 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5313 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5314 5315 bdev_abort(bdev_io); 5316 5317 return 0; 5318 } 5319 5320 bdev_io->u.abort.bio_to_abort = bio_to_abort; 5321 5322 /* Submit the abort request to the underlying bdev module. */ 5323 bdev_io_submit(bdev_io); 5324 5325 return 0; 5326 } 5327 5328 static uint32_t 5329 _bdev_abort(struct spdk_bdev_io *parent_io) 5330 { 5331 struct spdk_bdev_desc *desc = parent_io->internal.desc; 5332 struct spdk_bdev_channel *channel = parent_io->internal.ch; 5333 void *bio_cb_arg; 5334 struct spdk_bdev_io *bio_to_abort; 5335 uint32_t matched_ios; 5336 int rc; 5337 5338 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 5339 5340 /* matched_ios is returned and will be kept by the caller. 5341 * 5342 * This function is used in two cases: 1) the same cb_arg is used for 5343 * multiple I/Os, and 2) a single large I/O is split into smaller ones. 5344 * Incrementing split_outstanding directly here could confuse readers, 5345 * especially in the first case. 5346 * 5347 * Completion of an I/O abort is processed after stack unwinding, so this 5348 * approach works as expected. 5349 */ 5350 matched_ios = 0; 5351 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5352 5353 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 5354 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 5355 continue; 5356 } 5357 5358 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 5359 /* Any I/O which was submitted after this abort command should be excluded.
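The abort's submit_tsc was recorded before this walk (in spdk_bdev_abort(), or in bdev_abort_io() for a split abort), so only I/O that were already outstanding at that point are targeted.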
*/ 5360 continue; 5361 } 5362 5363 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 5364 if (rc != 0) { 5365 if (rc == -ENOMEM) { 5366 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 5367 } else { 5368 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5369 } 5370 break; 5371 } 5372 matched_ios++; 5373 } 5374 5375 return matched_ios; 5376 } 5377 5378 static void 5379 bdev_abort_retry(void *ctx) 5380 { 5381 struct spdk_bdev_io *parent_io = ctx; 5382 uint32_t matched_ios; 5383 5384 matched_ios = _bdev_abort(parent_io); 5385 5386 if (matched_ios == 0) { 5387 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5388 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 5389 } else { 5390 /* For a retry, finding no target I/O is treated as success 5391 * because it means the target I/Os completed in the meantime. 5392 */ 5393 bdev_io_complete(parent_io); 5394 } 5395 return; 5396 } 5397 5398 /* Use split_outstanding to manage the progress of aborting I/Os. */ 5399 parent_io->u.bdev.split_outstanding = matched_ios; 5400 } 5401 5402 static void 5403 bdev_abort(struct spdk_bdev_io *parent_io) 5404 { 5405 uint32_t matched_ios; 5406 5407 matched_ios = _bdev_abort(parent_io); 5408 5409 if (matched_ios == 0) { 5410 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5411 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 5412 } else { 5413 /* The case where no target I/O was found is a failure. */ 5414 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5415 bdev_io_complete(parent_io); 5416 } 5417 return; 5418 } 5419 5420 /* Use split_outstanding to manage the progress of aborting I/Os. */ 5421 parent_io->u.bdev.split_outstanding = matched_ios; 5422 } 5423 5424 int 5425 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5426 void *bio_cb_arg, 5427 spdk_bdev_io_completion_cb cb, void *cb_arg) 5428 { 5429 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5430 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5431 struct spdk_bdev_io *bdev_io; 5432 5433 if (bio_cb_arg == NULL) { 5434 return -EINVAL; 5435 } 5436 5437 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 5438 return -ENOTSUP; 5439 } 5440 5441 bdev_io = bdev_channel_get_io(channel); 5442 if (bdev_io == NULL) { 5443 return -ENOMEM; 5444 } 5445 5446 bdev_io->internal.ch = channel; 5447 bdev_io->internal.desc = desc; 5448 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5449 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 5450 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5451 5452 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 5453 5454 /* Parent abort request is not submitted directly, but to manage its execution, 5455 * add it to the submitted list here.
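* It is removed from that list again in bdev_io_complete() once all of the
 * matched child aborts have finished.
 *
 * A minimal usage sketch (illustrative only; my_io_ctx and abort_done are
 * assumed names, not part of this file): a caller that tagged several I/O
 * with the same cb_arg can abort them all at once:
 *
 *   rc = spdk_bdev_abort(desc, ch, my_io_ctx, abort_done, NULL);
 *   if (rc == -ENOMEM) {
 *       // queue an spdk_bdev_io_wait_entry via spdk_bdev_queue_io_wait() and retry
 *   }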
5456 */ 5457 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5458 5459 bdev_abort(bdev_io); 5460 5461 return 0; 5462 } 5463 5464 int 5465 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5466 struct spdk_bdev_io_wait_entry *entry) 5467 { 5468 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5469 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 5470 5471 if (bdev != entry->bdev) { 5472 SPDK_ERRLOG("bdevs do not match\n"); 5473 return -EINVAL; 5474 } 5475 5476 if (mgmt_ch->per_thread_cache_count > 0) { 5477 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 5478 return -EINVAL; 5479 } 5480 5481 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 5482 return 0; 5483 } 5484 5485 static inline void 5486 bdev_io_complete(void *ctx) 5487 { 5488 struct spdk_bdev_io *bdev_io = ctx; 5489 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5490 uint64_t tsc, tsc_diff; 5491 5492 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 5493 /* 5494 * Send the completion to the thread that originally submitted the I/O, 5495 * which may not be the current thread in the case of QoS. 5496 */ 5497 if (bdev_io->internal.io_submit_ch) { 5498 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 5499 bdev_io->internal.io_submit_ch = NULL; 5500 } 5501 5502 /* 5503 * Defer completion to avoid potential infinite recursion if the 5504 * user's completion callback issues a new I/O. 5505 */ 5506 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 5507 bdev_io_complete, bdev_io); 5508 return; 5509 } 5510 5511 tsc = spdk_get_ticks(); 5512 tsc_diff = tsc - bdev_io->internal.submit_tsc; 5513 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 5514 bdev_io->internal.caller_ctx); 5515 5516 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 5517 5518 if (bdev_io->internal.ch->histogram) { 5519 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 5520 } 5521 5522 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5523 switch (bdev_io->type) { 5524 case SPDK_BDEV_IO_TYPE_READ: 5525 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5526 bdev_io->internal.ch->stat.num_read_ops++; 5527 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5528 break; 5529 case SPDK_BDEV_IO_TYPE_WRITE: 5530 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5531 bdev_io->internal.ch->stat.num_write_ops++; 5532 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5533 break; 5534 case SPDK_BDEV_IO_TYPE_UNMAP: 5535 bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5536 bdev_io->internal.ch->stat.num_unmap_ops++; 5537 bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; 5538 break; 5539 case SPDK_BDEV_IO_TYPE_ZCOPY: 5540 /* Track the data in the start phase only */ 5541 if (bdev_io->u.bdev.zcopy.start) { 5542 if (bdev_io->u.bdev.zcopy.populate) { 5543 bdev_io->internal.ch->stat.bytes_read += 5544 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5545 bdev_io->internal.ch->stat.num_read_ops++; 5546 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5547 } else { 5548 bdev_io->internal.ch->stat.bytes_written += 5549 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5550 bdev_io->internal.ch->stat.num_write_ops++; 5551 
bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5552 } 5553 } 5554 break; 5555 default: 5556 break; 5557 } 5558 } 5559 5560 #ifdef SPDK_CONFIG_VTUNE 5561 uint64_t now_tsc = spdk_get_ticks(); 5562 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 5563 uint64_t data[5]; 5564 5565 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 5566 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 5567 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 5568 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 5569 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 5570 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 5571 5572 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 5573 __itt_metadata_u64, 5, data); 5574 5575 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 5576 bdev_io->internal.ch->start_tsc = now_tsc; 5577 } 5578 #endif 5579 5580 assert(bdev_io->internal.cb != NULL); 5581 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 5582 5583 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 5584 bdev_io->internal.caller_ctx); 5585 } 5586 5587 static void 5588 bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 5589 { 5590 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 5591 5592 if (bdev_io->u.reset.ch_ref != NULL) { 5593 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 5594 bdev_io->u.reset.ch_ref = NULL; 5595 } 5596 5597 bdev_io_complete(bdev_io); 5598 } 5599 5600 static void 5601 bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 5602 { 5603 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 5604 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5605 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 5606 struct spdk_bdev_io *queued_reset; 5607 5608 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 5609 while (!TAILQ_EMPTY(&ch->queued_resets)) { 5610 queued_reset = TAILQ_FIRST(&ch->queued_resets); 5611 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 5612 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 5613 } 5614 5615 spdk_for_each_channel_continue(i, 0); 5616 } 5617 5618 void 5619 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 5620 { 5621 struct spdk_bdev *bdev = bdev_io->bdev; 5622 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5623 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 5624 5625 bdev_io->internal.status = status; 5626 5627 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 5628 bool unlock_channels = false; 5629 5630 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 5631 SPDK_ERRLOG("NOMEM returned for reset\n"); 5632 } 5633 pthread_mutex_lock(&bdev->internal.mutex); 5634 if (bdev_io == bdev->internal.reset_in_progress) { 5635 bdev->internal.reset_in_progress = NULL; 5636 unlock_channels = true; 5637 } 5638 pthread_mutex_unlock(&bdev->internal.mutex); 5639 5640 if (unlock_channels) { 5641 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel, 5642 bdev_io, bdev_reset_complete); 5643 return; 5644 } 5645 } else { 5646 if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 5647 _bdev_io_push_bounce_data_buffer(bdev_io, 
_bdev_io_complete_push_bounce_done); 5648 /* bdev IO will be completed in the callback */ 5649 return; 5650 } 5651 5652 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 5653 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 5654 return; 5655 } 5656 } 5657 5658 bdev_io_complete(bdev_io); 5659 } 5660 5661 void 5662 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 5663 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 5664 { 5665 if (sc == SPDK_SCSI_STATUS_GOOD) { 5666 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5667 } else { 5668 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 5669 bdev_io->internal.error.scsi.sc = sc; 5670 bdev_io->internal.error.scsi.sk = sk; 5671 bdev_io->internal.error.scsi.asc = asc; 5672 bdev_io->internal.error.scsi.ascq = ascq; 5673 } 5674 5675 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5676 } 5677 5678 void 5679 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 5680 int *sc, int *sk, int *asc, int *ascq) 5681 { 5682 assert(sc != NULL); 5683 assert(sk != NULL); 5684 assert(asc != NULL); 5685 assert(ascq != NULL); 5686 5687 switch (bdev_io->internal.status) { 5688 case SPDK_BDEV_IO_STATUS_SUCCESS: 5689 *sc = SPDK_SCSI_STATUS_GOOD; 5690 *sk = SPDK_SCSI_SENSE_NO_SENSE; 5691 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5692 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5693 break; 5694 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 5695 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 5696 break; 5697 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 5698 *sc = bdev_io->internal.error.scsi.sc; 5699 *sk = bdev_io->internal.error.scsi.sk; 5700 *asc = bdev_io->internal.error.scsi.asc; 5701 *ascq = bdev_io->internal.error.scsi.ascq; 5702 break; 5703 default: 5704 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 5705 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 5706 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5707 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5708 break; 5709 } 5710 } 5711 5712 void 5713 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 5714 { 5715 if (aio_result == 0) { 5716 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5717 } else { 5718 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 5719 } 5720 5721 bdev_io->internal.error.aio_result = aio_result; 5722 5723 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5724 } 5725 5726 void 5727 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 5728 { 5729 assert(aio_result != NULL); 5730 5731 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 5732 *aio_result = bdev_io->internal.error.aio_result; 5733 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5734 *aio_result = 0; 5735 } else { 5736 *aio_result = -EIO; 5737 } 5738 } 5739 5740 void 5741 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 5742 { 5743 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 5744 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5745 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 5746 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 5747 } else { 5748 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 5749 } 5750 5751 bdev_io->internal.error.nvme.cdw0 = cdw0; 5752 bdev_io->internal.error.nvme.sct = sct; 5753 bdev_io->internal.error.nvme.sc = sc; 5754 5755 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5756 } 5757 5758 void 
5759 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 5760 { 5761 assert(sct != NULL); 5762 assert(sc != NULL); 5763 assert(cdw0 != NULL); 5764 5765 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 5766 *sct = SPDK_NVME_SCT_GENERIC; 5767 *sc = SPDK_NVME_SC_SUCCESS; 5768 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5769 *cdw0 = 0; 5770 } else { 5771 *cdw0 = 1U; 5772 } 5773 return; 5774 } 5775 5776 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 5777 *sct = bdev_io->internal.error.nvme.sct; 5778 *sc = bdev_io->internal.error.nvme.sc; 5779 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5780 *sct = SPDK_NVME_SCT_GENERIC; 5781 *sc = SPDK_NVME_SC_SUCCESS; 5782 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 5783 *sct = SPDK_NVME_SCT_GENERIC; 5784 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 5785 } else { 5786 *sct = SPDK_NVME_SCT_GENERIC; 5787 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5788 } 5789 5790 *cdw0 = bdev_io->internal.error.nvme.cdw0; 5791 } 5792 5793 void 5794 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 5795 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 5796 { 5797 assert(first_sct != NULL); 5798 assert(first_sc != NULL); 5799 assert(second_sct != NULL); 5800 assert(second_sc != NULL); 5801 assert(cdw0 != NULL); 5802 5803 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 5804 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 5805 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 5806 *first_sct = bdev_io->internal.error.nvme.sct; 5807 *first_sc = bdev_io->internal.error.nvme.sc; 5808 *second_sct = SPDK_NVME_SCT_GENERIC; 5809 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5810 } else { 5811 *first_sct = SPDK_NVME_SCT_GENERIC; 5812 *first_sc = SPDK_NVME_SC_SUCCESS; 5813 *second_sct = bdev_io->internal.error.nvme.sct; 5814 *second_sc = bdev_io->internal.error.nvme.sc; 5815 } 5816 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5817 *first_sct = SPDK_NVME_SCT_GENERIC; 5818 *first_sc = SPDK_NVME_SC_SUCCESS; 5819 *second_sct = SPDK_NVME_SCT_GENERIC; 5820 *second_sc = SPDK_NVME_SC_SUCCESS; 5821 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 5822 *first_sct = SPDK_NVME_SCT_GENERIC; 5823 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5824 *second_sct = SPDK_NVME_SCT_GENERIC; 5825 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5826 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 5827 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 5828 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 5829 *second_sct = SPDK_NVME_SCT_GENERIC; 5830 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5831 } else { 5832 *first_sct = SPDK_NVME_SCT_GENERIC; 5833 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5834 *second_sct = SPDK_NVME_SCT_GENERIC; 5835 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5836 } 5837 5838 *cdw0 = bdev_io->internal.error.nvme.cdw0; 5839 } 5840 5841 struct spdk_thread * 5842 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 5843 { 5844 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 5845 } 5846 5847 struct spdk_io_channel * 5848 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 5849 { 5850 return bdev_io->internal.ch->channel; 5851 } 5852 5853 static int 5854 bdev_register(struct spdk_bdev *bdev) 5855 { 5856 char *bdev_name; 5857 char 
uuid[SPDK_UUID_STRING_LEN]; 5858 int ret; 5859 5860 assert(bdev->module != NULL); 5861 5862 if (!bdev->name) { 5863 SPDK_ERRLOG("Bdev name is NULL\n"); 5864 return -EINVAL; 5865 } 5866 5867 if (!strlen(bdev->name)) { 5868 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 5869 return -EINVAL; 5870 } 5871 5872 /* Users often register their own I/O devices using the bdev name. In 5873 * order to avoid conflicts, prepend bdev_. */ 5874 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 5875 if (!bdev_name) { 5876 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 5877 return -ENOMEM; 5878 } 5879 5880 bdev->internal.status = SPDK_BDEV_STATUS_READY; 5881 bdev->internal.measured_queue_depth = UINT64_MAX; 5882 bdev->internal.claim_module = NULL; 5883 bdev->internal.qd_poller = NULL; 5884 bdev->internal.qos = NULL; 5885 5886 TAILQ_INIT(&bdev->internal.open_descs); 5887 TAILQ_INIT(&bdev->internal.locked_ranges); 5888 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 5889 TAILQ_INIT(&bdev->aliases); 5890 5891 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 5892 if (ret != 0) { 5893 free(bdev_name); 5894 return ret; 5895 } 5896 5897 /* If the user didn't specify a uuid, generate one. */ 5898 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 5899 spdk_uuid_generate(&bdev->uuid); 5900 } 5901 5902 /* Add the UUID alias only if it's different than the name */ 5903 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 5904 if (strcmp(bdev->name, uuid) != 0) { 5905 ret = spdk_bdev_alias_add(bdev, uuid); 5906 if (ret != 0) { 5907 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 5908 bdev_name_del(&bdev->internal.bdev_name); 5909 free(bdev_name); 5910 return ret; 5911 } 5912 } 5913 5914 if (spdk_bdev_get_buf_align(bdev) > 1) { 5915 if (bdev->split_on_optimal_io_boundary) { 5916 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 5917 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 5918 } else { 5919 bdev->split_on_optimal_io_boundary = true; 5920 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 5921 } 5922 } 5923 5924 /* If the user didn't specify a write unit size, set it to one. 
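 * A bdev module that supports a larger write unit is expected to have filled in
 * bdev->write_unit_size itself before calling spdk_bdev_register(); this branch
 * only supplies the default.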
*/ 5925 if (bdev->write_unit_size == 0) { 5926 bdev->write_unit_size = 1; 5927 } 5928 5929 /* Set ACWU value to 1 if bdev module did not set it (does not support it natively) */ 5930 if (bdev->acwu == 0) { 5931 bdev->acwu = 1; 5932 } 5933 5934 if (bdev->phys_blocklen == 0) { 5935 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 5936 } 5937 5938 bdev->internal.reset_in_progress = NULL; 5939 5940 spdk_io_device_register(__bdev_to_io_dev(bdev), 5941 bdev_channel_create, bdev_channel_destroy, 5942 sizeof(struct spdk_bdev_channel), 5943 bdev_name); 5944 5945 free(bdev_name); 5946 5947 pthread_mutex_init(&bdev->internal.mutex, NULL); 5948 5949 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 5950 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 5951 5952 return 0; 5953 } 5954 5955 static void 5956 bdev_destroy_cb(void *io_device) 5957 { 5958 int rc; 5959 struct spdk_bdev *bdev; 5960 spdk_bdev_unregister_cb cb_fn; 5961 void *cb_arg; 5962 5963 bdev = __bdev_from_io_dev(io_device); 5964 cb_fn = bdev->internal.unregister_cb; 5965 cb_arg = bdev->internal.unregister_ctx; 5966 5967 pthread_mutex_destroy(&bdev->internal.mutex); 5968 free(bdev->internal.qos); 5969 5970 rc = bdev->fn_table->destruct(bdev->ctxt); 5971 if (rc < 0) { 5972 SPDK_ERRLOG("destruct failed\n"); 5973 } 5974 if (rc <= 0 && cb_fn != NULL) { 5975 cb_fn(cb_arg, rc); 5976 } 5977 } 5978 5979 static void 5980 bdev_register_finished(void *arg) 5981 { 5982 struct spdk_bdev *bdev = arg; 5983 5984 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 5985 } 5986 5987 int 5988 spdk_bdev_register(struct spdk_bdev *bdev) 5989 { 5990 int rc = bdev_register(bdev); 5991 5992 if (rc == 0) { 5993 /* Examine configuration before initializing I/O */ 5994 bdev_examine(bdev); 5995 5996 spdk_bdev_wait_for_examine(bdev_register_finished, bdev); 5997 } 5998 5999 return rc; 6000 } 6001 6002 void 6003 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 6004 { 6005 if (bdev->internal.unregister_cb != NULL) { 6006 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 6007 } 6008 } 6009 6010 static void 6011 _remove_notify(void *arg) 6012 { 6013 struct spdk_bdev_desc *desc = arg; 6014 6015 pthread_mutex_lock(&desc->mutex); 6016 desc->refs--; 6017 6018 if (!desc->closed) { 6019 pthread_mutex_unlock(&desc->mutex); 6020 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 6021 return; 6022 } else if (0 == desc->refs) { 6023 /* This descriptor was closed after this remove_notify message was sent. 6024 * spdk_bdev_close() could not free the descriptor since this message was 6025 * in flight, so we free it now using bdev_desc_free(). 6026 */ 6027 pthread_mutex_unlock(&desc->mutex); 6028 bdev_desc_free(desc); 6029 return; 6030 } 6031 pthread_mutex_unlock(&desc->mutex); 6032 } 6033 6034 /* Must be called while holding g_bdev_mgr.mutex and bdev->internal.mutex. 6035 * returns: 0 - bdev removed and ready to be destructed. 6036 * -EBUSY - bdev can't be destructed yet. */ 6037 static int 6038 bdev_unregister_unsafe(struct spdk_bdev *bdev) 6039 { 6040 struct spdk_bdev_desc *desc, *tmp; 6041 int rc = 0; 6042 char uuid[SPDK_UUID_STRING_LEN]; 6043 6044 /* Notify each descriptor about hotremoval */ 6045 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 6046 rc = -EBUSY; 6047 pthread_mutex_lock(&desc->mutex); 6048 /* 6049 * Defer invocation of the event_cb to a separate message that will 6050 * run later on its thread. 
This ensures this context unwinds and 6051 * we don't recursively unregister this bdev again if the event_cb 6052 * immediately closes its descriptor. 6053 */ 6054 desc->refs++; 6055 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 6056 pthread_mutex_unlock(&desc->mutex); 6057 } 6058 6059 /* If there are no descriptors, proceed removing the bdev */ 6060 if (rc == 0) { 6061 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 6062 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 6063 6064 /* Delete the name and the UUID alias */ 6065 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6066 bdev_name_del_unsafe(&bdev->internal.bdev_name); 6067 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 6068 6069 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 6070 } 6071 6072 return rc; 6073 } 6074 6075 void 6076 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6077 { 6078 struct spdk_thread *thread; 6079 int rc; 6080 6081 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 6082 6083 thread = spdk_get_thread(); 6084 if (!thread) { 6085 /* The user called this from a non-SPDK thread. */ 6086 if (cb_fn != NULL) { 6087 cb_fn(cb_arg, -ENOTSUP); 6088 } 6089 return; 6090 } 6091 6092 pthread_mutex_lock(&g_bdev_mgr.mutex); 6093 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6094 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6095 if (cb_fn) { 6096 cb_fn(cb_arg, -EBUSY); 6097 } 6098 return; 6099 } 6100 6101 pthread_mutex_lock(&bdev->internal.mutex); 6102 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 6103 bdev->internal.unregister_cb = cb_fn; 6104 bdev->internal.unregister_ctx = cb_arg; 6105 6106 /* Call under lock. */ 6107 rc = bdev_unregister_unsafe(bdev); 6108 pthread_mutex_unlock(&bdev->internal.mutex); 6109 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6110 6111 if (rc == 0) { 6112 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6113 } 6114 } 6115 6116 static int 6117 bdev_start_qos(struct spdk_bdev *bdev) 6118 { 6119 struct set_qos_limit_ctx *ctx; 6120 6121 /* Enable QoS */ 6122 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 6123 ctx = calloc(1, sizeof(*ctx)); 6124 if (ctx == NULL) { 6125 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 6126 return -ENOMEM; 6127 } 6128 ctx->bdev = bdev; 6129 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6130 bdev_enable_qos_msg, ctx, 6131 bdev_enable_qos_done); 6132 } 6133 6134 return 0; 6135 } 6136 6137 static int 6138 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 6139 { 6140 struct spdk_thread *thread; 6141 int rc = 0; 6142 6143 thread = spdk_get_thread(); 6144 if (!thread) { 6145 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 6146 return -ENOTSUP; 6147 } 6148 6149 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6150 spdk_get_thread()); 6151 6152 desc->bdev = bdev; 6153 desc->thread = thread; 6154 desc->write = write; 6155 6156 pthread_mutex_lock(&bdev->internal.mutex); 6157 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6158 pthread_mutex_unlock(&bdev->internal.mutex); 6159 return -ENODEV; 6160 } 6161 6162 if (write && bdev->internal.claim_module) { 6163 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 6164 bdev->name, bdev->internal.claim_module->name); 6165 pthread_mutex_unlock(&bdev->internal.mutex); 6166 return -EPERM; 6167 } 6168 6169 rc = bdev_start_qos(bdev); 6170 if (rc != 0) { 6171 SPDK_ERRLOG("Failed 
to start QoS on bdev %s\n", bdev->name); 6172 pthread_mutex_unlock(&bdev->internal.mutex); 6173 return rc; 6174 } 6175 6176 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 6177 6178 pthread_mutex_unlock(&bdev->internal.mutex); 6179 6180 return 0; 6181 } 6182 6183 int 6184 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 6185 void *event_ctx, struct spdk_bdev_desc **_desc) 6186 { 6187 struct spdk_bdev_desc *desc; 6188 struct spdk_bdev *bdev; 6189 unsigned int event_id; 6190 int rc; 6191 6192 if (event_cb == NULL) { 6193 SPDK_ERRLOG("Missing event callback function\n"); 6194 return -EINVAL; 6195 } 6196 6197 pthread_mutex_lock(&g_bdev_mgr.mutex); 6198 6199 bdev = bdev_get_by_name(bdev_name); 6200 6201 if (bdev == NULL) { 6202 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 6203 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6204 return -ENODEV; 6205 } 6206 6207 desc = calloc(1, sizeof(*desc)); 6208 if (desc == NULL) { 6209 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 6210 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6211 return -ENOMEM; 6212 } 6213 6214 TAILQ_INIT(&desc->pending_media_events); 6215 TAILQ_INIT(&desc->free_media_events); 6216 6217 desc->callback.event_fn = event_cb; 6218 desc->callback.ctx = event_ctx; 6219 pthread_mutex_init(&desc->mutex, NULL); 6220 6221 if (bdev->media_events) { 6222 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 6223 sizeof(*desc->media_events_buffer)); 6224 if (desc->media_events_buffer == NULL) { 6225 SPDK_ERRLOG("Failed to initialize media event pool\n"); 6226 bdev_desc_free(desc); 6227 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6228 return -ENOMEM; 6229 } 6230 6231 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 6232 TAILQ_INSERT_TAIL(&desc->free_media_events, 6233 &desc->media_events_buffer[event_id], tailq); 6234 } 6235 } 6236 6237 rc = bdev_open(bdev, write, desc); 6238 if (rc != 0) { 6239 bdev_desc_free(desc); 6240 desc = NULL; 6241 } 6242 6243 *_desc = desc; 6244 6245 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6246 6247 return rc; 6248 } 6249 6250 void 6251 spdk_bdev_close(struct spdk_bdev_desc *desc) 6252 { 6253 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6254 int rc; 6255 6256 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6257 spdk_get_thread()); 6258 6259 assert(desc->thread == spdk_get_thread()); 6260 6261 spdk_poller_unregister(&desc->io_timeout_poller); 6262 6263 pthread_mutex_lock(&g_bdev_mgr.mutex); 6264 pthread_mutex_lock(&bdev->internal.mutex); 6265 pthread_mutex_lock(&desc->mutex); 6266 6267 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 6268 6269 desc->closed = true; 6270 6271 if (0 == desc->refs) { 6272 pthread_mutex_unlock(&desc->mutex); 6273 bdev_desc_free(desc); 6274 } else { 6275 pthread_mutex_unlock(&desc->mutex); 6276 } 6277 6278 /* If no more descriptors, kill QoS channel */ 6279 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6280 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 6281 bdev->name, spdk_get_thread()); 6282 6283 if (bdev_qos_destroy(bdev)) { 6284 /* There isn't anything we can do to recover here. Just let the 6285 * old QoS poller keep running. The QoS handling won't change 6286 * cores when the user allocates a new channel, but it won't break. */ 6287 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 6288 } 6289 } 6290 6291 spdk_bdev_set_qd_sampling_period(bdev, 0); 6292 6293 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6294 rc = bdev_unregister_unsafe(bdev); 6295 pthread_mutex_unlock(&bdev->internal.mutex); 6296 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6297 6298 if (rc == 0) { 6299 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6300 } 6301 } else { 6302 pthread_mutex_unlock(&bdev->internal.mutex); 6303 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6304 } 6305 } 6306 6307 int 6308 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 6309 struct spdk_bdev_module *module) 6310 { 6311 if (bdev->internal.claim_module != NULL) { 6312 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 6313 bdev->internal.claim_module->name); 6314 return -EPERM; 6315 } 6316 6317 if (desc && !desc->write) { 6318 desc->write = true; 6319 } 6320 6321 bdev->internal.claim_module = module; 6322 return 0; 6323 } 6324 6325 void 6326 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 6327 { 6328 assert(bdev->internal.claim_module != NULL); 6329 bdev->internal.claim_module = NULL; 6330 } 6331 6332 struct spdk_bdev * 6333 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 6334 { 6335 assert(desc != NULL); 6336 return desc->bdev; 6337 } 6338 6339 void 6340 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 6341 { 6342 struct iovec *iovs; 6343 int iovcnt; 6344 6345 if (bdev_io == NULL) { 6346 return; 6347 } 6348 6349 switch (bdev_io->type) { 6350 case SPDK_BDEV_IO_TYPE_READ: 6351 case SPDK_BDEV_IO_TYPE_WRITE: 6352 case SPDK_BDEV_IO_TYPE_ZCOPY: 6353 iovs = bdev_io->u.bdev.iovs; 6354 iovcnt = bdev_io->u.bdev.iovcnt; 6355 break; 6356 default: 6357 iovs = NULL; 6358 iovcnt = 0; 6359 break; 6360 } 6361 6362 if (iovp) { 6363 *iovp = iovs; 6364 } 6365 if (iovcntp) { 6366 *iovcntp = iovcnt; 6367 } 6368 } 6369 6370 void * 6371 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 6372 { 6373 if (bdev_io == NULL) { 6374 return NULL; 6375 } 6376 6377 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 6378 return NULL; 6379 } 6380 6381 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 6382 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 6383 return bdev_io->u.bdev.md_buf; 6384 } 6385 6386 return NULL; 6387 } 6388 6389 void * 6390 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 6391 { 6392 if (bdev_io == NULL) { 6393 assert(false); 6394 return NULL; 6395 } 6396 6397 return bdev_io->internal.caller_ctx; 6398 } 6399 6400 void 6401 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 6402 { 6403 6404 if (spdk_bdev_module_list_find(bdev_module->name)) { 6405 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 6406 assert(false); 6407 } 6408 6409 /* 6410 * Modules with examine callbacks must be initialized first, so they are 6411 * ready to handle examine callbacks from later modules that will 6412 * register physical bdevs. 
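 * For example, a module that provides examine_disk() or examine_config() is
 * inserted at the head of g_bdev_mgr.bdev_modules below, while a module with
 * neither examine callback is appended at the tail.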
6413 */ 6414 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 6415 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 6416 } else { 6417 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 6418 } 6419 } 6420 6421 struct spdk_bdev_module * 6422 spdk_bdev_module_list_find(const char *name) 6423 { 6424 struct spdk_bdev_module *bdev_module; 6425 6426 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 6427 if (strcmp(name, bdev_module->name) == 0) { 6428 break; 6429 } 6430 } 6431 6432 return bdev_module; 6433 } 6434 6435 static void 6436 bdev_write_zero_buffer_next(void *_bdev_io) 6437 { 6438 struct spdk_bdev_io *bdev_io = _bdev_io; 6439 uint64_t num_bytes, num_blocks; 6440 void *md_buf = NULL; 6441 int rc; 6442 6443 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 6444 bdev_io->u.bdev.split_remaining_num_blocks, 6445 ZERO_BUFFER_SIZE); 6446 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 6447 6448 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 6449 md_buf = (char *)g_bdev_mgr.zero_buffer + 6450 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 6451 } 6452 6453 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 6454 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6455 g_bdev_mgr.zero_buffer, md_buf, 6456 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 6457 bdev_write_zero_buffer_done, bdev_io); 6458 if (rc == 0) { 6459 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 6460 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 6461 } else if (rc == -ENOMEM) { 6462 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 6463 } else { 6464 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6465 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6466 } 6467 } 6468 6469 static void 6470 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6471 { 6472 struct spdk_bdev_io *parent_io = cb_arg; 6473 6474 spdk_bdev_free_io(bdev_io); 6475 6476 if (!success) { 6477 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6478 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6479 return; 6480 } 6481 6482 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 6483 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6484 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 6485 return; 6486 } 6487 6488 bdev_write_zero_buffer_next(parent_io); 6489 } 6490 6491 static void 6492 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 6493 { 6494 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6495 ctx->bdev->internal.qos_mod_in_progress = false; 6496 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6497 6498 if (ctx->cb_fn) { 6499 ctx->cb_fn(ctx->cb_arg, status); 6500 } 6501 free(ctx); 6502 } 6503 6504 static void 6505 bdev_disable_qos_done(void *cb_arg) 6506 { 6507 struct set_qos_limit_ctx *ctx = cb_arg; 6508 struct spdk_bdev *bdev = ctx->bdev; 6509 struct spdk_bdev_io *bdev_io; 6510 struct spdk_bdev_qos *qos; 6511 6512 pthread_mutex_lock(&bdev->internal.mutex); 6513 qos = bdev->internal.qos; 6514 bdev->internal.qos = NULL; 6515 pthread_mutex_unlock(&bdev->internal.mutex); 6516 6517 while (!TAILQ_EMPTY(&qos->queued)) { 6518 /* Send queued I/O back to their original thread for resubmission. 
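 * Each queued bdev_io is forwarded with spdk_thread_send_msg() to the thread of
 * the channel it was originally submitted on, where _bdev_io_submit() re-drives it.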
*/ 6519 bdev_io = TAILQ_FIRST(&qos->queued); 6520 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 6521 6522 if (bdev_io->internal.io_submit_ch) { 6523 /* 6524 * Channel was changed when sending it to the QoS thread - change it back 6525 * before sending it back to the original thread. 6526 */ 6527 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 6528 bdev_io->internal.io_submit_ch = NULL; 6529 } 6530 6531 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6532 _bdev_io_submit, bdev_io); 6533 } 6534 6535 if (qos->thread != NULL) { 6536 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 6537 spdk_poller_unregister(&qos->poller); 6538 } 6539 6540 free(qos); 6541 6542 bdev_set_qos_limit_done(ctx, 0); 6543 } 6544 6545 static void 6546 bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 6547 { 6548 void *io_device = spdk_io_channel_iter_get_io_device(i); 6549 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6550 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6551 struct spdk_thread *thread; 6552 6553 pthread_mutex_lock(&bdev->internal.mutex); 6554 thread = bdev->internal.qos->thread; 6555 pthread_mutex_unlock(&bdev->internal.mutex); 6556 6557 if (thread != NULL) { 6558 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 6559 } else { 6560 bdev_disable_qos_done(ctx); 6561 } 6562 } 6563 6564 static void 6565 bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 6566 { 6567 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6568 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6569 6570 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 6571 6572 spdk_for_each_channel_continue(i, 0); 6573 } 6574 6575 static void 6576 bdev_update_qos_rate_limit_msg(void *cb_arg) 6577 { 6578 struct set_qos_limit_ctx *ctx = cb_arg; 6579 struct spdk_bdev *bdev = ctx->bdev; 6580 6581 pthread_mutex_lock(&bdev->internal.mutex); 6582 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 6583 pthread_mutex_unlock(&bdev->internal.mutex); 6584 6585 bdev_set_qos_limit_done(ctx, 0); 6586 } 6587 6588 static void 6589 bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 6590 { 6591 void *io_device = spdk_io_channel_iter_get_io_device(i); 6592 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6593 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6594 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6595 6596 pthread_mutex_lock(&bdev->internal.mutex); 6597 bdev_enable_qos(bdev, bdev_ch); 6598 pthread_mutex_unlock(&bdev->internal.mutex); 6599 spdk_for_each_channel_continue(i, 0); 6600 } 6601 6602 static void 6603 bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 6604 { 6605 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6606 6607 bdev_set_qos_limit_done(ctx, status); 6608 } 6609 6610 static void 6611 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 6612 { 6613 int i; 6614 6615 assert(bdev->internal.qos != NULL); 6616 6617 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6618 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6619 bdev->internal.qos->rate_limits[i].limit = limits[i]; 6620 6621 if (limits[i] == 0) { 6622 bdev->internal.qos->rate_limits[i].limit = 6623 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 6624 } 6625 } 6626 } 6627 } 6628 6629 void 6630 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 6631 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 6632 { 6633 struct set_qos_limit_ctx *ctx; 6634 uint32_t 
limit_set_complement; 6635 uint64_t min_limit_per_sec; 6636 int i; 6637 bool disable_rate_limit = true; 6638 6639 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6640 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6641 continue; 6642 } 6643 6644 if (limits[i] > 0) { 6645 disable_rate_limit = false; 6646 } 6647 6648 if (bdev_qos_is_iops_rate_limit(i) == true) { 6649 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 6650 } else { 6651 /* Change from megabyte to byte rate limit */ 6652 limits[i] = limits[i] * 1024 * 1024; 6653 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 6654 } 6655 6656 limit_set_complement = limits[i] % min_limit_per_sec; 6657 if (limit_set_complement) { 6658 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 6659 limits[i], min_limit_per_sec); 6660 limits[i] += min_limit_per_sec - limit_set_complement; 6661 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 6662 } 6663 } 6664 6665 ctx = calloc(1, sizeof(*ctx)); 6666 if (ctx == NULL) { 6667 cb_fn(cb_arg, -ENOMEM); 6668 return; 6669 } 6670 6671 ctx->cb_fn = cb_fn; 6672 ctx->cb_arg = cb_arg; 6673 ctx->bdev = bdev; 6674 6675 pthread_mutex_lock(&bdev->internal.mutex); 6676 if (bdev->internal.qos_mod_in_progress) { 6677 pthread_mutex_unlock(&bdev->internal.mutex); 6678 free(ctx); 6679 cb_fn(cb_arg, -EAGAIN); 6680 return; 6681 } 6682 bdev->internal.qos_mod_in_progress = true; 6683 6684 if (disable_rate_limit == true && bdev->internal.qos) { 6685 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6686 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 6687 (bdev->internal.qos->rate_limits[i].limit > 0 && 6688 bdev->internal.qos->rate_limits[i].limit != 6689 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 6690 disable_rate_limit = false; 6691 break; 6692 } 6693 } 6694 } 6695 6696 if (disable_rate_limit == false) { 6697 if (bdev->internal.qos == NULL) { 6698 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 6699 if (!bdev->internal.qos) { 6700 pthread_mutex_unlock(&bdev->internal.mutex); 6701 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 6702 bdev_set_qos_limit_done(ctx, -ENOMEM); 6703 return; 6704 } 6705 } 6706 6707 if (bdev->internal.qos->thread == NULL) { 6708 /* Enabling */ 6709 bdev_set_qos_rate_limits(bdev, limits); 6710 6711 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6712 bdev_enable_qos_msg, ctx, 6713 bdev_enable_qos_done); 6714 } else { 6715 /* Updating */ 6716 bdev_set_qos_rate_limits(bdev, limits); 6717 6718 spdk_thread_send_msg(bdev->internal.qos->thread, 6719 bdev_update_qos_rate_limit_msg, ctx); 6720 } 6721 } else { 6722 if (bdev->internal.qos != NULL) { 6723 bdev_set_qos_rate_limits(bdev, limits); 6724 6725 /* Disabling */ 6726 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6727 bdev_disable_qos_msg, ctx, 6728 bdev_disable_qos_msg_done); 6729 } else { 6730 pthread_mutex_unlock(&bdev->internal.mutex); 6731 bdev_set_qos_limit_done(ctx, 0); 6732 return; 6733 } 6734 } 6735 6736 pthread_mutex_unlock(&bdev->internal.mutex); 6737 } 6738 6739 struct spdk_bdev_histogram_ctx { 6740 spdk_bdev_histogram_status_cb cb_fn; 6741 void *cb_arg; 6742 struct spdk_bdev *bdev; 6743 int status; 6744 }; 6745 6746 static void 6747 bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status) 6748 { 6749 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6750 6751 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6752 ctx->bdev->internal.histogram_in_progress = false; 6753 
pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6754 ctx->cb_fn(ctx->cb_arg, ctx->status); 6755 free(ctx); 6756 } 6757 6758 static void 6759 bdev_histogram_disable_channel(struct spdk_io_channel_iter *i) 6760 { 6761 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6762 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6763 6764 if (ch->histogram != NULL) { 6765 spdk_histogram_data_free(ch->histogram); 6766 ch->histogram = NULL; 6767 } 6768 spdk_for_each_channel_continue(i, 0); 6769 } 6770 6771 static void 6772 bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status) 6773 { 6774 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6775 6776 if (status != 0) { 6777 ctx->status = status; 6778 ctx->bdev->internal.histogram_enabled = false; 6779 spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx, 6780 bdev_histogram_disable_channel_cb); 6781 } else { 6782 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6783 ctx->bdev->internal.histogram_in_progress = false; 6784 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6785 ctx->cb_fn(ctx->cb_arg, ctx->status); 6786 free(ctx); 6787 } 6788 } 6789 6790 static void 6791 bdev_histogram_enable_channel(struct spdk_io_channel_iter *i) 6792 { 6793 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6794 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6795 int status = 0; 6796 6797 if (ch->histogram == NULL) { 6798 ch->histogram = spdk_histogram_data_alloc(); 6799 if (ch->histogram == NULL) { 6800 status = -ENOMEM; 6801 } 6802 } 6803 6804 spdk_for_each_channel_continue(i, status); 6805 } 6806 6807 void 6808 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 6809 void *cb_arg, bool enable) 6810 { 6811 struct spdk_bdev_histogram_ctx *ctx; 6812 6813 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 6814 if (ctx == NULL) { 6815 cb_fn(cb_arg, -ENOMEM); 6816 return; 6817 } 6818 6819 ctx->bdev = bdev; 6820 ctx->status = 0; 6821 ctx->cb_fn = cb_fn; 6822 ctx->cb_arg = cb_arg; 6823 6824 pthread_mutex_lock(&bdev->internal.mutex); 6825 if (bdev->internal.histogram_in_progress) { 6826 pthread_mutex_unlock(&bdev->internal.mutex); 6827 free(ctx); 6828 cb_fn(cb_arg, -EAGAIN); 6829 return; 6830 } 6831 6832 bdev->internal.histogram_in_progress = true; 6833 pthread_mutex_unlock(&bdev->internal.mutex); 6834 6835 bdev->internal.histogram_enabled = enable; 6836 6837 if (enable) { 6838 /* Allocate histogram for each channel */ 6839 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx, 6840 bdev_histogram_enable_channel_cb); 6841 } else { 6842 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx, 6843 bdev_histogram_disable_channel_cb); 6844 } 6845 } 6846 6847 struct spdk_bdev_histogram_data_ctx { 6848 spdk_bdev_histogram_data_cb cb_fn; 6849 void *cb_arg; 6850 struct spdk_bdev *bdev; 6851 /** merged histogram data from all channels */ 6852 struct spdk_histogram_data *histogram; 6853 }; 6854 6855 static void 6856 bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status) 6857 { 6858 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6859 6860 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 6861 free(ctx); 6862 } 6863 6864 static void 6865 bdev_histogram_get_channel(struct spdk_io_channel_iter *i) 6866 { 6867 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6868 struct spdk_bdev_channel *ch = 
spdk_io_channel_get_ctx(_ch); 6869 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6870 int status = 0; 6871 6872 if (ch->histogram == NULL) { 6873 status = -EFAULT; 6874 } else { 6875 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 6876 } 6877 6878 spdk_for_each_channel_continue(i, status); 6879 } 6880 6881 void 6882 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 6883 spdk_bdev_histogram_data_cb cb_fn, 6884 void *cb_arg) 6885 { 6886 struct spdk_bdev_histogram_data_ctx *ctx; 6887 6888 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 6889 if (ctx == NULL) { 6890 cb_fn(cb_arg, -ENOMEM, NULL); 6891 return; 6892 } 6893 6894 ctx->bdev = bdev; 6895 ctx->cb_fn = cb_fn; 6896 ctx->cb_arg = cb_arg; 6897 6898 ctx->histogram = histogram; 6899 6900 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx, 6901 bdev_histogram_get_channel_cb); 6902 } 6903 6904 size_t 6905 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 6906 size_t max_events) 6907 { 6908 struct media_event_entry *entry; 6909 size_t num_events = 0; 6910 6911 for (; num_events < max_events; ++num_events) { 6912 entry = TAILQ_FIRST(&desc->pending_media_events); 6913 if (entry == NULL) { 6914 break; 6915 } 6916 6917 events[num_events] = entry->event; 6918 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 6919 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 6920 } 6921 6922 return num_events; 6923 } 6924 6925 int 6926 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 6927 size_t num_events) 6928 { 6929 struct spdk_bdev_desc *desc; 6930 struct media_event_entry *entry; 6931 size_t event_id; 6932 int rc = 0; 6933 6934 assert(bdev->media_events); 6935 6936 pthread_mutex_lock(&bdev->internal.mutex); 6937 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 6938 if (desc->write) { 6939 break; 6940 } 6941 } 6942 6943 if (desc == NULL || desc->media_events_buffer == NULL) { 6944 rc = -ENODEV; 6945 goto out; 6946 } 6947 6948 for (event_id = 0; event_id < num_events; ++event_id) { 6949 entry = TAILQ_FIRST(&desc->free_media_events); 6950 if (entry == NULL) { 6951 break; 6952 } 6953 6954 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 6955 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 6956 entry->event = events[event_id]; 6957 } 6958 6959 rc = event_id; 6960 out: 6961 pthread_mutex_unlock(&bdev->internal.mutex); 6962 return rc; 6963 } 6964 6965 void 6966 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 6967 { 6968 struct spdk_bdev_desc *desc; 6969 6970 pthread_mutex_lock(&bdev->internal.mutex); 6971 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 6972 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 6973 desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, 6974 desc->callback.ctx); 6975 } 6976 } 6977 pthread_mutex_unlock(&bdev->internal.mutex); 6978 } 6979 6980 struct locked_lba_range_ctx { 6981 struct lba_range range; 6982 struct spdk_bdev *bdev; 6983 struct lba_range *current_range; 6984 struct lba_range *owner_range; 6985 struct spdk_poller *poller; 6986 lock_range_cb cb_fn; 6987 void *cb_arg; 6988 }; 6989 6990 static void 6991 bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status) 6992 { 6993 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6994 6995 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 6996 free(ctx); 6997 } 6998 6999 static void 7000 
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i); 7001 7002 static void 7003 bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 7004 { 7005 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7006 struct spdk_bdev *bdev = ctx->bdev; 7007 7008 if (status == -ENOMEM) { 7009 /* One of the channels could not allocate a range object. 7010 * So we have to go back and clean up any ranges that were 7011 * allocated successfully before we return error status to 7012 * the caller. We can reuse the unlock function to do that 7013 * clean up. 7014 */ 7015 spdk_for_each_channel(__bdev_to_io_dev(bdev), 7016 bdev_unlock_lba_range_get_channel, ctx, 7017 bdev_lock_error_cleanup_cb); 7018 return; 7019 } 7020 7021 /* All channels have locked this range and no I/O overlapping the range 7022 * are outstanding! Set the owner_ch for the range object for the 7023 * locking channel, so that this channel will know that it is allowed 7024 * to write to this range. 7025 */ 7026 ctx->owner_range->owner_ch = ctx->range.owner_ch; 7027 ctx->cb_fn(ctx->cb_arg, status); 7028 7029 /* Don't free the ctx here. Its range is in the bdev's global list of 7030 * locked ranges still, and will be removed and freed when this range 7031 * is later unlocked. 7032 */ 7033 } 7034 7035 static int 7036 bdev_lock_lba_range_check_io(void *_i) 7037 { 7038 struct spdk_io_channel_iter *i = _i; 7039 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7040 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7041 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7042 struct lba_range *range = ctx->current_range; 7043 struct spdk_bdev_io *bdev_io; 7044 7045 spdk_poller_unregister(&ctx->poller); 7046 7047 /* The range is now in the locked_ranges, so no new IO can be submitted to this 7048 * range. But we need to wait until any outstanding IO overlapping with this range 7049 * are completed. 7050 */ 7051 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 7052 if (bdev_io_range_is_locked(bdev_io, range)) { 7053 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 7054 return SPDK_POLLER_BUSY; 7055 } 7056 } 7057 7058 spdk_for_each_channel_continue(i, 0); 7059 return SPDK_POLLER_BUSY; 7060 } 7061 7062 static void 7063 bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i) 7064 { 7065 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7066 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7067 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7068 struct lba_range *range; 7069 7070 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 7071 if (range->length == ctx->range.length && 7072 range->offset == ctx->range.offset && 7073 range->locked_ctx == ctx->range.locked_ctx) { 7074 /* This range already exists on this channel, so don't add 7075 * it again. This can happen when a new channel is created 7076 * while the for_each_channel operation is in progress. 7077 * Do not check for outstanding I/O in that case, since the 7078 * range was locked before any I/O could be submitted to the 7079 * new channel. 
7080 */ 7081 spdk_for_each_channel_continue(i, 0); 7082 return; 7083 } 7084 } 7085 7086 range = calloc(1, sizeof(*range)); 7087 if (range == NULL) { 7088 spdk_for_each_channel_continue(i, -ENOMEM); 7089 return; 7090 } 7091 7092 range->length = ctx->range.length; 7093 range->offset = ctx->range.offset; 7094 range->locked_ctx = ctx->range.locked_ctx; 7095 ctx->current_range = range; 7096 if (ctx->range.owner_ch == ch) { 7097 /* This is the range object for the channel that will hold 7098 * the lock. Store it in the ctx object so that we can easily 7099 * set its owner_ch after the lock is finally acquired. 7100 */ 7101 ctx->owner_range = range; 7102 } 7103 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 7104 bdev_lock_lba_range_check_io(i); 7105 } 7106 7107 static void 7108 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 7109 { 7110 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 7111 7112 /* We will add a copy of this range to each channel now. */ 7113 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx, 7114 bdev_lock_lba_range_cb); 7115 } 7116 7117 static bool 7118 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 7119 { 7120 struct lba_range *r; 7121 7122 TAILQ_FOREACH(r, tailq, tailq) { 7123 if (bdev_lba_range_overlapped(range, r)) { 7124 return true; 7125 } 7126 } 7127 return false; 7128 } 7129 7130 static int 7131 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 7132 uint64_t offset, uint64_t length, 7133 lock_range_cb cb_fn, void *cb_arg) 7134 { 7135 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7136 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7137 struct locked_lba_range_ctx *ctx; 7138 7139 if (cb_arg == NULL) { 7140 SPDK_ERRLOG("cb_arg must not be NULL\n"); 7141 return -EINVAL; 7142 } 7143 7144 ctx = calloc(1, sizeof(*ctx)); 7145 if (ctx == NULL) { 7146 return -ENOMEM; 7147 } 7148 7149 ctx->range.offset = offset; 7150 ctx->range.length = length; 7151 ctx->range.owner_ch = ch; 7152 ctx->range.locked_ctx = cb_arg; 7153 ctx->bdev = bdev; 7154 ctx->cb_fn = cb_fn; 7155 ctx->cb_arg = cb_arg; 7156 7157 pthread_mutex_lock(&bdev->internal.mutex); 7158 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 7159 /* There is an active lock overlapping with this range. 7160 * Put it on the pending list until this range no 7161 * longer overlaps with another. 7162 */ 7163 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 7164 } else { 7165 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 7166 bdev_lock_lba_range_ctx(bdev, ctx); 7167 } 7168 pthread_mutex_unlock(&bdev->internal.mutex); 7169 return 0; 7170 } 7171 7172 static void 7173 bdev_lock_lba_range_ctx_msg(void *_ctx) 7174 { 7175 struct locked_lba_range_ctx *ctx = _ctx; 7176 7177 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 7178 } 7179 7180 static void 7181 bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 7182 { 7183 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7184 struct locked_lba_range_ctx *pending_ctx; 7185 struct spdk_bdev_channel *ch = ctx->range.owner_ch; 7186 struct spdk_bdev *bdev = ch->bdev; 7187 struct lba_range *range, *tmp; 7188 7189 pthread_mutex_lock(&bdev->internal.mutex); 7190 /* Check if there are any pending locked ranges that overlap with this range 7191 * that was just unlocked. 
If there are, check that it doesn't overlap with any 7192 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 7193 * the lock process. 7194 */ 7195 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 7196 if (bdev_lba_range_overlapped(range, &ctx->range) && 7197 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 7198 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 7199 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 7200 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 7201 spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), 7202 bdev_lock_lba_range_ctx_msg, pending_ctx); 7203 } 7204 } 7205 pthread_mutex_unlock(&bdev->internal.mutex); 7206 7207 ctx->cb_fn(ctx->cb_arg, status); 7208 free(ctx); 7209 } 7210 7211 static void 7212 bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i) 7213 { 7214 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7215 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7216 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7217 TAILQ_HEAD(, spdk_bdev_io) io_locked; 7218 struct spdk_bdev_io *bdev_io; 7219 struct lba_range *range; 7220 7221 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 7222 if (ctx->range.offset == range->offset && 7223 ctx->range.length == range->length && 7224 ctx->range.locked_ctx == range->locked_ctx) { 7225 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 7226 free(range); 7227 break; 7228 } 7229 } 7230 7231 /* Note: we should almost always be able to assert that the range specified 7232 * was found. But there are some very rare corner cases where a new channel 7233 * gets created simultaneously with a range unlock, where this function 7234 * would execute on that new channel and wouldn't have the range. 7235 * We also use this to clean up range allocations when a later allocation 7236 * fails in the locking path. 7237 * So we can't actually assert() here. 7238 */ 7239 7240 /* Swap the locked IO into a temporary list, and then try to submit them again. 7241 * We could hyper-optimize this to only resubmit locked I/O that overlap 7242 * with the range that was just unlocked, but this isn't a performance path so 7243 * we go for simplicity here. 7244 */ 7245 TAILQ_INIT(&io_locked); 7246 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 7247 while (!TAILQ_EMPTY(&io_locked)) { 7248 bdev_io = TAILQ_FIRST(&io_locked); 7249 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 7250 bdev_io_submit(bdev_io); 7251 } 7252 7253 spdk_for_each_channel_continue(i, 0); 7254 } 7255 7256 static int 7257 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 7258 uint64_t offset, uint64_t length, 7259 lock_range_cb cb_fn, void *cb_arg) 7260 { 7261 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7262 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7263 struct locked_lba_range_ctx *ctx; 7264 struct lba_range *range; 7265 bool range_found = false; 7266 7267 /* Let's make sure the specified channel actually has a lock on 7268 * the specified range. Note that the range must match exactly. 
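 * i.e. the same offset, length, owner channel and locked_ctx that were passed
 * to bdev_lock_lba_range().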
7269 */ 7270 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 7271 if (range->offset == offset && range->length == length && 7272 range->owner_ch == ch && range->locked_ctx == cb_arg) { 7273 range_found = true; 7274 break; 7275 } 7276 } 7277 7278 if (!range_found) { 7279 return -EINVAL; 7280 } 7281 7282 pthread_mutex_lock(&bdev->internal.mutex); 7283 /* We confirmed that this channel has locked the specified range. To 7284 * start the unlock the process, we find the range in the bdev's locked_ranges 7285 * and remove it. This ensures new channels don't inherit the locked range. 7286 * Then we will send a message to each channel (including the one specified 7287 * here) to remove the range from its per-channel list. 7288 */ 7289 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 7290 if (range->offset == offset && range->length == length && 7291 range->locked_ctx == cb_arg) { 7292 break; 7293 } 7294 } 7295 if (range == NULL) { 7296 assert(false); 7297 pthread_mutex_unlock(&bdev->internal.mutex); 7298 return -EINVAL; 7299 } 7300 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 7301 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 7302 pthread_mutex_unlock(&bdev->internal.mutex); 7303 7304 ctx->cb_fn = cb_fn; 7305 ctx->cb_arg = cb_arg; 7306 7307 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx, 7308 bdev_unlock_lba_range_cb); 7309 return 0; 7310 } 7311 7312 int 7313 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 7314 int array_size) 7315 { 7316 if (!bdev) { 7317 return -EINVAL; 7318 } 7319 7320 if (bdev->fn_table->get_memory_domains) { 7321 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 7322 } 7323 7324 return 0; 7325 } 7326 7327 SPDK_LOG_REGISTER_COMPONENT(bdev) 7328 7329 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 7330 { 7331 struct spdk_trace_tpoint_opts opts[] = { 7332 { 7333 "BDEV_IO_START", TRACE_BDEV_IO_START, 7334 OWNER_BDEV, OBJECT_BDEV_IO, 1, 7335 { 7336 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 7337 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 7338 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 7339 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 } 7340 } 7341 }, 7342 { 7343 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 7344 OWNER_BDEV, OBJECT_BDEV_IO, 0, 7345 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 7346 }, 7347 { 7348 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 7349 OWNER_BDEV, OBJECT_NONE, 1, 7350 { 7351 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 7352 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 7353 } 7354 }, 7355 { 7356 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 7357 OWNER_BDEV, OBJECT_NONE, 0, 7358 { 7359 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 7360 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 7361 } 7362 }, 7363 }; 7364 7365 7366 spdk_trace_register_owner(OWNER_BDEV, 'b'); 7367 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 7368 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 7369 } 7370
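/*
 * Illustrative usage sketch (not part of the bdev library itself): how a
 * consumer of the public API defined in this file might open a bdev, read a
 * single block, and decode the completion as NVMe status.  The example_* names
 * are hypothetical, error handling is abbreviated, and the code assumes it is
 * called on an SPDK thread of a running application.
 *
 *	#include "spdk/stdinc.h"
 *	#include "spdk/bdev.h"
 *	#include "spdk/env.h"
 *	#include "spdk/thread.h"
 *
 *	struct example_ctx {
 *		struct spdk_bdev_desc	*desc;
 *		struct spdk_io_channel	*ch;
 *		void			*buf;
 *	};
 *
 *	static void
 *	example_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
 *	{
 *		// A real consumer would close its descriptor on SPDK_BDEV_EVENT_REMOVE.
 *	}
 *
 *	static void
 *	example_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		struct example_ctx *ctx = cb_arg;
 *		uint32_t cdw0;
 *		int sct, sc;
 *
 *		// Any completion status can be read back as NVMe status codes.
 *		spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc);
 *		printf("read %s: sct=%d sc=%d\n", success ? "succeeded" : "failed", sct, sc);
 *
 *		// Release the I/O object first, then tear down the channel and descriptor.
 *		spdk_bdev_free_io(bdev_io);
 *		spdk_dma_free(ctx->buf);
 *		spdk_put_io_channel(ctx->ch);
 *		spdk_bdev_close(ctx->desc);
 *		free(ctx);
 *	}
 *
 *	static int
 *	example_read_first_block(const char *bdev_name)
 *	{
 *		struct example_ctx *ctx;
 *		struct spdk_bdev *bdev;
 *		int rc;
 *
 *		ctx = calloc(1, sizeof(*ctx));
 *		if (ctx == NULL) {
 *			return -ENOMEM;
 *		}
 *
 *		rc = spdk_bdev_open_ext(bdev_name, false, example_event_cb, ctx, &ctx->desc);
 *		if (rc != 0) {
 *			free(ctx);
 *			return rc;
 *		}
 *
 *		bdev = spdk_bdev_desc_get_bdev(ctx->desc);
 *		ctx->ch = spdk_bdev_get_io_channel(ctx->desc);
 *		ctx->buf = spdk_dma_zmalloc(spdk_bdev_get_block_size(bdev),
 *					    spdk_bdev_get_buf_align(bdev), NULL);
 *
 *		// Read block 0; example_read_done() consumes the status and cleans up.
 *		rc = spdk_bdev_read_blocks(ctx->desc, ctx->ch, ctx->buf, 0, 1,
 *					   example_read_done, ctx);
 *		return rc;
 *	}
 */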