/*-
 * BSD LICENSE
 *
 * Copyright (c) Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define NOMEM_THRESHOLD_COUNT			8
#define ZERO_BUFFER_SIZE			0x100000

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

#define SPDK_BDEV_POOL_ALIGNMENT 512

/* The maximum number of child requests that may be outstanding at a time when
 * splitting a UNMAP or WRITE ZEROES command into child requests.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
	"rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
};

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	pthread_mutex_t mutex;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.mutex = PTHREAD_MUTEX_INITIALIZER,
};

typedef void (*lock_range_cb)(void *ctx, int status);

struct lba_range {
	uint64_t			offset;
	uint64_t			length;
	void				*locked_ctx;
	struct spdk_bdev_channel	*owner_ch;
	TAILQ_ENTRY(lba_range)		tailq;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.small_buf_pool_size = BUF_SMALL_POOL_SIZE,
	.large_buf_pool_size = BUF_LARGE_POOL_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 *  For remaining bytes, allowed to run negative if an I/O is submitted when
	 *  some bytes are remaining, but the I/O is bigger than that amount. The
	 *  excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

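/*
 * Illustrative note (not upstream documentation): because remaining_this_timeslice
 * is signed, a single large I/O may overdraw the current timeslice. For example, if
 * 4096 bytes remain in a byte-rate timeslice and a 65536-byte write is submitted,
 * the write is sent and remaining_this_timeslice becomes 4096 - 65536 = -61440;
 * that deficit is then paid back out of the quota added for the next timeslice.
 */
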
struct spdk_bdev_qos {
	/** Rate limits, one per rate limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;
	uint32_t	bdev_io_cache_size;

	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * will queue here their IO that awaits retry. It makes it possible to retry sending
 * IO to one bdev after IO from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t		nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t		nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel	*shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t		ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t		io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t		io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t		io_locked;

	uint32_t		flags;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
	struct spdk_bdev_io_stat prev_stat;
#endif

	bdev_io_tailq_t		queued_resets;

	lba_range_tailq_t	locked_ranges;
};

struct media_event_entry {
	struct spdk_bdev_media_event	event;
	TAILQ_ENTRY(media_event_entry)	tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev		*bdev;
	struct spdk_thread		*thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	}				callback;
	bool				closed;
	bool				write;
	pthread_mutex_t			mutex;
	uint32_t			refs;
	TAILQ_HEAD(, media_event_entry)	pending_media_events;
	TAILQ_HEAD(, media_event_entry)	free_media_events;
	struct media_event_entry	*media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc)	link;

	uint64_t		timeout_in_sec;
	spdk_bdev_io_timeout_cb	cb_fn;
	void			*cb_arg;
	struct spdk_poller	*io_timeout_poller;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

/* The bdev layer registers its io_device at a one-byte offset from the bdev pointer
 * so that its registration gets a unique address that cannot collide with a module
 * registering the bdev pointer itself as an io_device.
 */
#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static void bdev_write_zero_buffer_next(void *_bdev_io);

static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i);
static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status);

static int
bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			  struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
			  uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg,
			  struct spdk_bdev_ext_io_opts *opts);
static int
bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			   struct iovec *iov, int iovcnt, void *md_buf,
			   uint64_t offset_blocks, uint64_t num_blocks,
			   spdk_bdev_io_completion_cb cb, void *cb_arg,
			   struct spdk_bdev_ext_io_opts *opts);

static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		    uint64_t offset, uint64_t length,
		    lock_range_cb cb_fn, void *cb_arg);

static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg);

static inline void bdev_io_complete(void *ctx);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort);

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	/* Do not remove this statement. You should always update it when adding a new field,
	 * and do not forget to add the SET_FIELD statement for your added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

	if (opts->small_buf_pool_size < BUF_SMALL_POOL_SIZE) {
		SPDK_ERRLOG("small_buf_pool_size must be at least %" PRIu32 "\n", BUF_SMALL_POOL_SIZE);
		return -1;
	}

	if (opts->large_buf_pool_size < BUF_LARGE_POOL_SIZE) {
		SPDK_ERRLOG("large_buf_pool_size must be at least %" PRIu32 "\n", BUF_LARGE_POOL_SIZE);
		return -1;
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}

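/*
 * Example (illustrative, not part of this file): an application can tune these
 * options by reading the current values and writing back a modified copy.
 * Passing sizeof(opts) as opts_size is what lets the SET_FIELD() logic above copy
 * only the fields both sides know about. With the default cache size of 256 and
 * e.g. 4 threads, bdev_io_pool_size must be at least 256 * (4 + 1) = 1280.
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_pool_size = 128 * 1024 - 1;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		SPDK_ERRLOG("failed to set bdev options\n");
 *	}
 *
 * This is typically done before the bdev subsystem is initialized, since the pool
 * sizes are consumed by spdk_bdev_initialize().
 */
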
static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	pthread_mutex_lock(&g_bdev_mgr.mutex);
	bdev = bdev_get_by_name(bdev_name);
	pthread_mutex_unlock(&g_bdev_mgr.mutex);

	return bdev;
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller		*poller;
	spdk_bdev_wait_for_examine_cb	cb_fn;
	void				*cb_arg;
};

static bool bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;

	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;

	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;

	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	uint32_t action;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config && bdev_ok_to_examine(bdev)) {
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n",
					    module->name);
			}
		}
	}

	if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) {
		if (bdev->internal.claim_module->examine_disk) {
			bdev->internal.claim_module->internal.action_in_progress++;
			bdev->internal.claim_module->examine_disk(bdev);
		}
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_disk && bdev_ok_to_examine(bdev)) {
			module->internal.action_in_progress++;
			module->examine_disk(bdev);
		}
	}
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

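/*
 * Example (illustrative, not part of this file): with bdev_auto_examine disabled,
 * an application explicitly allow-lists the bdevs it wants examined and then waits
 * for all examine callbacks to finish. "Nvme0n1" is a hypothetical bdev name.
 *
 *	static void
 *	examine_finished(void *ctx)
 *	{
 *		SPDK_NOTICELOG("all bdev examine actions completed\n");
 *	}
 *
 *	rc = spdk_bdev_examine("Nvme0n1");
 *	if (rc == 0) {
 *		rc = spdk_bdev_wait_for_examine(examine_finished, NULL);
 *	}
 *
 * spdk_bdev_examine() fails with -EINVAL when auto examine is enabled and with
 * -EEXIST when the name was already registered, as implemented above.
 */
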
static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;

	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_module == NULL) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

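/*
 * Example (illustrative, not part of this file): the "leaf" iterators skip bdevs
 * that are claimed by another module, so the loop below only visits bdevs sitting
 * at the top of their stack.
 *
 *	struct spdk_bdev *bdev;
 *
 *	for (bdev = spdk_bdev_first_leaf(); bdev != NULL; bdev = spdk_bdev_next_leaf(bdev)) {
 *		printf("unclaimed bdev: %s\n", spdk_bdev_get_name(bdev));
 *	}
 */
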
void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static void
_copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt)
{
	int i;
	size_t len;

	for (i = 0; i < iovcnt; i++) {
		len = spdk_min(iovs[i].iov_len, buf_len);
		memcpy(buf, iovs[i].iov_base, len);
		buf += len;
		buf_len -= len;
	}
}

static void
_copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len)
{
	int i;
	size_t len;

	for (i = 0; i < iovcnt; i++) {
		len = spdk_min(iovs[i].iov_len, buf_len);
		memcpy(iovs[i].iov_base, buf, len);
		buf += len;
		buf_len -= len;
	}
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf;
		bdev_io->internal.buf = NULL;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_set_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	/* save original md_buf */
	bdev_io->internal.orig_md_buf = bdev_io->u.bdev.md_buf;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		memcpy(md_buf, bdev_io->internal.orig_md_buf, len);
	}
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_set_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static void
_bdev_io_set_bounce_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	/* save original iovec */
	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;
	/* if this is write path, copy data from original buffer to bounce buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt);
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf = buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_set_bounce_buf(bdev_io, aligned_buf, len);
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}

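/*
 * Illustrative note (not upstream documentation): the rounding expression above is
 * the usual power-of-two round-up. For example, with alignment == 512 (0x200) and
 * buf == 0x13f0, (0x13f0 + 0x1ff) & ~0x1ff == 0x1400, the next 512-byte boundary.
 * The buffer pools are created SPDK_BDEV_POOL_ALIGNMENT bytes larger than the
 * advertised buffer size, so this round-up stays inside the mempool element.
 */
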
static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_mempool *pool;
	struct spdk_bdev_io *tmp;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *ch;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
	alignment = spdk_bdev_get_buf_align(bdev);
	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
	    SPDK_BDEV_POOL_ALIGNMENT) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &ch->need_buf_large;
	}

	if (STAILQ_EMPTY(stailq)) {
		spdk_mempool_put(pool, buf);
	} else {
		tmp = STAILQ_FIRST(stailq);
		STAILQ_REMOVE_HEAD(stailq, internal.buf_link);
		_bdev_io_set_buf(tmp, buf, tmp->internal.buf_len);
	}
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.buf != NULL);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
	bdev_io->internal.buf = NULL;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller.  Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
		bdev_io->internal.ch->io_outstanding++;
		shared_resource->io_outstanding++;
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_io->internal.error.nvme.cdw0 = 0;
		bdev_io->num_retries++;
		bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			break;
		}
	}
}

static inline void
_bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			       struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
		/*
		 * Wait for some of the outstanding I/O to complete before we
		 * retry any of the nomem_io.  Normally we will wait for
		 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
		 * depth channels we will instead wait for half to complete.
		 */
		shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
						   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

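/*
 * Illustrative note (not upstream documentation): with NOMEM_THRESHOLD_COUNT == 8,
 * a channel with 100 I/O outstanding gets nomem_threshold = max(100 / 2, 100 - 8) = 92,
 * i.e. roughly 8 completions must drain before bdev_ch_retry_io() resubmits anything.
 * A low queue depth channel with 10 I/O outstanding gets max(5, 2) = 5, so it instead
 * waits for half of its outstanding I/O to complete.
 */
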
static inline void
_bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io)
{
	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.orig_md_buf != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
		    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
			memcpy(bdev_io->internal.orig_md_buf, bdev_io->u.bdev.md_buf,
			       bdev_io->u.bdev.num_blocks * spdk_bdev_get_md_size(bdev_io->bdev));
		}

		bdev_io->u.bdev.md_buf = bdev_io->internal.orig_md_buf;
		bdev_io->internal.orig_md_buf = NULL;
	}

	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);
}

static void
_bdev_io_push_bounce_data_buffer_done(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;

	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
	/* disable bouncing buffer for this io */
	bdev_io->internal.orig_iovcnt = 0;
	bdev_io->internal.orig_iovs = NULL;

	_bdev_io_push_bounce_md_buffer(bdev_io);
}

static void
_bdev_io_unset_bounce_buf(struct spdk_bdev_io *bdev_io)
{
	if (spdk_likely(bdev_io->internal.orig_iovcnt == 0)) {
		assert(bdev_io->internal.orig_md_buf == NULL);
		return;
	}

	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
	    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
				  bdev_io->internal.orig_iovcnt,
				  bdev_io->internal.bounce_iov.iov_base,
				  bdev_io->internal.bounce_iov.iov_len);
	}

	_bdev_io_push_bounce_data_buffer_done(bdev_io);
}

static void
bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_mempool *pool;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	uint64_t alignment, md_len;
	void *buf;

	alignment = spdk_bdev_get_buf_align(bdev);
	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) +
	    SPDK_BDEV_POOL_ALIGNMENT) {
		SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n",
			    len + alignment);
		bdev_io_get_buf_complete(bdev_io, false);
		return;
	}

	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	bdev_io->internal.buf_len = len;

	if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
	    SPDK_BDEV_POOL_ALIGNMENT) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &mgmt_ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &mgmt_ch->need_buf_large;
	}

	buf = spdk_mempool_get(pool);
	if (!buf) {
		STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link);
	} else {
		_bdev_io_set_buf(bdev_io, buf, len);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t alignment;

	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	alignment = spdk_bdev_get_buf_align(bdev);

	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
		/* Buffer already present and aligned */
		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
		return;
	}

	bdev_io_get_buf(bdev_io, len);
}

void
spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(cb != NULL);
	assert(bdev_io->internal.get_aux_buf_cb == NULL);
	bdev_io->internal.get_aux_buf_cb = cb;
	bdev_io_get_buf(bdev_io, len);
}

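/*
 * Example (illustrative, not part of this file): a bdev module typically calls
 * spdk_bdev_io_get_buf() from its submit_request() for READ I/O that arrived
 * without a data buffer. my_read_get_buf_cb() is a hypothetical module callback.
 *
 *	static void
 *	my_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
 *	{
 *		if (!success) {
 *			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 *			return;
 *		}
 *		// bdev_io->u.bdev.iovs now points at an aligned buffer; start the read
 *	}
 *
 *	case SPDK_BDEV_IO_TYPE_READ:
 *		spdk_bdev_io_get_buf(bdev_io, my_read_get_buf_cb,
 *				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 *		break;
 */
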
static int
bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

static void
bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	int i;
	struct spdk_bdev_qos *qos = bdev->internal.qos;
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	if (!qos) {
		return;
	}

	spdk_bdev_get_qos_rate_limits(bdev, limits);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] > 0) {
			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
		}
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

void
spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_module *bdev_module;
	struct spdk_bdev *bdev;

	assert(w != NULL);

	spdk_json_write_array_begin(w);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_options");
	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
	spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
	spdk_json_write_object_end(w);
	spdk_json_write_object_end(w);

	bdev_examine_allowlist_config_json(w);

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->config_json) {
			bdev_module->config_json(w);
		}
	}

	pthread_mutex_lock(&g_bdev_mgr.mutex);

	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
		if (bdev->fn_table->write_config_json) {
			bdev->fn_table->write_config_json(bdev, w);
		}

		bdev_qos_config_json(bdev, w);
	}

	pthread_mutex_unlock(&g_bdev_mgr.mutex);

	/* This has to be last RPC in array to make sure all bdevs finished examine */
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_wait_for_examine");
	spdk_json_write_object_end(w);

	spdk_json_write_array_end(w);
}

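/*
 * Example (illustrative, hypothetical values): for a bdev named "Malloc0" with a
 * read/write IOPS limit and a read bandwidth limit configured, bdev_qos_config_json()
 * above emits an entry like the following into the subsystem configuration array:
 *
 *	{
 *	  "method": "bdev_set_qos_limit",
 *	  "params": {
 *	    "name": "Malloc0",
 *	    "rw_ios_per_sec": 20000,
 *	    "r_mbytes_per_sec": 100
 *	  }
 *	}
 *
 * The key names come from qos_rpc_type[] and only non-zero limits are written.
 */
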
static int
bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;
	uint32_t i;

	STAILQ_INIT(&ch->need_buf_small);
	STAILQ_INIT(&ch->need_buf_large);

	STAILQ_INIT(&ch->per_thread_cache);
	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;

	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		assert(bdev_io != NULL);
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;

	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
	}

	if (!TAILQ_EMPTY(&ch->shared_resources)) {
		SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
	}

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static void
bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static bool
bdev_module_all_actions_completed(void)
{
	struct spdk_bdev_module *m;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return false;
		}
	}
	return true;
}

static void
bdev_module_action_complete(void)
{
	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	if (!bdev_module_all_actions_completed()) {
		return;
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
	 */
	bdev_init_complete(0);
}

static void
bdev_module_action_done(struct spdk_bdev_module *module)
{
	assert(module->internal.action_in_progress > 0);
	module->internal.action_in_progress--;
	bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
	bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	bdev_module_action_done(module);
}

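/*
 * Example (illustrative, hypothetical module): a bdev module that initializes
 * asynchronously sets async_init and later reports completion through
 * spdk_bdev_module_init_done(); examine_config() must end with
 * spdk_bdev_module_examine_done(), as the check in bdev_examine() enforces.
 *
 *	static struct spdk_bdev_module my_module;
 *
 *	static int
 *	my_module_init(void)
 *	{
 *		// kick off asynchronous setup; call spdk_bdev_module_init_done(&my_module)
 *		// from its completion callback
 *		return 0;
 *	}
 *
 *	static void
 *	my_examine_config(struct spdk_bdev *bdev)
 *	{
 *		// inspect (and possibly claim) the bdev, then:
 *		spdk_bdev_module_examine_done(&my_module);
 *	}
 *
 *	static struct spdk_bdev_module my_module = {
 *		.name = "my_module",
 *		.module_init = my_module_init,
 *		.async_init = true,
 *		.examine_config = my_examine_config,
 *	};
 *	SPDK_BDEV_MODULE_REGISTER(my_module, &my_module)
 */
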
/** The last initialized bdev module */
static struct spdk_bdev_module *g_resume_bdev_module = NULL;

static void
bdev_init_failed(void *cb_arg)
{
	struct spdk_bdev_module *module = cb_arg;

	module->internal.action_in_progress--;
	bdev_init_complete(-1);
}

static int
bdev_modules_init(void)
{
	struct spdk_bdev_module *module;
	int rc = 0;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		g_resume_bdev_module = module;
		if (module->async_init) {
			module->internal.action_in_progress = 1;
		}
		rc = module->module_init();
		if (rc != 0) {
			/* Bump action_in_progress to prevent other modules from completing modules_init.
			 * Send a message to defer application shutdown until resources are cleaned up. */
			module->internal.action_in_progress = 1;
			spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
			return rc;
		}
	}

	g_resume_bdev_module = NULL;
	return 0;
}

void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
{
	int cache_size;
	int rc = 0;
	char mempool_name[32];

	assert(cb_fn != NULL);

	g_init_cb_fn = cb_fn;
	g_init_cb_arg = cb_arg;

	spdk_notify_type_register("bdev_register");
	spdk_notify_type_register("bdev_unregister");

	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());

	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
				  g_bdev_opts.bdev_io_pool_size,
				  sizeof(struct spdk_bdev_io) +
				  bdev_module_get_max_ctx_size(),
				  0,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		bdev_init_complete(-1);
		return;
	}

	/**
	 * Ensure no more than half of the total buffers end up in local caches, by
	 * using spdk_env_get_core_count() to determine how many local caches we need
	 * to account for.
	 */
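	/* Illustrative note (not upstream documentation): spdk_mempool_create() keeps a
	 * per-core cache of cache_size elements. With the default BUF_SMALL_POOL_SIZE of
	 * 8191 and e.g. 4 cores, cache_size = 8191 / (2 * 4) = 1023, so at most
	 * 4 * 1023 = 4092 buffers (just under half of the pool) can sit idle in caches.
	 */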
	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());

	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
				    g_bdev_opts.small_buf_pool_size,
				    SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
				    SPDK_BDEV_POOL_ALIGNMENT,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_small_pool) {
		SPDK_ERRLOG("create rbuf small pool failed\n");
		bdev_init_complete(-1);
		return;
	}

	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());

	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
				    g_bdev_opts.large_buf_pool_size,
				    SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) +
				    SPDK_BDEV_POOL_ALIGNMENT,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_large_pool) {
		SPDK_ERRLOG("create rbuf large pool failed\n");
		bdev_init_complete(-1);
		return;
	}

	g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
					      NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
	if (!g_bdev_mgr.zero_buffer) {
		SPDK_ERRLOG("create bdev zero buffer failed\n");
		bdev_init_complete(-1);
		return;
	}

#ifdef SPDK_CONFIG_VTUNE
	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif

	spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
				bdev_mgmt_channel_destroy,
				sizeof(struct spdk_bdev_mgmt_channel),
				"bdev_mgr");

	rc = bdev_modules_init();
	g_bdev_mgr.module_init_complete = true;
	if (rc != 0) {
		SPDK_ERRLOG("bdev modules init failed\n");
		return;
	}

	bdev_module_action_complete();
}

static void
bdev_mgr_unregister_cb(void *io_device)
{
	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;

	if (g_bdev_mgr.bdev_io_pool) {
		if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
			SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
				    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
				    g_bdev_opts.bdev_io_pool_size);
		}

		spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
	}

	if (g_bdev_mgr.buf_small_pool) {
		if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != g_bdev_opts.small_buf_pool_size) {
			SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
				    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
				    g_bdev_opts.small_buf_pool_size);
			assert(false);
		}

		spdk_mempool_free(g_bdev_mgr.buf_small_pool);
	}

	if (g_bdev_mgr.buf_large_pool) {
		if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != g_bdev_opts.large_buf_pool_size) {
			SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
				    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
				    g_bdev_opts.large_buf_pool_size);
			assert(false);
		}

		spdk_mempool_free(g_bdev_mgr.buf_large_pool);
	}

	spdk_free(g_bdev_mgr.zero_buffer);

	bdev_examine_allowlist_free();

	cb_fn(g_fini_cb_arg);
	g_fini_cb_fn = NULL;
	g_fini_cb_arg = NULL;
	g_bdev_mgr.init_complete = false;
	g_bdev_mgr.module_init_complete = false;
}

static void
bdev_module_fini_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	/* FIXME: Handling initialization failures is broken now,
	 * so we won't even try cleaning up after successfully
	 * initialized modules. If module_init_complete is false,
	 * just call bdev_mgr_unregister_cb().
	 */
	if (!g_bdev_mgr.module_init_complete) {
		bdev_mgr_unregister_cb(NULL);
		return;
	}

	/* Start iterating from the last touched module */
	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
	} else {
		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
					 internal.tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling module_fini()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_fini_done() and re-enter
			 * this function to continue iterating. */
			g_resume_bdev_module = bdev_module;
		}

		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}

		if (bdev_module->async_fini) {
			return;
		}

		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
					 internal.tailq);
	}

	g_resume_bdev_module = NULL;
	spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
}

void
spdk_bdev_module_fini_done(void)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL);
	} else {
		bdev_module_fini_iter(NULL);
	}
}

static void
bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
{
	struct spdk_bdev *bdev = cb_arg;

	if (bdeverrno && bdev) {
		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
			     bdev->name);

		/*
		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
		 * bdev; try to continue by manually removing this bdev from the list and continue
		 * with the next bdev in the list.
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
		/*
		 * Bdev module finish needs to be deferred as we might be in the middle of some context
		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
		 * after returning.
		 */
		spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
		return;
	}

	/*
	 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
	 * base bdevs.
	 *
	 * Also, walk the list in the reverse order.
	 */
	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
		if (bdev->internal.claim_module != NULL) {
			SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n",
				      bdev->name, bdev->internal.claim_module->name);
			continue;
		}

		SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
		return;
	}

	/*
	 * If any bdev fails to unclaim underlying bdev properly, we may face the
	 * case of bdev list consisting of claimed bdevs only (if claims are managed
	 * correctly, this would mean there's a loop in the claims graph which is
	 * clearly impossible). Warn and unregister last bdev on the list then.
	 */
	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
		SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
		return;
	}
}

static void
bdev_module_fini_start_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
	} else {
		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini_start) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling fini_start()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_fini_start_done() and re-enter
			 * this function to continue iterating. */
			g_resume_bdev_module = bdev_module;
		}

		if (bdev_module->fini_start) {
			bdev_module->fini_start();
		}

		if (bdev_module->async_fini_start) {
			return;
		}

		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq);
	}

	g_resume_bdev_module = NULL;

	bdev_finish_unregister_bdevs_iter(NULL, 0);
}

void
spdk_bdev_module_fini_start_done(void)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL);
	} else {
		bdev_module_fini_start_iter(NULL);
	}
}

void
spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
{
	assert(cb_fn != NULL);

	g_fini_thread = spdk_get_thread();

	g_fini_cb_fn = cb_fn;
	g_fini_cb_arg = cb_arg;

	bdev_module_fini_start_iter(NULL);
}

struct spdk_bdev_io *
bdev_channel_get_io(struct spdk_bdev_channel *channel)
{
	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
	struct spdk_bdev_io *bdev_io;

	if (ch->per_thread_cache_count > 0) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
	} else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
		/*
		 * Don't try to look for bdev_ios in the global pool if there are
		 * waiters on bdev_ios - we don't want this caller to jump the line.
		 */
		bdev_io = NULL;
	} else {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
	}

	return bdev_io;
}

void
spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_mgmt_channel *ch;

	assert(bdev_io != NULL);
	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	if (bdev_io->internal.buf != NULL) {
		bdev_io_put_buf(bdev_io);
	}

	if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
		while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
			struct spdk_bdev_io_wait_entry *entry;

			entry = TAILQ_FIRST(&ch->io_wait_queue);
			TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
			entry->cb_fn(entry->cb_arg);
		}
	} else {
		/* We should never have a full cache with entries on the io wait queue. */
		assert(TAILQ_EMPTY(&ch->io_wait_queue));
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}
}

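/*
 * Example (illustrative, not part of this file): when an I/O submission call such as
 * spdk_bdev_read_blocks() returns -ENOMEM because no spdk_bdev_io could be allocated,
 * the caller can park a wait entry on this channel; spdk_bdev_free_io() above invokes
 * the callback as soon as a bdev_io returns to the per-thread cache. struct my_io_ctx,
 * retry_read() and read_done() are hypothetical caller code.
 *
 *	static void
 *	retry_read(void *arg)
 *	{
 *		struct my_io_ctx *ctx = arg;
 *
 *		spdk_bdev_read_blocks(ctx->desc, ctx->ch, ctx->buf, ctx->offset_blocks,
 *				      ctx->num_blocks, read_done, ctx);
 *	}
 *
 *	rc = spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, read_done, ctx);
 *	if (rc == -ENOMEM) {
 *		ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(desc);
 *		ctx->wait_entry.cb_fn = retry_read;
 *		ctx->wait_entry.cb_arg = ctx;
 *		spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, ch, &ctx->wait_entry);
 *	}
 */
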
1920 { 1921 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 1922 return true; 1923 } else { 1924 return false; 1925 } 1926 } 1927 1928 static bool 1929 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1930 { 1931 if (bdev_is_read_io(io) == false) { 1932 return false; 1933 } 1934 1935 return bdev_qos_rw_queue_io(limit, io); 1936 } 1937 1938 static bool 1939 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1940 { 1941 if (bdev_is_read_io(io) == true) { 1942 return false; 1943 } 1944 1945 return bdev_qos_rw_queue_io(limit, io); 1946 } 1947 1948 static void 1949 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1950 { 1951 limit->remaining_this_timeslice--; 1952 } 1953 1954 static void 1955 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1956 { 1957 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 1958 } 1959 1960 static void 1961 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1962 { 1963 if (bdev_is_read_io(io) == false) { 1964 return; 1965 } 1966 1967 return bdev_qos_rw_bps_update_quota(limit, io); 1968 } 1969 1970 static void 1971 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1972 { 1973 if (bdev_is_read_io(io) == true) { 1974 return; 1975 } 1976 1977 return bdev_qos_rw_bps_update_quota(limit, io); 1978 } 1979 1980 static void 1981 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 1982 { 1983 int i; 1984 1985 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1986 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 1987 qos->rate_limits[i].queue_io = NULL; 1988 qos->rate_limits[i].update_quota = NULL; 1989 continue; 1990 } 1991 1992 switch (i) { 1993 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1994 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 1995 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 1996 break; 1997 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1998 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 1999 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2000 break; 2001 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2002 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2003 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2004 break; 2005 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2006 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2007 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2008 break; 2009 default: 2010 break; 2011 } 2012 } 2013 } 2014 2015 static void 2016 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2017 struct spdk_bdev_io *bdev_io, 2018 enum spdk_bdev_io_status status) 2019 { 2020 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2021 2022 bdev_io->internal.in_submit_request = true; 2023 bdev_ch->io_outstanding++; 2024 shared_resource->io_outstanding++; 2025 spdk_bdev_io_complete(bdev_io, status); 2026 bdev_io->internal.in_submit_request = false; 2027 } 2028 2029 static inline void 2030 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2031 { 2032 struct spdk_bdev *bdev = bdev_io->bdev; 2033 struct spdk_io_channel *ch = bdev_ch->channel; 2034 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2035 2036 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2037 struct spdk_bdev_mgmt_channel *mgmt_channel = 
shared_resource->mgmt_ch; 2038 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2039 2040 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2041 bdev_abort_buf_io(&mgmt_channel->need_buf_small, bio_to_abort) || 2042 bdev_abort_buf_io(&mgmt_channel->need_buf_large, bio_to_abort)) { 2043 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2044 SPDK_BDEV_IO_STATUS_SUCCESS); 2045 return; 2046 } 2047 } 2048 2049 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2050 bdev_ch->io_outstanding++; 2051 shared_resource->io_outstanding++; 2052 bdev_io->internal.in_submit_request = true; 2053 bdev->fn_table->submit_request(ch, bdev_io); 2054 bdev_io->internal.in_submit_request = false; 2055 } else { 2056 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2057 } 2058 } 2059 2060 static int 2061 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2062 { 2063 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2064 int i, submitted_ios = 0; 2065 2066 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2067 if (bdev_qos_io_to_limit(bdev_io) == true) { 2068 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2069 if (!qos->rate_limits[i].queue_io) { 2070 continue; 2071 } 2072 2073 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2074 bdev_io) == true) { 2075 return submitted_ios; 2076 } 2077 } 2078 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2079 if (!qos->rate_limits[i].update_quota) { 2080 continue; 2081 } 2082 2083 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2084 } 2085 } 2086 2087 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2088 bdev_io_do_submit(ch, bdev_io); 2089 submitted_ios++; 2090 } 2091 2092 return submitted_ios; 2093 } 2094 2095 static void 2096 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2097 { 2098 int rc; 2099 2100 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2101 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2102 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2103 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2104 &bdev_io->internal.waitq_entry); 2105 if (rc != 0) { 2106 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2107 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2108 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2109 } 2110 } 2111 2112 static bool 2113 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2114 { 2115 uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary; 2116 uint32_t max_size = bdev_io->bdev->max_segment_size; 2117 int max_segs = bdev_io->bdev->max_num_segments; 2118 2119 io_boundary = bdev_io->bdev->split_on_optimal_io_boundary ? io_boundary : 0; 2120 2121 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2122 return false; 2123 } 2124 2125 if (io_boundary) { 2126 uint64_t start_stripe, end_stripe; 2127 2128 start_stripe = bdev_io->u.bdev.offset_blocks; 2129 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2130 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
*/ 2131 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2132 start_stripe >>= spdk_u32log2(io_boundary); 2133 end_stripe >>= spdk_u32log2(io_boundary); 2134 } else { 2135 start_stripe /= io_boundary; 2136 end_stripe /= io_boundary; 2137 } 2138 2139 if (start_stripe != end_stripe) { 2140 return true; 2141 } 2142 } 2143 2144 if (max_segs) { 2145 if (bdev_io->u.bdev.iovcnt > max_segs) { 2146 return true; 2147 } 2148 } 2149 2150 if (max_size) { 2151 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2152 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2153 return true; 2154 } 2155 } 2156 } 2157 2158 return false; 2159 } 2160 2161 static bool 2162 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2163 { 2164 uint32_t num_unmap_segments; 2165 2166 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2167 return false; 2168 } 2169 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2170 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2171 return true; 2172 } 2173 2174 return false; 2175 } 2176 2177 static bool 2178 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2179 { 2180 if (!bdev_io->bdev->max_write_zeroes) { 2181 return false; 2182 } 2183 2184 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2185 return true; 2186 } 2187 2188 return false; 2189 } 2190 2191 static bool 2192 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2193 { 2194 switch (bdev_io->type) { 2195 case SPDK_BDEV_IO_TYPE_READ: 2196 case SPDK_BDEV_IO_TYPE_WRITE: 2197 return bdev_rw_should_split(bdev_io); 2198 case SPDK_BDEV_IO_TYPE_UNMAP: 2199 return bdev_unmap_should_split(bdev_io); 2200 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2201 return bdev_write_zeroes_should_split(bdev_io); 2202 default: 2203 return false; 2204 } 2205 } 2206 2207 static uint32_t 2208 _to_next_boundary(uint64_t offset, uint32_t boundary) 2209 { 2210 return (boundary - (offset % boundary)); 2211 } 2212 2213 static void 2214 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2215 2216 static void 2217 _bdev_rw_split(void *_bdev_io); 2218 2219 static void 2220 bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2221 2222 static void 2223 _bdev_unmap_split(void *_bdev_io) 2224 { 2225 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2226 } 2227 2228 static void 2229 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2230 2231 static void 2232 _bdev_write_zeroes_split(void *_bdev_io) 2233 { 2234 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2235 } 2236 2237 static int 2238 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2239 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2240 { 2241 int rc; 2242 uint64_t current_offset, current_remaining; 2243 spdk_bdev_io_wait_cb io_wait_fn; 2244 2245 current_offset = *offset; 2246 current_remaining = *remaining; 2247 2248 bdev_io->u.bdev.split_outstanding++; 2249 2250 io_wait_fn = _bdev_rw_split; 2251 switch (bdev_io->type) { 2252 case SPDK_BDEV_IO_TYPE_READ: 2253 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2254 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2255 iov, iovcnt, md_buf, current_offset, 2256 num_blocks, 2257 bdev_io_split_done, bdev_io, 2258 bdev_io->internal.ext_opts); 2259 break; 2260 case SPDK_BDEV_IO_TYPE_WRITE: 2261 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2262 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2263 iov, iovcnt, md_buf, current_offset, 2264 
num_blocks, 2265 bdev_io_split_done, bdev_io, 2266 bdev_io->internal.ext_opts); 2267 break; 2268 case SPDK_BDEV_IO_TYPE_UNMAP: 2269 io_wait_fn = _bdev_unmap_split; 2270 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2271 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2272 current_offset, num_blocks, 2273 bdev_io_split_done, bdev_io); 2274 break; 2275 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2276 io_wait_fn = _bdev_write_zeroes_split; 2277 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2278 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2279 current_offset, num_blocks, 2280 bdev_io_split_done, bdev_io); 2281 break; 2282 default: 2283 assert(false); 2284 rc = -EINVAL; 2285 break; 2286 } 2287 2288 if (rc == 0) { 2289 current_offset += num_blocks; 2290 current_remaining -= num_blocks; 2291 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2292 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2293 *offset = current_offset; 2294 *remaining = current_remaining; 2295 } else { 2296 bdev_io->u.bdev.split_outstanding--; 2297 if (rc == -ENOMEM) { 2298 if (bdev_io->u.bdev.split_outstanding == 0) { 2299 /* No I/O is outstanding. Hence we should wait here. */ 2300 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2301 } 2302 } else { 2303 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2304 if (bdev_io->u.bdev.split_outstanding == 0) { 2305 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2306 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2307 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2308 } 2309 } 2310 } 2311 2312 return rc; 2313 } 2314 2315 static void 2316 _bdev_rw_split(void *_bdev_io) 2317 { 2318 struct iovec *parent_iov, *iov; 2319 struct spdk_bdev_io *bdev_io = _bdev_io; 2320 struct spdk_bdev *bdev = bdev_io->bdev; 2321 uint64_t parent_offset, current_offset, remaining; 2322 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2323 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2324 uint32_t iovcnt, iov_len, child_iovsize; 2325 uint32_t blocklen = bdev->blocklen; 2326 uint32_t io_boundary = bdev->optimal_io_boundary; 2327 uint32_t max_segment_size = bdev->max_segment_size; 2328 uint32_t max_child_iovcnt = bdev->max_num_segments; 2329 void *md_buf = NULL; 2330 int rc; 2331 2332 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2333 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, BDEV_IO_NUM_CHILD_IOV) : 2334 BDEV_IO_NUM_CHILD_IOV; 2335 io_boundary = bdev->split_on_optimal_io_boundary ? 
io_boundary : UINT32_MAX; 2336 2337 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2338 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2339 parent_offset = bdev_io->u.bdev.offset_blocks; 2340 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2341 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2342 2343 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2344 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2345 if (parent_iov_offset < parent_iov->iov_len) { 2346 break; 2347 } 2348 parent_iov_offset -= parent_iov->iov_len; 2349 } 2350 2351 child_iovcnt = 0; 2352 while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 2353 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2354 to_next_boundary = spdk_min(remaining, to_next_boundary); 2355 to_next_boundary_bytes = to_next_boundary * blocklen; 2356 2357 iov = &bdev_io->child_iov[child_iovcnt]; 2358 iovcnt = 0; 2359 2360 if (bdev_io->u.bdev.md_buf) { 2361 md_buf = (char *)bdev_io->u.bdev.md_buf + 2362 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2363 } 2364 2365 child_iovsize = spdk_min(BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2366 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2367 iovcnt < child_iovsize) { 2368 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2369 iov_len = parent_iov->iov_len - parent_iov_offset; 2370 2371 iov_len = spdk_min(iov_len, max_segment_size); 2372 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2373 to_next_boundary_bytes -= iov_len; 2374 2375 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2376 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2377 2378 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2379 parent_iov_offset += iov_len; 2380 } else { 2381 parent_iovpos++; 2382 parent_iov_offset = 0; 2383 } 2384 child_iovcnt++; 2385 iovcnt++; 2386 } 2387 2388 if (to_next_boundary_bytes > 0) { 2389 /* We had to stop this child I/O early because we ran out of 2390 * child_iov space or were limited by max_num_segments. 2391 * Ensure the iovs are aligned with the block size and 2392 * then adjust to_next_boundary before starting the 2393 * child I/O. 2394 */ 2395 assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV || 2396 iovcnt == child_iovsize); 2397 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2398 if (to_last_block_bytes != 0) { 2399 uint32_t child_iovpos = child_iovcnt - 1; 2400 /* don't decrease child_iovcnt when it equals BDEV_IO_NUM_CHILD_IOV 2401 * so the loop will end naturally 2402 */ 2403 2404 to_last_block_bytes = blocklen - to_last_block_bytes; 2405 to_next_boundary_bytes += to_last_block_bytes; 2406 while (to_last_block_bytes > 0 && iovcnt > 0) { 2407 iov_len = spdk_min(to_last_block_bytes, 2408 bdev_io->child_iov[child_iovpos].iov_len); 2409 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2410 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2411 child_iovpos--; 2412 if (--iovcnt == 0) { 2413 /* If the child IO is less than a block size, just return. 2414 * If the first child IO of any split round is less than 2415 * a block size, exit with an error. 
2416 */ 2417 if (bdev_io->u.bdev.split_outstanding == 0) { 2418 SPDK_ERRLOG("The first child io was less than a block size\n"); 2419 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2420 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2421 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2422 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2423 } 2424 2425 return; 2426 } 2427 } 2428 2429 to_last_block_bytes -= iov_len; 2430 2431 if (parent_iov_offset == 0) { 2432 parent_iovpos--; 2433 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2434 } 2435 parent_iov_offset -= iov_len; 2436 } 2437 2438 assert(to_last_block_bytes == 0); 2439 } 2440 to_next_boundary -= to_next_boundary_bytes / blocklen; 2441 } 2442 2443 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 2444 &current_offset, &remaining); 2445 if (spdk_unlikely(rc)) { 2446 return; 2447 } 2448 } 2449 } 2450 2451 static void 2452 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 2453 { 2454 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 2455 uint32_t num_children_reqs = 0; 2456 int rc; 2457 2458 offset = bdev_io->u.bdev.split_current_offset_blocks; 2459 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2460 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 2461 2462 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2463 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 2464 2465 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 2466 &offset, &remaining); 2467 if (spdk_likely(rc == 0)) { 2468 num_children_reqs++; 2469 } else { 2470 return; 2471 } 2472 } 2473 } 2474 2475 static void 2476 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 2477 { 2478 uint64_t offset, write_zeroes_blocks, remaining; 2479 uint32_t num_children_reqs = 0; 2480 int rc; 2481 2482 offset = bdev_io->u.bdev.split_current_offset_blocks; 2483 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2484 2485 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2486 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 2487 2488 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 2489 &offset, &remaining); 2490 if (spdk_likely(rc == 0)) { 2491 num_children_reqs++; 2492 } else { 2493 return; 2494 } 2495 } 2496 } 2497 2498 static void 2499 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2500 { 2501 struct spdk_bdev_io *parent_io = cb_arg; 2502 2503 spdk_bdev_free_io(bdev_io); 2504 2505 if (!success) { 2506 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2507 /* If any child I/O failed, stop further splitting process. */ 2508 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2509 parent_io->u.bdev.split_remaining_num_blocks = 0; 2510 } 2511 parent_io->u.bdev.split_outstanding--; 2512 if (parent_io->u.bdev.split_outstanding != 0) { 2513 return; 2514 } 2515 2516 /* 2517 * Parent I/O finishes when all blocks are consumed. 
2518 */ 2519 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2520 assert(parent_io->internal.cb != bdev_io_split_done); 2521 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2522 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2523 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2524 parent_io->internal.caller_ctx); 2525 return; 2526 } 2527 2528 /* 2529 * Continue with the splitting process. This function will complete the parent I/O if the 2530 * splitting is done. 2531 */ 2532 switch (parent_io->type) { 2533 case SPDK_BDEV_IO_TYPE_READ: 2534 case SPDK_BDEV_IO_TYPE_WRITE: 2535 _bdev_rw_split(parent_io); 2536 break; 2537 case SPDK_BDEV_IO_TYPE_UNMAP: 2538 bdev_unmap_split(parent_io); 2539 break; 2540 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2541 bdev_write_zeroes_split(parent_io); 2542 break; 2543 default: 2544 assert(false); 2545 break; 2546 } 2547 } 2548 2549 static void 2550 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success); 2551 2552 static void 2553 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2554 { 2555 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2556 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2557 bdev_io->u.bdev.split_outstanding = 0; 2558 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2559 2560 switch (bdev_io->type) { 2561 case SPDK_BDEV_IO_TYPE_READ: 2562 case SPDK_BDEV_IO_TYPE_WRITE: 2563 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2564 _bdev_rw_split(bdev_io); 2565 } else { 2566 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2567 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2568 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2569 } 2570 break; 2571 case SPDK_BDEV_IO_TYPE_UNMAP: 2572 bdev_unmap_split(bdev_io); 2573 break; 2574 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2575 bdev_write_zeroes_split(bdev_io); 2576 break; 2577 default: 2578 assert(false); 2579 break; 2580 } 2581 } 2582 2583 static void 2584 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2585 { 2586 if (!success) { 2587 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2588 return; 2589 } 2590 2591 _bdev_rw_split(bdev_io); 2592 } 2593 2594 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 2595 * be inlined, at least on some compilers. 
2596 */ 2597 static inline void 2598 _bdev_io_submit(void *ctx) 2599 { 2600 struct spdk_bdev_io *bdev_io = ctx; 2601 struct spdk_bdev *bdev = bdev_io->bdev; 2602 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2603 uint64_t tsc; 2604 2605 tsc = spdk_get_ticks(); 2606 bdev_io->internal.submit_tsc = tsc; 2607 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, bdev_io->type, 2608 bdev_io->internal.caller_ctx, bdev_io->u.bdev.offset_blocks, 2609 bdev_io->u.bdev.num_blocks); 2610 2611 if (spdk_likely(bdev_ch->flags == 0)) { 2612 bdev_io_do_submit(bdev_ch, bdev_io); 2613 return; 2614 } 2615 2616 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2617 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2618 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2619 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2620 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2621 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2622 } else { 2623 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2624 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2625 } 2626 } else { 2627 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2628 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2629 } 2630 } 2631 2632 bool 2633 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2634 2635 bool 2636 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2637 { 2638 if (range1->length == 0 || range2->length == 0) { 2639 return false; 2640 } 2641 2642 if (range1->offset + range1->length <= range2->offset) { 2643 return false; 2644 } 2645 2646 if (range2->offset + range2->length <= range1->offset) { 2647 return false; 2648 } 2649 2650 return true; 2651 } 2652 2653 static bool 2654 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2655 { 2656 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2657 struct lba_range r; 2658 2659 switch (bdev_io->type) { 2660 case SPDK_BDEV_IO_TYPE_NVME_IO: 2661 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2662 /* Don't try to decode the NVMe command - just assume worst-case and that 2663 * it overlaps a locked range. 2664 */ 2665 return true; 2666 case SPDK_BDEV_IO_TYPE_WRITE: 2667 case SPDK_BDEV_IO_TYPE_UNMAP: 2668 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2669 case SPDK_BDEV_IO_TYPE_ZCOPY: 2670 r.offset = bdev_io->u.bdev.offset_blocks; 2671 r.length = bdev_io->u.bdev.num_blocks; 2672 if (!bdev_lba_range_overlapped(range, &r)) { 2673 /* This I/O doesn't overlap the specified LBA range. */ 2674 return false; 2675 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2676 /* This I/O overlaps, but the I/O is on the same channel that locked this 2677 * range, and the caller_ctx is the same as the locked_ctx. This means 2678 * that this I/O is associated with the lock, and is allowed to execute. 
2679 */ 2680 return false; 2681 } else { 2682 return true; 2683 } 2684 default: 2685 return false; 2686 } 2687 } 2688 2689 void 2690 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2691 { 2692 struct spdk_bdev *bdev = bdev_io->bdev; 2693 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2694 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2695 2696 assert(thread != NULL); 2697 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2698 2699 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2700 struct lba_range *range; 2701 2702 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2703 if (bdev_io_range_is_locked(bdev_io, range)) { 2704 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2705 return; 2706 } 2707 } 2708 } 2709 2710 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2711 2712 if (bdev_io_should_split(bdev_io)) { 2713 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2714 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2715 (uintptr_t)bdev_io, bdev_io->type, bdev_io->internal.caller_ctx, 2716 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks); 2717 bdev_io_split(NULL, bdev_io); 2718 return; 2719 } 2720 2721 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2722 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2723 _bdev_io_submit(bdev_io); 2724 } else { 2725 bdev_io->internal.io_submit_ch = ch; 2726 bdev_io->internal.ch = bdev->internal.qos->ch; 2727 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2728 } 2729 } else { 2730 _bdev_io_submit(bdev_io); 2731 } 2732 } 2733 2734 static void 2735 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 2736 { 2737 struct spdk_bdev *bdev = bdev_io->bdev; 2738 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2739 struct spdk_io_channel *ch = bdev_ch->channel; 2740 2741 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2742 2743 bdev_io->internal.in_submit_request = true; 2744 bdev->fn_table->submit_request(ch, bdev_io); 2745 bdev_io->internal.in_submit_request = false; 2746 } 2747 2748 void 2749 bdev_io_init(struct spdk_bdev_io *bdev_io, 2750 struct spdk_bdev *bdev, void *cb_arg, 2751 spdk_bdev_io_completion_cb cb) 2752 { 2753 bdev_io->bdev = bdev; 2754 bdev_io->internal.caller_ctx = cb_arg; 2755 bdev_io->internal.cb = cb; 2756 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 2757 bdev_io->internal.in_submit_request = false; 2758 bdev_io->internal.buf = NULL; 2759 bdev_io->internal.io_submit_ch = NULL; 2760 bdev_io->internal.orig_iovs = NULL; 2761 bdev_io->internal.orig_iovcnt = 0; 2762 bdev_io->internal.orig_md_buf = NULL; 2763 bdev_io->internal.error.nvme.cdw0 = 0; 2764 bdev_io->num_retries = 0; 2765 bdev_io->internal.get_buf_cb = NULL; 2766 bdev_io->internal.get_aux_buf_cb = NULL; 2767 bdev_io->internal.ext_opts = NULL; 2768 } 2769 2770 static bool 2771 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2772 { 2773 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 2774 } 2775 2776 bool 2777 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2778 { 2779 bool supported; 2780 2781 supported = bdev_io_type_supported(bdev, io_type); 2782 2783 if (!supported) { 2784 switch (io_type) { 2785 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2786 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ 2787 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 2788 break; 2789 default: 2790 break; 2791 } 2792 } 2793 2794 return supported; 2795 } 2796 2797 int 2798 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2799 { 2800 if (bdev->fn_table->dump_info_json) { 2801 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 2802 } 2803 2804 return 0; 2805 } 2806 2807 static void 2808 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 2809 { 2810 uint32_t max_per_timeslice = 0; 2811 int i; 2812 2813 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2814 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2815 qos->rate_limits[i].max_per_timeslice = 0; 2816 continue; 2817 } 2818 2819 max_per_timeslice = qos->rate_limits[i].limit * 2820 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 2821 2822 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 2823 qos->rate_limits[i].min_per_timeslice); 2824 2825 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 2826 } 2827 2828 bdev_qos_set_ops(qos); 2829 } 2830 2831 static int 2832 bdev_channel_poll_qos(void *arg) 2833 { 2834 struct spdk_bdev_qos *qos = arg; 2835 uint64_t now = spdk_get_ticks(); 2836 int i; 2837 2838 if (now < (qos->last_timeslice + qos->timeslice_size)) { 2839 /* We received our callback earlier than expected - return 2840 * immediately and wait to do accounting until at least one 2841 * timeslice has actually expired. This should never happen 2842 * with a well-behaved timer implementation. 2843 */ 2844 return SPDK_POLLER_IDLE; 2845 } 2846 2847 /* Reset for next round of rate limiting */ 2848 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2849 /* We may have allowed the IOs or bytes to slightly overrun in the last 2850 * timeslice. remaining_this_timeslice is signed, so if it's negative 2851 * here, we'll account for the overrun so that the next timeslice will 2852 * be appropriately reduced. 2853 */ 2854 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 2855 qos->rate_limits[i].remaining_this_timeslice = 0; 2856 } 2857 } 2858 2859 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 2860 qos->last_timeslice += qos->timeslice_size; 2861 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2862 qos->rate_limits[i].remaining_this_timeslice += 2863 qos->rate_limits[i].max_per_timeslice; 2864 } 2865 } 2866 2867 return bdev_qos_io_submit(qos->ch, qos); 2868 } 2869 2870 static void 2871 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 2872 { 2873 struct spdk_bdev_shared_resource *shared_resource; 2874 struct lba_range *range; 2875 2876 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 2877 range = TAILQ_FIRST(&ch->locked_ranges); 2878 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 2879 free(range); 2880 } 2881 2882 spdk_put_io_channel(ch->channel); 2883 2884 shared_resource = ch->shared_resource; 2885 2886 assert(TAILQ_EMPTY(&ch->io_locked)); 2887 assert(TAILQ_EMPTY(&ch->io_submitted)); 2888 assert(ch->io_outstanding == 0); 2889 assert(shared_resource->ref > 0); 2890 shared_resource->ref--; 2891 if (shared_resource->ref == 0) { 2892 assert(shared_resource->io_outstanding == 0); 2893 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 2894 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 2895 free(shared_resource); 2896 } 2897 } 2898 2899 /* Caller must hold bdev->internal.mutex. 
*/ 2900 static void 2901 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 2902 { 2903 struct spdk_bdev_qos *qos = bdev->internal.qos; 2904 int i; 2905 2906 /* Rate limiting on this bdev enabled */ 2907 if (qos) { 2908 if (qos->ch == NULL) { 2909 struct spdk_io_channel *io_ch; 2910 2911 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 2912 bdev->name, spdk_get_thread()); 2913 2914 /* No qos channel has been selected, so set one up */ 2915 2916 /* Take another reference to ch */ 2917 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 2918 assert(io_ch != NULL); 2919 qos->ch = ch; 2920 2921 qos->thread = spdk_io_channel_get_thread(io_ch); 2922 2923 TAILQ_INIT(&qos->queued); 2924 2925 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2926 if (bdev_qos_is_iops_rate_limit(i) == true) { 2927 qos->rate_limits[i].min_per_timeslice = 2928 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 2929 } else { 2930 qos->rate_limits[i].min_per_timeslice = 2931 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 2932 } 2933 2934 if (qos->rate_limits[i].limit == 0) { 2935 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 2936 } 2937 } 2938 bdev_qos_update_max_quota_per_timeslice(qos); 2939 qos->timeslice_size = 2940 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 2941 qos->last_timeslice = spdk_get_ticks(); 2942 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 2943 qos, 2944 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 2945 } 2946 2947 ch->flags |= BDEV_CH_QOS_ENABLED; 2948 } 2949 } 2950 2951 struct poll_timeout_ctx { 2952 struct spdk_bdev_desc *desc; 2953 uint64_t timeout_in_sec; 2954 spdk_bdev_io_timeout_cb cb_fn; 2955 void *cb_arg; 2956 }; 2957 2958 static void 2959 bdev_desc_free(struct spdk_bdev_desc *desc) 2960 { 2961 pthread_mutex_destroy(&desc->mutex); 2962 free(desc->media_events_buffer); 2963 free(desc); 2964 } 2965 2966 static void 2967 bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status) 2968 { 2969 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 2970 struct spdk_bdev_desc *desc = ctx->desc; 2971 2972 free(ctx); 2973 2974 pthread_mutex_lock(&desc->mutex); 2975 desc->refs--; 2976 if (desc->closed == true && desc->refs == 0) { 2977 pthread_mutex_unlock(&desc->mutex); 2978 bdev_desc_free(desc); 2979 return; 2980 } 2981 pthread_mutex_unlock(&desc->mutex); 2982 } 2983 2984 static void 2985 bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i) 2986 { 2987 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 2988 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 2989 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 2990 struct spdk_bdev_desc *desc = ctx->desc; 2991 struct spdk_bdev_io *bdev_io; 2992 uint64_t now; 2993 2994 pthread_mutex_lock(&desc->mutex); 2995 if (desc->closed == true) { 2996 pthread_mutex_unlock(&desc->mutex); 2997 spdk_for_each_channel_continue(i, -1); 2998 return; 2999 } 3000 pthread_mutex_unlock(&desc->mutex); 3001 3002 now = spdk_get_ticks(); 3003 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3004 /* Exclude any I/O that are generated via splitting. */ 3005 if (bdev_io->internal.cb == bdev_io_split_done) { 3006 continue; 3007 } 3008 3009 /* Once we find an I/O that has not timed out, we can immediately 3010 * exit the loop. 
3011 */ 3012 if (now < (bdev_io->internal.submit_tsc + 3013 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3014 goto end; 3015 } 3016 3017 if (bdev_io->internal.desc == desc) { 3018 ctx->cb_fn(ctx->cb_arg, bdev_io); 3019 } 3020 } 3021 3022 end: 3023 spdk_for_each_channel_continue(i, 0); 3024 } 3025 3026 static int 3027 bdev_poll_timeout_io(void *arg) 3028 { 3029 struct spdk_bdev_desc *desc = arg; 3030 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3031 struct poll_timeout_ctx *ctx; 3032 3033 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3034 if (!ctx) { 3035 SPDK_ERRLOG("failed to allocate memory\n"); 3036 return SPDK_POLLER_BUSY; 3037 } 3038 ctx->desc = desc; 3039 ctx->cb_arg = desc->cb_arg; 3040 ctx->cb_fn = desc->cb_fn; 3041 ctx->timeout_in_sec = desc->timeout_in_sec; 3042 3043 /* Take a ref on the descriptor in case it gets closed while we are checking 3044 * all of the channels. 3045 */ 3046 pthread_mutex_lock(&desc->mutex); 3047 desc->refs++; 3048 pthread_mutex_unlock(&desc->mutex); 3049 3050 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3051 bdev_channel_poll_timeout_io, 3052 ctx, 3053 bdev_channel_poll_timeout_io_done); 3054 3055 return SPDK_POLLER_BUSY; 3056 } 3057 3058 int 3059 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3060 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3061 { 3062 assert(desc->thread == spdk_get_thread()); 3063 3064 spdk_poller_unregister(&desc->io_timeout_poller); 3065 3066 if (timeout_in_sec) { 3067 assert(cb_fn != NULL); 3068 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3069 desc, 3070 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3071 1000); 3072 if (desc->io_timeout_poller == NULL) { 3073 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3074 return -1; 3075 } 3076 } 3077 3078 desc->cb_fn = cb_fn; 3079 desc->cb_arg = cb_arg; 3080 desc->timeout_in_sec = timeout_in_sec; 3081 3082 return 0; 3083 } 3084 3085 static int 3086 bdev_channel_create(void *io_device, void *ctx_buf) 3087 { 3088 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3089 struct spdk_bdev_channel *ch = ctx_buf; 3090 struct spdk_io_channel *mgmt_io_ch; 3091 struct spdk_bdev_mgmt_channel *mgmt_ch; 3092 struct spdk_bdev_shared_resource *shared_resource; 3093 struct lba_range *range; 3094 3095 ch->bdev = bdev; 3096 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3097 if (!ch->channel) { 3098 return -1; 3099 } 3100 3101 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3102 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3103 3104 assert(ch->histogram == NULL); 3105 if (bdev->internal.histogram_enabled) { 3106 ch->histogram = spdk_histogram_data_alloc(); 3107 if (ch->histogram == NULL) { 3108 SPDK_ERRLOG("Could not allocate histogram\n"); 3109 } 3110 } 3111 3112 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3113 if (!mgmt_io_ch) { 3114 spdk_put_io_channel(ch->channel); 3115 return -1; 3116 } 3117 3118 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 3119 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3120 if (shared_resource->shared_ch == ch->channel) { 3121 spdk_put_io_channel(mgmt_io_ch); 3122 shared_resource->ref++; 3123 break; 3124 } 3125 } 3126 3127 if (shared_resource == NULL) { 3128 shared_resource = calloc(1, sizeof(*shared_resource)); 3129 if (shared_resource == NULL) { 3130 spdk_put_io_channel(ch->channel); 3131 spdk_put_io_channel(mgmt_io_ch); 3132 return -1; 3133 } 3134 3135 shared_resource->mgmt_ch = mgmt_ch; 3136 
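/* No existing shared resource matched this channel's underlying I/O channel,
 * so finish initializing the newly allocated one: no outstanding I/O yet, an
 * empty nomem_io retry queue, and a single reference held by this channel.
 */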
shared_resource->io_outstanding = 0; 3137 TAILQ_INIT(&shared_resource->nomem_io); 3138 shared_resource->nomem_threshold = 0; 3139 shared_resource->shared_ch = ch->channel; 3140 shared_resource->ref = 1; 3141 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3142 } 3143 3144 memset(&ch->stat, 0, sizeof(ch->stat)); 3145 ch->stat.ticks_rate = spdk_get_ticks_hz(); 3146 ch->io_outstanding = 0; 3147 TAILQ_INIT(&ch->queued_resets); 3148 TAILQ_INIT(&ch->locked_ranges); 3149 ch->flags = 0; 3150 ch->shared_resource = shared_resource; 3151 3152 TAILQ_INIT(&ch->io_submitted); 3153 TAILQ_INIT(&ch->io_locked); 3154 3155 #ifdef SPDK_CONFIG_VTUNE 3156 { 3157 char *name; 3158 __itt_init_ittlib(NULL, 0); 3159 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3160 if (!name) { 3161 bdev_channel_destroy_resource(ch); 3162 return -1; 3163 } 3164 ch->handle = __itt_string_handle_create(name); 3165 free(name); 3166 ch->start_tsc = spdk_get_ticks(); 3167 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3168 memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); 3169 } 3170 #endif 3171 3172 pthread_mutex_lock(&bdev->internal.mutex); 3173 bdev_enable_qos(bdev, ch); 3174 3175 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3176 struct lba_range *new_range; 3177 3178 new_range = calloc(1, sizeof(*new_range)); 3179 if (new_range == NULL) { 3180 pthread_mutex_unlock(&bdev->internal.mutex); 3181 bdev_channel_destroy_resource(ch); 3182 return -1; 3183 } 3184 new_range->length = range->length; 3185 new_range->offset = range->offset; 3186 new_range->locked_ctx = range->locked_ctx; 3187 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3188 } 3189 3190 pthread_mutex_unlock(&bdev->internal.mutex); 3191 3192 return 0; 3193 } 3194 3195 /* 3196 * Abort I/O that are waiting on a data buffer. These types of I/O are 3197 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 3198 */ 3199 static void 3200 bdev_abort_all_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 3201 { 3202 bdev_io_stailq_t tmp; 3203 struct spdk_bdev_io *bdev_io; 3204 3205 STAILQ_INIT(&tmp); 3206 3207 while (!STAILQ_EMPTY(queue)) { 3208 bdev_io = STAILQ_FIRST(queue); 3209 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 3210 if (bdev_io->internal.ch == ch) { 3211 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3212 } else { 3213 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 3214 } 3215 } 3216 3217 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 3218 } 3219 3220 /* 3221 * Abort I/O that are queued waiting for submission. These types of I/O are 3222 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3223 */ 3224 static void 3225 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3226 { 3227 struct spdk_bdev_io *bdev_io, *tmp; 3228 3229 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3230 if (bdev_io->internal.ch == ch) { 3231 TAILQ_REMOVE(queue, bdev_io, internal.link); 3232 /* 3233 * spdk_bdev_io_complete() assumes that the completed I/O had 3234 * been submitted to the bdev module. Since in this case it 3235 * hadn't, bump io_outstanding to account for the decrement 3236 * that spdk_bdev_io_complete() will do. 
3237 */ 3238 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3239 ch->io_outstanding++; 3240 ch->shared_resource->io_outstanding++; 3241 } 3242 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3243 } 3244 } 3245 } 3246 3247 static bool 3248 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3249 { 3250 struct spdk_bdev_io *bdev_io; 3251 3252 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3253 if (bdev_io == bio_to_abort) { 3254 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3255 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3256 return true; 3257 } 3258 } 3259 3260 return false; 3261 } 3262 3263 static bool 3264 bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3265 { 3266 struct spdk_bdev_io *bdev_io; 3267 3268 STAILQ_FOREACH(bdev_io, queue, internal.buf_link) { 3269 if (bdev_io == bio_to_abort) { 3270 STAILQ_REMOVE(queue, bio_to_abort, spdk_bdev_io, internal.buf_link); 3271 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3272 return true; 3273 } 3274 } 3275 3276 return false; 3277 } 3278 3279 static void 3280 bdev_qos_channel_destroy(void *cb_arg) 3281 { 3282 struct spdk_bdev_qos *qos = cb_arg; 3283 3284 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3285 spdk_poller_unregister(&qos->poller); 3286 3287 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3288 3289 free(qos); 3290 } 3291 3292 static int 3293 bdev_qos_destroy(struct spdk_bdev *bdev) 3294 { 3295 int i; 3296 3297 /* 3298 * Cleanly shutting down the QoS poller is tricky, because 3299 * during the asynchronous operation the user could open 3300 * a new descriptor and create a new channel, spawning 3301 * a new QoS poller. 3302 * 3303 * The strategy is to create a new QoS structure here and swap it 3304 * in. The shutdown path then continues to refer to the old one 3305 * until it completes and then releases it. 3306 */ 3307 struct spdk_bdev_qos *new_qos, *old_qos; 3308 3309 old_qos = bdev->internal.qos; 3310 3311 new_qos = calloc(1, sizeof(*new_qos)); 3312 if (!new_qos) { 3313 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3314 return -ENOMEM; 3315 } 3316 3317 /* Copy the old QoS data into the newly allocated structure */ 3318 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3319 3320 /* Zero out the key parts of the QoS structure */ 3321 new_qos->ch = NULL; 3322 new_qos->thread = NULL; 3323 new_qos->poller = NULL; 3324 TAILQ_INIT(&new_qos->queued); 3325 /* 3326 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3327 * It will be used later for the new QoS structure. 3328 */ 3329 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3330 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3331 new_qos->rate_limits[i].min_per_timeslice = 0; 3332 new_qos->rate_limits[i].max_per_timeslice = 0; 3333 } 3334 3335 bdev->internal.qos = new_qos; 3336 3337 if (old_qos->thread == NULL) { 3338 free(old_qos); 3339 } else { 3340 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3341 } 3342 3343 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3344 * been destroyed yet. The destruction path will end up waiting for the final 3345 * channel to be put before it releases resources. 
*/ 3346 3347 return 0; 3348 } 3349 3350 static void 3351 bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3352 { 3353 total->bytes_read += add->bytes_read; 3354 total->num_read_ops += add->num_read_ops; 3355 total->bytes_written += add->bytes_written; 3356 total->num_write_ops += add->num_write_ops; 3357 total->bytes_unmapped += add->bytes_unmapped; 3358 total->num_unmap_ops += add->num_unmap_ops; 3359 total->read_latency_ticks += add->read_latency_ticks; 3360 total->write_latency_ticks += add->write_latency_ticks; 3361 total->unmap_latency_ticks += add->unmap_latency_ticks; 3362 } 3363 3364 static void 3365 bdev_channel_destroy(void *io_device, void *ctx_buf) 3366 { 3367 struct spdk_bdev_channel *ch = ctx_buf; 3368 struct spdk_bdev_mgmt_channel *mgmt_ch; 3369 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3370 3371 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3372 spdk_get_thread()); 3373 3374 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 3375 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3376 3377 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3378 pthread_mutex_lock(&ch->bdev->internal.mutex); 3379 bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); 3380 pthread_mutex_unlock(&ch->bdev->internal.mutex); 3381 3382 mgmt_ch = shared_resource->mgmt_ch; 3383 3384 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3385 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3386 bdev_abort_all_buf_io(&mgmt_ch->need_buf_small, ch); 3387 bdev_abort_all_buf_io(&mgmt_ch->need_buf_large, ch); 3388 3389 if (ch->histogram) { 3390 spdk_histogram_data_free(ch->histogram); 3391 } 3392 3393 bdev_channel_destroy_resource(ch); 3394 } 3395 3396 /* 3397 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 3398 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
3399 */ 3400 static int 3401 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3402 { 3403 struct spdk_bdev_name *tmp; 3404 3405 bdev_name->name = strdup(name); 3406 if (bdev_name->name == NULL) { 3407 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3408 return -ENOMEM; 3409 } 3410 3411 bdev_name->bdev = bdev; 3412 3413 pthread_mutex_lock(&g_bdev_mgr.mutex); 3414 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3415 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3416 3417 if (tmp != NULL) { 3418 SPDK_ERRLOG("Bdev name %s already exists\n", name); 3419 free(bdev_name->name); 3420 return -EEXIST; 3421 } 3422 3423 return 0; 3424 } 3425 3426 static void 3427 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 3428 { 3429 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3430 free(bdev_name->name); 3431 } 3432 3433 static void 3434 bdev_name_del(struct spdk_bdev_name *bdev_name) 3435 { 3436 pthread_mutex_lock(&g_bdev_mgr.mutex); 3437 bdev_name_del_unsafe(bdev_name); 3438 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3439 } 3440 3441 int 3442 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 3443 { 3444 struct spdk_bdev_alias *tmp; 3445 int ret; 3446 3447 if (alias == NULL) { 3448 SPDK_ERRLOG("Empty alias passed\n"); 3449 return -EINVAL; 3450 } 3451 3452 tmp = calloc(1, sizeof(*tmp)); 3453 if (tmp == NULL) { 3454 SPDK_ERRLOG("Unable to allocate alias\n"); 3455 return -ENOMEM; 3456 } 3457 3458 ret = bdev_name_add(&tmp->alias, bdev, alias); 3459 if (ret != 0) { 3460 free(tmp); 3461 return ret; 3462 } 3463 3464 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 3465 3466 return 0; 3467 } 3468 3469 static int 3470 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 3471 void (*alias_del_fn)(struct spdk_bdev_name *n)) 3472 { 3473 struct spdk_bdev_alias *tmp; 3474 3475 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 3476 if (strcmp(alias, tmp->alias.name) == 0) { 3477 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 3478 alias_del_fn(&tmp->alias); 3479 free(tmp); 3480 return 0; 3481 } 3482 } 3483 3484 return -ENOENT; 3485 } 3486 3487 int 3488 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 3489 { 3490 int rc; 3491 3492 rc = bdev_alias_del(bdev, alias, bdev_name_del); 3493 if (rc == -ENOENT) { 3494 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 3495 } 3496 3497 return rc; 3498 } 3499 3500 void 3501 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 3502 { 3503 struct spdk_bdev_alias *p, *tmp; 3504 3505 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 3506 TAILQ_REMOVE(&bdev->aliases, p, tailq); 3507 bdev_name_del(&p->alias); 3508 free(p); 3509 } 3510 } 3511 3512 struct spdk_io_channel * 3513 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 3514 { 3515 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 3516 } 3517 3518 void * 3519 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 3520 { 3521 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3522 void *ctx = NULL; 3523 3524 if (bdev->fn_table->get_module_ctx) { 3525 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 3526 } 3527 3528 return ctx; 3529 } 3530 3531 const char * 3532 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 3533 { 3534 return bdev->module->name; 3535 } 3536 3537 const char * 3538 spdk_bdev_get_name(const struct spdk_bdev *bdev) 3539 { 3540 return bdev->name; 3541 } 3542 3543 const char * 3544 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 3545 { 3546 return bdev->product_name; 3547 } 3548 
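/*
 * Illustrative usage sketch (added for documentation; not part of the
 * implementation in this file): after opening a descriptor, a caller gets a
 * per-thread I/O channel with spdk_bdev_get_io_channel() above and submits
 * I/O on it, e.g.
 *
 *	struct spdk_io_channel *io_ch = spdk_bdev_get_io_channel(desc);
 *
 *	rc = spdk_bdev_read_blocks(desc, io_ch, buf, 0, 1, read_done_cb, cb_arg);
 *
 * and releases it with spdk_put_io_channel(io_ch) once the I/O submitted on
 * it has completed. Here desc, buf, read_done_cb and cb_arg are assumed to be
 * provided by the caller.
 */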
3549 const struct spdk_bdev_aliases_list * 3550 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 3551 { 3552 return &bdev->aliases; 3553 } 3554 3555 uint32_t 3556 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 3557 { 3558 return bdev->blocklen; 3559 } 3560 3561 uint32_t 3562 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 3563 { 3564 return bdev->write_unit_size; 3565 } 3566 3567 uint64_t 3568 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 3569 { 3570 return bdev->blockcnt; 3571 } 3572 3573 const char * 3574 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 3575 { 3576 return qos_rpc_type[type]; 3577 } 3578 3579 void 3580 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 3581 { 3582 int i; 3583 3584 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 3585 3586 pthread_mutex_lock(&bdev->internal.mutex); 3587 if (bdev->internal.qos) { 3588 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3589 if (bdev->internal.qos->rate_limits[i].limit != 3590 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3591 limits[i] = bdev->internal.qos->rate_limits[i].limit; 3592 if (bdev_qos_is_iops_rate_limit(i) == false) { 3593 /* Change from Byte to Megabyte which is user visible. */ 3594 limits[i] = limits[i] / 1024 / 1024; 3595 } 3596 } 3597 } 3598 } 3599 pthread_mutex_unlock(&bdev->internal.mutex); 3600 } 3601 3602 size_t 3603 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 3604 { 3605 return 1 << bdev->required_alignment; 3606 } 3607 3608 uint32_t 3609 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 3610 { 3611 return bdev->optimal_io_boundary; 3612 } 3613 3614 bool 3615 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 3616 { 3617 return bdev->write_cache; 3618 } 3619 3620 const struct spdk_uuid * 3621 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 3622 { 3623 return &bdev->uuid; 3624 } 3625 3626 uint16_t 3627 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 3628 { 3629 return bdev->acwu; 3630 } 3631 3632 uint32_t 3633 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 3634 { 3635 return bdev->md_len; 3636 } 3637 3638 bool 3639 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 3640 { 3641 return (bdev->md_len != 0) && bdev->md_interleave; 3642 } 3643 3644 bool 3645 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 3646 { 3647 return (bdev->md_len != 0) && !bdev->md_interleave; 3648 } 3649 3650 bool 3651 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 3652 { 3653 return bdev->zoned; 3654 } 3655 3656 uint32_t 3657 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 3658 { 3659 if (spdk_bdev_is_md_interleaved(bdev)) { 3660 return bdev->blocklen - bdev->md_len; 3661 } else { 3662 return bdev->blocklen; 3663 } 3664 } 3665 3666 uint32_t 3667 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 3668 { 3669 return bdev->phys_blocklen; 3670 } 3671 3672 static uint32_t 3673 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 3674 { 3675 if (!spdk_bdev_is_md_interleaved(bdev)) { 3676 return bdev->blocklen + bdev->md_len; 3677 } else { 3678 return bdev->blocklen; 3679 } 3680 } 3681 3682 enum spdk_dif_type spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 3683 { 3684 if (bdev->md_len != 0) { 3685 return bdev->dif_type; 3686 } else { 3687 return SPDK_DIF_DISABLE; 3688 } 3689 } 3690 3691 bool 3692 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 3693 { 3694 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 3695 return bdev->dif_is_head_of_md; 3696 
} else { 3697 return false; 3698 } 3699 } 3700 3701 bool 3702 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 3703 enum spdk_dif_check_type check_type) 3704 { 3705 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 3706 return false; 3707 } 3708 3709 switch (check_type) { 3710 case SPDK_DIF_CHECK_TYPE_REFTAG: 3711 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 3712 case SPDK_DIF_CHECK_TYPE_APPTAG: 3713 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 3714 case SPDK_DIF_CHECK_TYPE_GUARD: 3715 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 3716 default: 3717 return false; 3718 } 3719 } 3720 3721 uint64_t 3722 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 3723 { 3724 return bdev->internal.measured_queue_depth; 3725 } 3726 3727 uint64_t 3728 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 3729 { 3730 return bdev->internal.period; 3731 } 3732 3733 uint64_t 3734 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 3735 { 3736 return bdev->internal.weighted_io_time; 3737 } 3738 3739 uint64_t 3740 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 3741 { 3742 return bdev->internal.io_time; 3743 } 3744 3745 static void 3746 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 3747 { 3748 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3749 3750 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 3751 3752 if (bdev->internal.measured_queue_depth) { 3753 bdev->internal.io_time += bdev->internal.period; 3754 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 3755 } 3756 } 3757 3758 static void 3759 _calculate_measured_qd(struct spdk_io_channel_iter *i) 3760 { 3761 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3762 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 3763 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 3764 3765 bdev->internal.temporary_queue_depth += ch->io_outstanding; 3766 spdk_for_each_channel_continue(i, 0); 3767 } 3768 3769 static int 3770 bdev_calculate_measured_queue_depth(void *ctx) 3771 { 3772 struct spdk_bdev *bdev = ctx; 3773 bdev->internal.temporary_queue_depth = 0; 3774 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 3775 _calculate_measured_qd_cpl); 3776 return SPDK_POLLER_BUSY; 3777 } 3778 3779 void 3780 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 3781 { 3782 bdev->internal.period = period; 3783 3784 if (bdev->internal.qd_poller != NULL) { 3785 spdk_poller_unregister(&bdev->internal.qd_poller); 3786 bdev->internal.measured_queue_depth = UINT64_MAX; 3787 } 3788 3789 if (period != 0) { 3790 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, bdev, 3791 period); 3792 } 3793 } 3794 3795 static void 3796 _resize_notify(void *arg) 3797 { 3798 struct spdk_bdev_desc *desc = arg; 3799 3800 pthread_mutex_lock(&desc->mutex); 3801 desc->refs--; 3802 if (!desc->closed) { 3803 pthread_mutex_unlock(&desc->mutex); 3804 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 3805 desc->bdev, 3806 desc->callback.ctx); 3807 return; 3808 } else if (0 == desc->refs) { 3809 /* This descriptor was closed after this resize_notify message was sent. 3810 * spdk_bdev_close() could not free the descriptor since this message was 3811 * in flight, so we free it now using bdev_desc_free(). 
3812 */ 3813 pthread_mutex_unlock(&desc->mutex); 3814 bdev_desc_free(desc); 3815 return; 3816 } 3817 pthread_mutex_unlock(&desc->mutex); 3818 } 3819 3820 int 3821 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 3822 { 3823 struct spdk_bdev_desc *desc; 3824 int ret; 3825 3826 if (size == bdev->blockcnt) { 3827 return 0; 3828 } 3829 3830 pthread_mutex_lock(&bdev->internal.mutex); 3831 3832 /* bdev has open descriptors */ 3833 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 3834 bdev->blockcnt > size) { 3835 ret = -EBUSY; 3836 } else { 3837 bdev->blockcnt = size; 3838 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 3839 pthread_mutex_lock(&desc->mutex); 3840 if (!desc->closed) { 3841 desc->refs++; 3842 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 3843 } 3844 pthread_mutex_unlock(&desc->mutex); 3845 } 3846 ret = 0; 3847 } 3848 3849 pthread_mutex_unlock(&bdev->internal.mutex); 3850 3851 return ret; 3852 } 3853 3854 /* 3855 * Convert I/O offset and length from bytes to blocks. 3856 * 3857 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 3858 */ 3859 static uint64_t 3860 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 3861 uint64_t num_bytes, uint64_t *num_blocks) 3862 { 3863 uint32_t block_size = bdev->blocklen; 3864 uint8_t shift_cnt; 3865 3866 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 3867 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 3868 shift_cnt = spdk_u32log2(block_size); 3869 *offset_blocks = offset_bytes >> shift_cnt; 3870 *num_blocks = num_bytes >> shift_cnt; 3871 return (offset_bytes - (*offset_blocks << shift_cnt)) | 3872 (num_bytes - (*num_blocks << shift_cnt)); 3873 } else { 3874 *offset_blocks = offset_bytes / block_size; 3875 *num_blocks = num_bytes / block_size; 3876 return (offset_bytes % block_size) | (num_bytes % block_size); 3877 } 3878 } 3879 3880 static bool 3881 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 3882 { 3883 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 3884 * has been an overflow and hence the offset has been wrapped around */ 3885 if (offset_blocks + num_blocks < offset_blocks) { 3886 return false; 3887 } 3888 3889 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 3890 if (offset_blocks + num_blocks > bdev->blockcnt) { 3891 return false; 3892 } 3893 3894 return true; 3895 } 3896 3897 static bool 3898 _bdev_io_check_md_buf(const struct iovec *iovs, const void *md_buf) 3899 { 3900 return _is_buf_allocated(iovs) == (md_buf != NULL); 3901 } 3902 3903 static int 3904 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 3905 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3906 spdk_bdev_io_completion_cb cb, void *cb_arg) 3907 { 3908 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3909 struct spdk_bdev_io *bdev_io; 3910 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3911 3912 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3913 return -EINVAL; 3914 } 3915 3916 bdev_io = bdev_channel_get_io(channel); 3917 if (!bdev_io) { 3918 return -ENOMEM; 3919 } 3920 3921 bdev_io->internal.ch = channel; 3922 bdev_io->internal.desc = desc; 3923 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 3924 bdev_io->u.bdev.iovs = &bdev_io->iov; 3925 bdev_io->u.bdev.iovs[0].iov_base = buf; 3926 
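/* The flat read buffer is presented to the rest of the stack as a single-element iovec; the length set next is in bytes. */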
bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 3927 bdev_io->u.bdev.iovcnt = 1; 3928 bdev_io->u.bdev.md_buf = md_buf; 3929 bdev_io->u.bdev.num_blocks = num_blocks; 3930 bdev_io->u.bdev.offset_blocks = offset_blocks; 3931 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3932 3933 bdev_io_submit(bdev_io); 3934 return 0; 3935 } 3936 3937 int 3938 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3939 void *buf, uint64_t offset, uint64_t nbytes, 3940 spdk_bdev_io_completion_cb cb, void *cb_arg) 3941 { 3942 uint64_t offset_blocks, num_blocks; 3943 3944 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3945 nbytes, &num_blocks) != 0) { 3946 return -EINVAL; 3947 } 3948 3949 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 3950 } 3951 3952 int 3953 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3954 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 3955 spdk_bdev_io_completion_cb cb, void *cb_arg) 3956 { 3957 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 3958 } 3959 3960 int 3961 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3962 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3963 spdk_bdev_io_completion_cb cb, void *cb_arg) 3964 { 3965 struct iovec iov = { 3966 .iov_base = buf, 3967 }; 3968 3969 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3970 return -EINVAL; 3971 } 3972 3973 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 3974 return -EINVAL; 3975 } 3976 3977 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 3978 cb, cb_arg); 3979 } 3980 3981 int 3982 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3983 struct iovec *iov, int iovcnt, 3984 uint64_t offset, uint64_t nbytes, 3985 spdk_bdev_io_completion_cb cb, void *cb_arg) 3986 { 3987 uint64_t offset_blocks, num_blocks; 3988 3989 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3990 nbytes, &num_blocks) != 0) { 3991 return -EINVAL; 3992 } 3993 3994 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 3995 } 3996 3997 static int 3998 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3999 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4000 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 4001 struct spdk_bdev_ext_io_opts *opts) 4002 { 4003 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4004 struct spdk_bdev_io *bdev_io; 4005 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4006 4007 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4008 return -EINVAL; 4009 } 4010 4011 bdev_io = bdev_channel_get_io(channel); 4012 if (!bdev_io) { 4013 return -ENOMEM; 4014 } 4015 4016 bdev_io->internal.ch = channel; 4017 bdev_io->internal.desc = desc; 4018 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4019 bdev_io->u.bdev.iovs = iov; 4020 bdev_io->u.bdev.iovcnt = iovcnt; 4021 bdev_io->u.bdev.md_buf = md_buf; 4022 bdev_io->u.bdev.num_blocks = num_blocks; 4023 bdev_io->u.bdev.offset_blocks = offset_blocks; 4024 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4025 bdev_io->internal.ext_opts = opts; 4026 4027 bdev_io_submit(bdev_io); 4028 return 0; 4029 } 4030 4031 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4032 struct iovec *iov, int iovcnt, 4033 uint64_t 
offset_blocks, uint64_t num_blocks, 4034 spdk_bdev_io_completion_cb cb, void *cb_arg) 4035 { 4036 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4037 num_blocks, cb, cb_arg, NULL); 4038 } 4039 4040 int 4041 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4042 struct iovec *iov, int iovcnt, void *md_buf, 4043 uint64_t offset_blocks, uint64_t num_blocks, 4044 spdk_bdev_io_completion_cb cb, void *cb_arg) 4045 { 4046 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4047 return -EINVAL; 4048 } 4049 4050 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4051 return -EINVAL; 4052 } 4053 4054 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4055 num_blocks, cb, cb_arg, NULL); 4056 } 4057 4058 int 4059 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4060 struct iovec *iov, int iovcnt, 4061 uint64_t offset_blocks, uint64_t num_blocks, 4062 spdk_bdev_io_completion_cb cb, void *cb_arg, 4063 struct spdk_bdev_ext_io_opts *opts) 4064 { 4065 void *md = NULL; 4066 4067 if (opts) { 4068 md = opts->metadata; 4069 } 4070 4071 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4072 return -EINVAL; 4073 } 4074 4075 if (md && !_bdev_io_check_md_buf(iov, md)) { 4076 return -EINVAL; 4077 } 4078 4079 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4080 num_blocks, cb, cb_arg, opts); 4081 } 4082 4083 static int 4084 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4085 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4086 spdk_bdev_io_completion_cb cb, void *cb_arg) 4087 { 4088 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4089 struct spdk_bdev_io *bdev_io; 4090 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4091 4092 if (!desc->write) { 4093 return -EBADF; 4094 } 4095 4096 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4097 return -EINVAL; 4098 } 4099 4100 bdev_io = bdev_channel_get_io(channel); 4101 if (!bdev_io) { 4102 return -ENOMEM; 4103 } 4104 4105 bdev_io->internal.ch = channel; 4106 bdev_io->internal.desc = desc; 4107 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4108 bdev_io->u.bdev.iovs = &bdev_io->iov; 4109 bdev_io->u.bdev.iovs[0].iov_base = buf; 4110 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4111 bdev_io->u.bdev.iovcnt = 1; 4112 bdev_io->u.bdev.md_buf = md_buf; 4113 bdev_io->u.bdev.num_blocks = num_blocks; 4114 bdev_io->u.bdev.offset_blocks = offset_blocks; 4115 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4116 4117 bdev_io_submit(bdev_io); 4118 return 0; 4119 } 4120 4121 int 4122 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4123 void *buf, uint64_t offset, uint64_t nbytes, 4124 spdk_bdev_io_completion_cb cb, void *cb_arg) 4125 { 4126 uint64_t offset_blocks, num_blocks; 4127 4128 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4129 nbytes, &num_blocks) != 0) { 4130 return -EINVAL; 4131 } 4132 4133 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4134 } 4135 4136 int 4137 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4138 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4139 spdk_bdev_io_completion_cb cb, void *cb_arg) 4140 { 4141 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4142 cb, cb_arg); 4143 } 4144 4145 int 4146 
spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4147 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4148 spdk_bdev_io_completion_cb cb, void *cb_arg) 4149 { 4150 struct iovec iov = { 4151 .iov_base = buf, 4152 }; 4153 4154 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4155 return -EINVAL; 4156 } 4157 4158 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4159 return -EINVAL; 4160 } 4161 4162 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4163 cb, cb_arg); 4164 } 4165 4166 static int 4167 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4168 struct iovec *iov, int iovcnt, void *md_buf, 4169 uint64_t offset_blocks, uint64_t num_blocks, 4170 spdk_bdev_io_completion_cb cb, void *cb_arg, 4171 struct spdk_bdev_ext_io_opts *opts) 4172 { 4173 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4174 struct spdk_bdev_io *bdev_io; 4175 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4176 4177 if (!desc->write) { 4178 return -EBADF; 4179 } 4180 4181 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4182 return -EINVAL; 4183 } 4184 4185 bdev_io = bdev_channel_get_io(channel); 4186 if (!bdev_io) { 4187 return -ENOMEM; 4188 } 4189 4190 bdev_io->internal.ch = channel; 4191 bdev_io->internal.desc = desc; 4192 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4193 bdev_io->u.bdev.iovs = iov; 4194 bdev_io->u.bdev.iovcnt = iovcnt; 4195 bdev_io->u.bdev.md_buf = md_buf; 4196 bdev_io->u.bdev.num_blocks = num_blocks; 4197 bdev_io->u.bdev.offset_blocks = offset_blocks; 4198 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4199 bdev_io->internal.ext_opts = opts; 4200 4201 bdev_io_submit(bdev_io); 4202 return 0; 4203 } 4204 4205 int 4206 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4207 struct iovec *iov, int iovcnt, 4208 uint64_t offset, uint64_t len, 4209 spdk_bdev_io_completion_cb cb, void *cb_arg) 4210 { 4211 uint64_t offset_blocks, num_blocks; 4212 4213 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4214 len, &num_blocks) != 0) { 4215 return -EINVAL; 4216 } 4217 4218 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4219 } 4220 4221 int 4222 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4223 struct iovec *iov, int iovcnt, 4224 uint64_t offset_blocks, uint64_t num_blocks, 4225 spdk_bdev_io_completion_cb cb, void *cb_arg) 4226 { 4227 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4228 num_blocks, cb, cb_arg, NULL); 4229 } 4230 4231 int 4232 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4233 struct iovec *iov, int iovcnt, void *md_buf, 4234 uint64_t offset_blocks, uint64_t num_blocks, 4235 spdk_bdev_io_completion_cb cb, void *cb_arg) 4236 { 4237 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4238 return -EINVAL; 4239 } 4240 4241 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4242 return -EINVAL; 4243 } 4244 4245 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4246 num_blocks, cb, cb_arg, NULL); 4247 } 4248 4249 int 4250 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4251 struct iovec *iov, int iovcnt, 4252 uint64_t offset_blocks, uint64_t num_blocks, 4253 spdk_bdev_io_completion_cb cb, void *cb_arg, 4254 struct spdk_bdev_ext_io_opts *opts) 4255 { 4256 void *md = 
NULL; 4257 4258 if (opts) { 4259 md = opts->metadata; 4260 } 4261 4262 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4263 return -EINVAL; 4264 } 4265 4266 if (md && !_bdev_io_check_md_buf(iov, md)) { 4267 return -EINVAL; 4268 } 4269 4270 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4271 num_blocks, cb, cb_arg, opts); 4272 } 4273 4274 static void 4275 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4276 { 4277 struct spdk_bdev_io *parent_io = cb_arg; 4278 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 4279 int i, rc = 0; 4280 4281 if (!success) { 4282 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4283 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4284 spdk_bdev_free_io(bdev_io); 4285 return; 4286 } 4287 4288 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 4289 rc = memcmp(read_buf, 4290 parent_io->u.bdev.iovs[i].iov_base, 4291 parent_io->u.bdev.iovs[i].iov_len); 4292 if (rc) { 4293 break; 4294 } 4295 read_buf += parent_io->u.bdev.iovs[i].iov_len; 4296 } 4297 4298 spdk_bdev_free_io(bdev_io); 4299 4300 if (rc == 0) { 4301 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4302 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 4303 } else { 4304 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 4305 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4306 } 4307 } 4308 4309 static void 4310 bdev_compare_do_read(void *_bdev_io) 4311 { 4312 struct spdk_bdev_io *bdev_io = _bdev_io; 4313 int rc; 4314 4315 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 4316 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 4317 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4318 bdev_compare_do_read_done, bdev_io); 4319 4320 if (rc == -ENOMEM) { 4321 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 4322 } else if (rc != 0) { 4323 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4324 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4325 } 4326 } 4327 4328 static int 4329 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4330 struct iovec *iov, int iovcnt, void *md_buf, 4331 uint64_t offset_blocks, uint64_t num_blocks, 4332 spdk_bdev_io_completion_cb cb, void *cb_arg) 4333 { 4334 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4335 struct spdk_bdev_io *bdev_io; 4336 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4337 4338 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4339 return -EINVAL; 4340 } 4341 4342 bdev_io = bdev_channel_get_io(channel); 4343 if (!bdev_io) { 4344 return -ENOMEM; 4345 } 4346 4347 bdev_io->internal.ch = channel; 4348 bdev_io->internal.desc = desc; 4349 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4350 bdev_io->u.bdev.iovs = iov; 4351 bdev_io->u.bdev.iovcnt = iovcnt; 4352 bdev_io->u.bdev.md_buf = md_buf; 4353 bdev_io->u.bdev.num_blocks = num_blocks; 4354 bdev_io->u.bdev.offset_blocks = offset_blocks; 4355 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4356 4357 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4358 bdev_io_submit(bdev_io); 4359 return 0; 4360 } 4361 4362 bdev_compare_do_read(bdev_io); 4363 4364 return 0; 4365 } 4366 4367 int 4368 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4369 struct iovec *iov, int iovcnt, 4370 uint64_t offset_blocks, uint64_t num_blocks, 4371 spdk_bdev_io_completion_cb cb, void *cb_arg) 
4372 { 4373 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4374 num_blocks, cb, cb_arg); 4375 } 4376 4377 int 4378 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4379 struct iovec *iov, int iovcnt, void *md_buf, 4380 uint64_t offset_blocks, uint64_t num_blocks, 4381 spdk_bdev_io_completion_cb cb, void *cb_arg) 4382 { 4383 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4384 return -EINVAL; 4385 } 4386 4387 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4388 return -EINVAL; 4389 } 4390 4391 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4392 num_blocks, cb, cb_arg); 4393 } 4394 4395 static int 4396 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4397 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4398 spdk_bdev_io_completion_cb cb, void *cb_arg) 4399 { 4400 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4401 struct spdk_bdev_io *bdev_io; 4402 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4403 4404 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4405 return -EINVAL; 4406 } 4407 4408 bdev_io = bdev_channel_get_io(channel); 4409 if (!bdev_io) { 4410 return -ENOMEM; 4411 } 4412 4413 bdev_io->internal.ch = channel; 4414 bdev_io->internal.desc = desc; 4415 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4416 bdev_io->u.bdev.iovs = &bdev_io->iov; 4417 bdev_io->u.bdev.iovs[0].iov_base = buf; 4418 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4419 bdev_io->u.bdev.iovcnt = 1; 4420 bdev_io->u.bdev.md_buf = md_buf; 4421 bdev_io->u.bdev.num_blocks = num_blocks; 4422 bdev_io->u.bdev.offset_blocks = offset_blocks; 4423 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4424 4425 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4426 bdev_io_submit(bdev_io); 4427 return 0; 4428 } 4429 4430 bdev_compare_do_read(bdev_io); 4431 4432 return 0; 4433 } 4434 4435 int 4436 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4437 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4438 spdk_bdev_io_completion_cb cb, void *cb_arg) 4439 { 4440 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4441 cb, cb_arg); 4442 } 4443 4444 int 4445 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4446 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4447 spdk_bdev_io_completion_cb cb, void *cb_arg) 4448 { 4449 struct iovec iov = { 4450 .iov_base = buf, 4451 }; 4452 4453 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4454 return -EINVAL; 4455 } 4456 4457 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4458 return -EINVAL; 4459 } 4460 4461 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4462 cb, cb_arg); 4463 } 4464 4465 static void 4466 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 4467 { 4468 struct spdk_bdev_io *bdev_io = ctx; 4469 4470 if (unlock_status) { 4471 SPDK_ERRLOG("LBA range unlock failed\n"); 4472 } 4473 4474 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
true : 4475 false, bdev_io->internal.caller_ctx); 4476 } 4477 4478 static void 4479 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 4480 { 4481 bdev_io->internal.status = status; 4482 4483 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 4484 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4485 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 4486 } 4487 4488 static void 4489 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4490 { 4491 struct spdk_bdev_io *parent_io = cb_arg; 4492 4493 if (!success) { 4494 SPDK_ERRLOG("Compare and write operation failed\n"); 4495 } 4496 4497 spdk_bdev_free_io(bdev_io); 4498 4499 bdev_comparev_and_writev_blocks_unlock(parent_io, 4500 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 4501 } 4502 4503 static void 4504 bdev_compare_and_write_do_write(void *_bdev_io) 4505 { 4506 struct spdk_bdev_io *bdev_io = _bdev_io; 4507 int rc; 4508 4509 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 4510 spdk_io_channel_from_ctx(bdev_io->internal.ch), 4511 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 4512 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4513 bdev_compare_and_write_do_write_done, bdev_io); 4514 4515 4516 if (rc == -ENOMEM) { 4517 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 4518 } else if (rc != 0) { 4519 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 4520 } 4521 } 4522 4523 static void 4524 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4525 { 4526 struct spdk_bdev_io *parent_io = cb_arg; 4527 4528 spdk_bdev_free_io(bdev_io); 4529 4530 if (!success) { 4531 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 4532 return; 4533 } 4534 4535 bdev_compare_and_write_do_write(parent_io); 4536 } 4537 4538 static void 4539 bdev_compare_and_write_do_compare(void *_bdev_io) 4540 { 4541 struct spdk_bdev_io *bdev_io = _bdev_io; 4542 int rc; 4543 4544 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 4545 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 4546 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4547 bdev_compare_and_write_do_compare_done, bdev_io); 4548 4549 if (rc == -ENOMEM) { 4550 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 4551 } else if (rc != 0) { 4552 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 4553 } 4554 } 4555 4556 static void 4557 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 4558 { 4559 struct spdk_bdev_io *bdev_io = ctx; 4560 4561 if (status) { 4562 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 4563 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4564 return; 4565 } 4566 4567 bdev_compare_and_write_do_compare(bdev_io); 4568 } 4569 4570 int 4571 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4572 struct iovec *compare_iov, int compare_iovcnt, 4573 struct iovec *write_iov, int write_iovcnt, 4574 uint64_t offset_blocks, uint64_t num_blocks, 4575 spdk_bdev_io_completion_cb cb, void *cb_arg) 4576 { 4577 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4578 struct spdk_bdev_io *bdev_io; 4579 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4580 4581 if (!desc->write) { 4582 return 
-EBADF; 4583 } 4584 4585 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4586 return -EINVAL; 4587 } 4588 4589 if (num_blocks > bdev->acwu) { 4590 return -EINVAL; 4591 } 4592 4593 bdev_io = bdev_channel_get_io(channel); 4594 if (!bdev_io) { 4595 return -ENOMEM; 4596 } 4597 4598 bdev_io->internal.ch = channel; 4599 bdev_io->internal.desc = desc; 4600 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 4601 bdev_io->u.bdev.iovs = compare_iov; 4602 bdev_io->u.bdev.iovcnt = compare_iovcnt; 4603 bdev_io->u.bdev.fused_iovs = write_iov; 4604 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 4605 bdev_io->u.bdev.md_buf = NULL; 4606 bdev_io->u.bdev.num_blocks = num_blocks; 4607 bdev_io->u.bdev.offset_blocks = offset_blocks; 4608 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4609 4610 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 4611 bdev_io_submit(bdev_io); 4612 return 0; 4613 } 4614 4615 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 4616 bdev_comparev_and_writev_blocks_locked, bdev_io); 4617 } 4618 4619 int 4620 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4621 struct iovec *iov, int iovcnt, 4622 uint64_t offset_blocks, uint64_t num_blocks, 4623 bool populate, 4624 spdk_bdev_io_completion_cb cb, void *cb_arg) 4625 { 4626 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4627 struct spdk_bdev_io *bdev_io; 4628 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4629 4630 if (!desc->write) { 4631 return -EBADF; 4632 } 4633 4634 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4635 return -EINVAL; 4636 } 4637 4638 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 4639 return -ENOTSUP; 4640 } 4641 4642 bdev_io = bdev_channel_get_io(channel); 4643 if (!bdev_io) { 4644 return -ENOMEM; 4645 } 4646 4647 bdev_io->internal.ch = channel; 4648 bdev_io->internal.desc = desc; 4649 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 4650 bdev_io->u.bdev.num_blocks = num_blocks; 4651 bdev_io->u.bdev.offset_blocks = offset_blocks; 4652 bdev_io->u.bdev.iovs = iov; 4653 bdev_io->u.bdev.iovcnt = iovcnt; 4654 bdev_io->u.bdev.md_buf = NULL; 4655 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 4656 bdev_io->u.bdev.zcopy.commit = 0; 4657 bdev_io->u.bdev.zcopy.start = 1; 4658 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4659 4660 bdev_io_submit(bdev_io); 4661 4662 return 0; 4663 } 4664 4665 int 4666 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 4667 spdk_bdev_io_completion_cb cb, void *cb_arg) 4668 { 4669 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 4670 return -EINVAL; 4671 } 4672 4673 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 4674 bdev_io->u.bdev.zcopy.start = 0; 4675 bdev_io->internal.caller_ctx = cb_arg; 4676 bdev_io->internal.cb = cb; 4677 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 4678 4679 bdev_io_submit(bdev_io); 4680 4681 return 0; 4682 } 4683 4684 int 4685 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4686 uint64_t offset, uint64_t len, 4687 spdk_bdev_io_completion_cb cb, void *cb_arg) 4688 { 4689 uint64_t offset_blocks, num_blocks; 4690 4691 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4692 len, &num_blocks) != 0) { 4693 return -EINVAL; 4694 } 4695 4696 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4697 } 4698 4699 int 4700 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4701 uint64_t offset_blocks, uint64_t num_blocks, 4702 spdk_bdev_io_completion_cb cb, void *cb_arg) 4703 { 4704 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4705 struct spdk_bdev_io *bdev_io; 4706 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4707 4708 if (!desc->write) { 4709 return -EBADF; 4710 } 4711 4712 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4713 return -EINVAL; 4714 } 4715 4716 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 4717 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 4718 return -ENOTSUP; 4719 } 4720 4721 bdev_io = bdev_channel_get_io(channel); 4722 4723 if (!bdev_io) { 4724 return -ENOMEM; 4725 } 4726 4727 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 4728 bdev_io->internal.ch = channel; 4729 bdev_io->internal.desc = desc; 4730 bdev_io->u.bdev.offset_blocks = offset_blocks; 4731 bdev_io->u.bdev.num_blocks = num_blocks; 4732 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4733 4734 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 4735 bdev_io_submit(bdev_io); 4736 return 0; 4737 } 4738 4739 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 4740 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 4741 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 4742 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 4743 bdev_write_zero_buffer_next(bdev_io); 4744 4745 return 0; 4746 } 4747 4748 int 4749 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4750 uint64_t offset, uint64_t nbytes, 4751 spdk_bdev_io_completion_cb cb, void *cb_arg) 4752 { 4753 uint64_t offset_blocks, num_blocks; 4754 4755 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4756 nbytes, &num_blocks) != 0) { 4757 return -EINVAL; 4758 } 4759 4760 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4761 } 4762 4763 int 4764 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4765 uint64_t offset_blocks, uint64_t num_blocks, 4766 spdk_bdev_io_completion_cb cb, void *cb_arg) 4767 { 4768 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4769 struct spdk_bdev_io *bdev_io; 4770 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4771 4772 if (!desc->write) { 4773 return -EBADF; 4774 } 4775 4776 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4777 return -EINVAL; 4778 } 4779 4780 if (num_blocks == 0) { 4781 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 4782 return -EINVAL; 4783 } 4784 4785 bdev_io = bdev_channel_get_io(channel); 4786 if (!bdev_io) { 4787 return -ENOMEM; 4788 } 4789 4790 bdev_io->internal.ch = channel; 4791 
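/* An unmap is described purely by its block range; the single-element iovec attached below is intentionally left empty (NULL base, zero length). */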
bdev_io->internal.desc = desc; 4792 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 4793 4794 bdev_io->u.bdev.iovs = &bdev_io->iov; 4795 bdev_io->u.bdev.iovs[0].iov_base = NULL; 4796 bdev_io->u.bdev.iovs[0].iov_len = 0; 4797 bdev_io->u.bdev.iovcnt = 1; 4798 4799 bdev_io->u.bdev.offset_blocks = offset_blocks; 4800 bdev_io->u.bdev.num_blocks = num_blocks; 4801 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4802 4803 bdev_io_submit(bdev_io); 4804 return 0; 4805 } 4806 4807 int 4808 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4809 uint64_t offset, uint64_t length, 4810 spdk_bdev_io_completion_cb cb, void *cb_arg) 4811 { 4812 uint64_t offset_blocks, num_blocks; 4813 4814 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4815 length, &num_blocks) != 0) { 4816 return -EINVAL; 4817 } 4818 4819 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4820 } 4821 4822 int 4823 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4824 uint64_t offset_blocks, uint64_t num_blocks, 4825 spdk_bdev_io_completion_cb cb, void *cb_arg) 4826 { 4827 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4828 struct spdk_bdev_io *bdev_io; 4829 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4830 4831 if (!desc->write) { 4832 return -EBADF; 4833 } 4834 4835 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4836 return -EINVAL; 4837 } 4838 4839 bdev_io = bdev_channel_get_io(channel); 4840 if (!bdev_io) { 4841 return -ENOMEM; 4842 } 4843 4844 bdev_io->internal.ch = channel; 4845 bdev_io->internal.desc = desc; 4846 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 4847 bdev_io->u.bdev.iovs = NULL; 4848 bdev_io->u.bdev.iovcnt = 0; 4849 bdev_io->u.bdev.offset_blocks = offset_blocks; 4850 bdev_io->u.bdev.num_blocks = num_blocks; 4851 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4852 4853 bdev_io_submit(bdev_io); 4854 return 0; 4855 } 4856 4857 static void 4858 bdev_reset_dev(struct spdk_io_channel_iter *i, int status) 4859 { 4860 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 4861 struct spdk_bdev_io *bdev_io; 4862 4863 bdev_io = TAILQ_FIRST(&ch->queued_resets); 4864 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 4865 bdev_io_submit_reset(bdev_io); 4866 } 4867 4868 static void 4869 bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 4870 { 4871 struct spdk_io_channel *ch; 4872 struct spdk_bdev_channel *channel; 4873 struct spdk_bdev_mgmt_channel *mgmt_channel; 4874 struct spdk_bdev_shared_resource *shared_resource; 4875 bdev_io_tailq_t tmp_queued; 4876 4877 TAILQ_INIT(&tmp_queued); 4878 4879 ch = spdk_io_channel_iter_get_channel(i); 4880 channel = spdk_io_channel_get_ctx(ch); 4881 shared_resource = channel->shared_resource; 4882 mgmt_channel = shared_resource->mgmt_ch; 4883 4884 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 4885 4886 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 4887 /* The QoS object is always valid and readable while 4888 * the channel flag is set, so the lock here should not 4889 * be necessary. We're not in the fast path though, so 4890 * just take it anyway. 
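* Resets are well off the hot path, so the extra lock/unlock cost is negligible.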
*/ 4891 pthread_mutex_lock(&channel->bdev->internal.mutex); 4892 if (channel->bdev->internal.qos->ch == channel) { 4893 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 4894 } 4895 pthread_mutex_unlock(&channel->bdev->internal.mutex); 4896 } 4897 4898 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 4899 bdev_abort_all_buf_io(&mgmt_channel->need_buf_small, channel); 4900 bdev_abort_all_buf_io(&mgmt_channel->need_buf_large, channel); 4901 bdev_abort_all_queued_io(&tmp_queued, channel); 4902 4903 spdk_for_each_channel_continue(i, 0); 4904 } 4905 4906 static void 4907 bdev_start_reset(void *ctx) 4908 { 4909 struct spdk_bdev_channel *ch = ctx; 4910 4911 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel, 4912 ch, bdev_reset_dev); 4913 } 4914 4915 static void 4916 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 4917 { 4918 struct spdk_bdev *bdev = ch->bdev; 4919 4920 assert(!TAILQ_EMPTY(&ch->queued_resets)); 4921 4922 pthread_mutex_lock(&bdev->internal.mutex); 4923 if (bdev->internal.reset_in_progress == NULL) { 4924 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 4925 /* 4926 * Take a channel reference for the target bdev for the life of this 4927 * reset. This guards against the channel getting destroyed while 4928 * spdk_for_each_channel() calls related to this reset IO are in 4929 * progress. We will release the reference when this reset is 4930 * completed. 4931 */ 4932 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4933 bdev_start_reset(ch); 4934 } 4935 pthread_mutex_unlock(&bdev->internal.mutex); 4936 } 4937 4938 int 4939 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4940 spdk_bdev_io_completion_cb cb, void *cb_arg) 4941 { 4942 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4943 struct spdk_bdev_io *bdev_io; 4944 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4945 4946 bdev_io = bdev_channel_get_io(channel); 4947 if (!bdev_io) { 4948 return -ENOMEM; 4949 } 4950 4951 bdev_io->internal.ch = channel; 4952 bdev_io->internal.desc = desc; 4953 bdev_io->internal.submit_tsc = spdk_get_ticks(); 4954 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 4955 bdev_io->u.reset.ch_ref = NULL; 4956 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4957 4958 pthread_mutex_lock(&bdev->internal.mutex); 4959 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 4960 pthread_mutex_unlock(&bdev->internal.mutex); 4961 4962 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 4963 internal.ch_link); 4964 4965 bdev_channel_start_reset(channel); 4966 4967 return 0; 4968 } 4969 4970 void 4971 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 4972 struct spdk_bdev_io_stat *stat) 4973 { 4974 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4975 4976 *stat = channel->stat; 4977 } 4978 4979 static void 4980 bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 4981 { 4982 void *io_device = spdk_io_channel_iter_get_io_device(i); 4983 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 4984 4985 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 4986 bdev_iostat_ctx->cb_arg, 0); 4987 free(bdev_iostat_ctx); 4988 } 4989 4990 static void 4991 bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 4992 { 4993 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 4994 struct 
spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 4995 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4996 4997 bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 4998 spdk_for_each_channel_continue(i, 0); 4999 } 5000 5001 void 5002 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5003 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5004 { 5005 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5006 5007 assert(bdev != NULL); 5008 assert(stat != NULL); 5009 assert(cb != NULL); 5010 5011 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5012 if (bdev_iostat_ctx == NULL) { 5013 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5014 cb(bdev, stat, cb_arg, -ENOMEM); 5015 return; 5016 } 5017 5018 bdev_iostat_ctx->stat = stat; 5019 bdev_iostat_ctx->cb = cb; 5020 bdev_iostat_ctx->cb_arg = cb_arg; 5021 5022 /* Start with the statistics from previously deleted channels. */ 5023 pthread_mutex_lock(&bdev->internal.mutex); 5024 bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 5025 pthread_mutex_unlock(&bdev->internal.mutex); 5026 5027 /* Then iterate and add the statistics from each existing channel. */ 5028 spdk_for_each_channel(__bdev_to_io_dev(bdev), 5029 bdev_get_each_channel_stat, 5030 bdev_iostat_ctx, 5031 bdev_get_device_stat_done); 5032 } 5033 5034 int 5035 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5036 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5037 spdk_bdev_io_completion_cb cb, void *cb_arg) 5038 { 5039 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5040 struct spdk_bdev_io *bdev_io; 5041 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5042 5043 if (!desc->write) { 5044 return -EBADF; 5045 } 5046 5047 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 5048 return -ENOTSUP; 5049 } 5050 5051 bdev_io = bdev_channel_get_io(channel); 5052 if (!bdev_io) { 5053 return -ENOMEM; 5054 } 5055 5056 bdev_io->internal.ch = channel; 5057 bdev_io->internal.desc = desc; 5058 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 5059 bdev_io->u.nvme_passthru.cmd = *cmd; 5060 bdev_io->u.nvme_passthru.buf = buf; 5061 bdev_io->u.nvme_passthru.nbytes = nbytes; 5062 bdev_io->u.nvme_passthru.md_buf = NULL; 5063 bdev_io->u.nvme_passthru.md_len = 0; 5064 5065 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5066 5067 bdev_io_submit(bdev_io); 5068 return 0; 5069 } 5070 5071 int 5072 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5073 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5074 spdk_bdev_io_completion_cb cb, void *cb_arg) 5075 { 5076 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5077 struct spdk_bdev_io *bdev_io; 5078 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5079 5080 if (!desc->write) { 5081 /* 5082 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5083 * to easily determine if the command is a read or write, but for now just 5084 * do not allow io_passthru with a read-only descriptor. 
5085 */ 5086 return -EBADF; 5087 } 5088 5089 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 5090 return -ENOTSUP; 5091 } 5092 5093 bdev_io = bdev_channel_get_io(channel); 5094 if (!bdev_io) { 5095 return -ENOMEM; 5096 } 5097 5098 bdev_io->internal.ch = channel; 5099 bdev_io->internal.desc = desc; 5100 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 5101 bdev_io->u.nvme_passthru.cmd = *cmd; 5102 bdev_io->u.nvme_passthru.buf = buf; 5103 bdev_io->u.nvme_passthru.nbytes = nbytes; 5104 bdev_io->u.nvme_passthru.md_buf = NULL; 5105 bdev_io->u.nvme_passthru.md_len = 0; 5106 5107 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5108 5109 bdev_io_submit(bdev_io); 5110 return 0; 5111 } 5112 5113 int 5114 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5115 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 5116 spdk_bdev_io_completion_cb cb, void *cb_arg) 5117 { 5118 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5119 struct spdk_bdev_io *bdev_io; 5120 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5121 5122 if (!desc->write) { 5123 /* 5124 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5125 * to easily determine if the command is a read or write, but for now just 5126 * do not allow io_passthru with a read-only descriptor. 5127 */ 5128 return -EBADF; 5129 } 5130 5131 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 5132 return -ENOTSUP; 5133 } 5134 5135 bdev_io = bdev_channel_get_io(channel); 5136 if (!bdev_io) { 5137 return -ENOMEM; 5138 } 5139 5140 bdev_io->internal.ch = channel; 5141 bdev_io->internal.desc = desc; 5142 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 5143 bdev_io->u.nvme_passthru.cmd = *cmd; 5144 bdev_io->u.nvme_passthru.buf = buf; 5145 bdev_io->u.nvme_passthru.nbytes = nbytes; 5146 bdev_io->u.nvme_passthru.md_buf = md_buf; 5147 bdev_io->u.nvme_passthru.md_len = md_len; 5148 5149 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5150 5151 bdev_io_submit(bdev_io); 5152 return 0; 5153 } 5154 5155 static void bdev_abort_retry(void *ctx); 5156 static void bdev_abort(struct spdk_bdev_io *parent_io); 5157 5158 static void 5159 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5160 { 5161 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 5162 struct spdk_bdev_io *parent_io = cb_arg; 5163 struct spdk_bdev_io *bio_to_abort, *tmp_io; 5164 5165 bio_to_abort = bdev_io->u.abort.bio_to_abort; 5166 5167 spdk_bdev_free_io(bdev_io); 5168 5169 if (!success) { 5170 /* Check if the target I/O completed in the meantime. */ 5171 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 5172 if (tmp_io == bio_to_abort) { 5173 break; 5174 } 5175 } 5176 5177 /* If the target I/O still exists, set the parent to failed. 
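* If it no longer exists, the target I/O completed while the abort was in flight and the parent status is left untouched.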
*/ 5178 if (tmp_io != NULL) { 5179 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5180 } 5181 } 5182 5183 parent_io->u.bdev.split_outstanding--; 5184 if (parent_io->u.bdev.split_outstanding == 0) { 5185 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5186 bdev_abort_retry(parent_io); 5187 } else { 5188 bdev_io_complete(parent_io); 5189 } 5190 } 5191 } 5192 5193 static int 5194 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 5195 struct spdk_bdev_io *bio_to_abort, 5196 spdk_bdev_io_completion_cb cb, void *cb_arg) 5197 { 5198 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5199 struct spdk_bdev_io *bdev_io; 5200 5201 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 5202 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 5203 /* TODO: Abort reset or abort request. */ 5204 return -ENOTSUP; 5205 } 5206 5207 bdev_io = bdev_channel_get_io(channel); 5208 if (bdev_io == NULL) { 5209 return -ENOMEM; 5210 } 5211 5212 bdev_io->internal.ch = channel; 5213 bdev_io->internal.desc = desc; 5214 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 5215 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5216 5217 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 5218 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 5219 5220 /* Parent abort request is not submitted directly, but to manage its 5221 * execution, add it to the submitted list here. 5222 */ 5223 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5224 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5225 5226 bdev_abort(bdev_io); 5227 5228 return 0; 5229 } 5230 5231 bdev_io->u.abort.bio_to_abort = bio_to_abort; 5232 5233 /* Submit the abort request to the underlying bdev module. */ 5234 bdev_io_submit(bdev_io); 5235 5236 return 0; 5237 } 5238 5239 static uint32_t 5240 _bdev_abort(struct spdk_bdev_io *parent_io) 5241 { 5242 struct spdk_bdev_desc *desc = parent_io->internal.desc; 5243 struct spdk_bdev_channel *channel = parent_io->internal.ch; 5244 void *bio_cb_arg; 5245 struct spdk_bdev_io *bio_to_abort; 5246 uint32_t matched_ios; 5247 int rc; 5248 5249 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 5250 5251 /* matched_ios is returned and will be kept by the caller. 5252 * 5253 * This function is used in two cases: 1) the same cb_arg is used for 5254 * multiple I/Os, and 2) a single large I/O is split into smaller ones. 5255 * Incrementing split_outstanding directly here may confuse readers, especially 5256 * for the 1st case. 5257 * 5258 * Completion of I/O abort is processed after stack unwinding. Hence this trick 5259 * works as expected. 5260 */ 5261 matched_ios = 0; 5262 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5263 5264 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 5265 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 5266 continue; 5267 } 5268 5269 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 5270 /* Any I/O which was submitted after this abort command should be excluded.
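* The submit_tsc comparison above provides exactly that ordering.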
*/ 5271 continue; 5272 } 5273 5274 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 5275 if (rc != 0) { 5276 if (rc == -ENOMEM) { 5277 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 5278 } else { 5279 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5280 } 5281 break; 5282 } 5283 matched_ios++; 5284 } 5285 5286 return matched_ios; 5287 } 5288 5289 static void 5290 bdev_abort_retry(void *ctx) 5291 { 5292 struct spdk_bdev_io *parent_io = ctx; 5293 uint32_t matched_ios; 5294 5295 matched_ios = _bdev_abort(parent_io); 5296 5297 if (matched_ios == 0) { 5298 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5299 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 5300 } else { 5301 /* For retry, the case that no target I/O was found is a success 5302 * because it means the target I/Os completed in the meantime. 5303 */ 5304 bdev_io_complete(parent_io); 5305 } 5306 return; 5307 } 5308 5309 /* Use split_outstanding to manage the progress of aborting I/Os. */ 5310 parent_io->u.bdev.split_outstanding = matched_ios; 5311 } 5312 5313 static void 5314 bdev_abort(struct spdk_bdev_io *parent_io) 5315 { 5316 uint32_t matched_ios; 5317 5318 matched_ios = _bdev_abort(parent_io); 5319 5320 if (matched_ios == 0) { 5321 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5322 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 5323 } else { 5324 /* For the initial attempt, the case where no target I/O was found is a failure. */ 5325 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5326 bdev_io_complete(parent_io); 5327 } 5328 return; 5329 } 5330 5331 /* Use split_outstanding to manage the progress of aborting I/Os. */ 5332 parent_io->u.bdev.split_outstanding = matched_ios; 5333 } 5334 5335 int 5336 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5337 void *bio_cb_arg, 5338 spdk_bdev_io_completion_cb cb, void *cb_arg) 5339 { 5340 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5341 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5342 struct spdk_bdev_io *bdev_io; 5343 5344 if (bio_cb_arg == NULL) { 5345 return -EINVAL; 5346 } 5347 5348 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 5349 return -ENOTSUP; 5350 } 5351 5352 bdev_io = bdev_channel_get_io(channel); 5353 if (bdev_io == NULL) { 5354 return -ENOMEM; 5355 } 5356 5357 bdev_io->internal.ch = channel; 5358 bdev_io->internal.desc = desc; 5359 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5360 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 5361 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5362 5363 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 5364 5365 /* Parent abort request is not submitted directly, but to manage its execution, 5366 * add it to the submitted list here.
5367 */ 5368 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5369 5370 bdev_abort(bdev_io); 5371 5372 return 0; 5373 } 5374 5375 int 5376 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5377 struct spdk_bdev_io_wait_entry *entry) 5378 { 5379 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5380 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 5381 5382 if (bdev != entry->bdev) { 5383 SPDK_ERRLOG("bdevs do not match\n"); 5384 return -EINVAL; 5385 } 5386 5387 if (mgmt_ch->per_thread_cache_count > 0) { 5388 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 5389 return -EINVAL; 5390 } 5391 5392 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 5393 return 0; 5394 } 5395 5396 static inline void 5397 bdev_io_complete(void *ctx) 5398 { 5399 struct spdk_bdev_io *bdev_io = ctx; 5400 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5401 uint64_t tsc, tsc_diff; 5402 5403 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 5404 /* 5405 * Send the completion to the thread that originally submitted the I/O, 5406 * which may not be the current thread in the case of QoS. 5407 */ 5408 if (bdev_io->internal.io_submit_ch) { 5409 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 5410 bdev_io->internal.io_submit_ch = NULL; 5411 } 5412 5413 /* 5414 * Defer completion to avoid potential infinite recursion if the 5415 * user's completion callback issues a new I/O. 5416 */ 5417 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 5418 bdev_io_complete, bdev_io); 5419 return; 5420 } 5421 5422 tsc = spdk_get_ticks(); 5423 tsc_diff = tsc - bdev_io->internal.submit_tsc; 5424 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 5425 bdev_io->internal.caller_ctx); 5426 5427 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 5428 5429 if (bdev_io->internal.ch->histogram) { 5430 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 5431 } 5432 5433 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5434 switch (bdev_io->type) { 5435 case SPDK_BDEV_IO_TYPE_READ: 5436 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5437 bdev_io->internal.ch->stat.num_read_ops++; 5438 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5439 break; 5440 case SPDK_BDEV_IO_TYPE_WRITE: 5441 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5442 bdev_io->internal.ch->stat.num_write_ops++; 5443 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5444 break; 5445 case SPDK_BDEV_IO_TYPE_UNMAP: 5446 bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5447 bdev_io->internal.ch->stat.num_unmap_ops++; 5448 bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; 5449 break; 5450 case SPDK_BDEV_IO_TYPE_ZCOPY: 5451 /* Track the data in the start phase only */ 5452 if (bdev_io->u.bdev.zcopy.start) { 5453 if (bdev_io->u.bdev.zcopy.populate) { 5454 bdev_io->internal.ch->stat.bytes_read += 5455 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5456 bdev_io->internal.ch->stat.num_read_ops++; 5457 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5458 } else { 5459 bdev_io->internal.ch->stat.bytes_written += 5460 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5461 bdev_io->internal.ch->stat.num_write_ops++; 5462 
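/* A zcopy started without populate hands out a buffer that will be written, so it is accounted as a write. */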
bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5463 } 5464 } 5465 break; 5466 default: 5467 break; 5468 } 5469 } 5470 5471 #ifdef SPDK_CONFIG_VTUNE 5472 uint64_t now_tsc = spdk_get_ticks(); 5473 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 5474 uint64_t data[5]; 5475 5476 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 5477 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 5478 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 5479 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 5480 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 5481 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 5482 5483 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 5484 __itt_metadata_u64, 5, data); 5485 5486 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 5487 bdev_io->internal.ch->start_tsc = now_tsc; 5488 } 5489 #endif 5490 5491 assert(bdev_io->internal.cb != NULL); 5492 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 5493 5494 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 5495 bdev_io->internal.caller_ctx); 5496 } 5497 5498 static void 5499 bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 5500 { 5501 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 5502 5503 if (bdev_io->u.reset.ch_ref != NULL) { 5504 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 5505 bdev_io->u.reset.ch_ref = NULL; 5506 } 5507 5508 bdev_io_complete(bdev_io); 5509 } 5510 5511 static void 5512 bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 5513 { 5514 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 5515 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5516 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 5517 struct spdk_bdev_io *queued_reset; 5518 5519 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 5520 while (!TAILQ_EMPTY(&ch->queued_resets)) { 5521 queued_reset = TAILQ_FIRST(&ch->queued_resets); 5522 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 5523 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 5524 } 5525 5526 spdk_for_each_channel_continue(i, 0); 5527 } 5528 5529 void 5530 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 5531 { 5532 struct spdk_bdev *bdev = bdev_io->bdev; 5533 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5534 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 5535 5536 bdev_io->internal.status = status; 5537 5538 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 5539 bool unlock_channels = false; 5540 5541 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 5542 SPDK_ERRLOG("NOMEM returned for reset\n"); 5543 } 5544 pthread_mutex_lock(&bdev->internal.mutex); 5545 if (bdev_io == bdev->internal.reset_in_progress) { 5546 bdev->internal.reset_in_progress = NULL; 5547 unlock_channels = true; 5548 } 5549 pthread_mutex_unlock(&bdev->internal.mutex); 5550 5551 if (unlock_channels) { 5552 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel, 5553 bdev_io, bdev_reset_complete); 5554 return; 5555 } 5556 } else { 5557 _bdev_io_unset_bounce_buf(bdev_io); 5558 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 5559 if 
(spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 5560 return; 5561 } 5562 } 5563 5564 bdev_io_complete(bdev_io); 5565 } 5566 5567 void 5568 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 5569 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 5570 { 5571 if (sc == SPDK_SCSI_STATUS_GOOD) { 5572 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5573 } else { 5574 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 5575 bdev_io->internal.error.scsi.sc = sc; 5576 bdev_io->internal.error.scsi.sk = sk; 5577 bdev_io->internal.error.scsi.asc = asc; 5578 bdev_io->internal.error.scsi.ascq = ascq; 5579 } 5580 5581 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5582 } 5583 5584 void 5585 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 5586 int *sc, int *sk, int *asc, int *ascq) 5587 { 5588 assert(sc != NULL); 5589 assert(sk != NULL); 5590 assert(asc != NULL); 5591 assert(ascq != NULL); 5592 5593 switch (bdev_io->internal.status) { 5594 case SPDK_BDEV_IO_STATUS_SUCCESS: 5595 *sc = SPDK_SCSI_STATUS_GOOD; 5596 *sk = SPDK_SCSI_SENSE_NO_SENSE; 5597 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5598 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5599 break; 5600 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 5601 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 5602 break; 5603 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 5604 *sc = bdev_io->internal.error.scsi.sc; 5605 *sk = bdev_io->internal.error.scsi.sk; 5606 *asc = bdev_io->internal.error.scsi.asc; 5607 *ascq = bdev_io->internal.error.scsi.ascq; 5608 break; 5609 default: 5610 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 5611 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 5612 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5613 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5614 break; 5615 } 5616 } 5617 5618 void 5619 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 5620 { 5621 if (aio_result == 0) { 5622 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5623 } else { 5624 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 5625 } 5626 5627 bdev_io->internal.error.aio_result = aio_result; 5628 5629 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5630 } 5631 5632 void 5633 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 5634 { 5635 assert(aio_result != NULL); 5636 5637 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 5638 *aio_result = bdev_io->internal.error.aio_result; 5639 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5640 *aio_result = 0; 5641 } else { 5642 *aio_result = -EIO; 5643 } 5644 } 5645 5646 void 5647 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 5648 { 5649 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 5650 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5651 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 5652 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 5653 } else { 5654 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 5655 } 5656 5657 bdev_io->internal.error.nvme.cdw0 = cdw0; 5658 bdev_io->internal.error.nvme.sct = sct; 5659 bdev_io->internal.error.nvme.sc = sc; 5660 5661 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5662 } 5663 5664 void 5665 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 5666 { 5667 assert(sct != NULL); 5668 assert(sc != NULL); 5669 assert(cdw0 != 
void
spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc)
{
	assert(sct != NULL);
	assert(sc != NULL);
	assert(cdw0 != NULL);

	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_SUCCESS;
		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
			*cdw0 = 0;
		} else {
			*cdw0 = 1U;
		}
		return;
	}

	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
		*sct = bdev_io->internal.error.nvme.sct;
		*sc = bdev_io->internal.error.nvme.sc;
	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_SUCCESS;
	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
	} else {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
	}

	*cdw0 = bdev_io->internal.error.nvme.cdw0;
}

void
spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0,
				   int *first_sct, int *first_sc, int *second_sct, int *second_sc)
{
	assert(first_sct != NULL);
	assert(first_sc != NULL);
	assert(second_sct != NULL);
	assert(second_sc != NULL);
	assert(cdw0 != NULL);

	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
		if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR &&
		    bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) {
			*first_sct = bdev_io->internal.error.nvme.sct;
			*first_sc = bdev_io->internal.error.nvme.sc;
			*second_sct = SPDK_NVME_SCT_GENERIC;
			*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
		} else {
			*first_sct = SPDK_NVME_SCT_GENERIC;
			*first_sc = SPDK_NVME_SC_SUCCESS;
			*second_sct = bdev_io->internal.error.nvme.sct;
			*second_sc = bdev_io->internal.error.nvme.sc;
		}
	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		*first_sct = SPDK_NVME_SCT_GENERIC;
		*first_sc = SPDK_NVME_SC_SUCCESS;
		*second_sct = SPDK_NVME_SCT_GENERIC;
		*second_sc = SPDK_NVME_SC_SUCCESS;
	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) {
		*first_sct = SPDK_NVME_SCT_GENERIC;
		*first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
		*second_sct = SPDK_NVME_SCT_GENERIC;
		*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) {
		*first_sct = SPDK_NVME_SCT_MEDIA_ERROR;
		*first_sc = SPDK_NVME_SC_COMPARE_FAILURE;
		*second_sct = SPDK_NVME_SCT_GENERIC;
		*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
	} else {
		*first_sct = SPDK_NVME_SCT_GENERIC;
		*first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
		*second_sct = SPDK_NVME_SCT_GENERIC;
		*second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
	}

	*cdw0 = bdev_io->internal.error.nvme.cdw0;
}

struct spdk_thread *
spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
{
	return spdk_io_channel_get_thread(bdev_io->internal.ch->channel);
}

struct spdk_io_channel *
spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.ch->channel;
}
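/*
 * Illustrative sketch (not part of the original file): an NVMe-style frontend
 * turning a completed bdev I/O into an NVMe completion entry.  The callback
 * name and the way the spdk_nvme_cpl pointer arrives in cb_arg are
 * hypothetical; the cpl field layout follows spdk/nvme_spec.h.
 *
 *	static void
 *	example_nvme_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		struct spdk_nvme_cpl *cpl = cb_arg;
 *		uint32_t cdw0;
 *		int sct, sc;
 *
 *		spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc);
 *		cpl->cdw0 = cdw0;
 *		cpl->status.sct = sct;
 *		cpl->status.sc = sc;
 *		spdk_bdev_free_io(bdev_io);
 *	}
 */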
static int
bdev_register(struct spdk_bdev *bdev)
{
	char *bdev_name;
	char uuid[SPDK_UUID_STRING_LEN];
	int ret;

	assert(bdev->module != NULL);

	if (!bdev->name) {
		SPDK_ERRLOG("Bdev name is NULL\n");
		return -EINVAL;
	}

	if (!strlen(bdev->name)) {
		SPDK_ERRLOG("Bdev name must not be an empty string\n");
		return -EINVAL;
	}

	/* Users often register their own I/O devices using the bdev name. In
	 * order to avoid conflicts, prepend bdev_. */
	bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name);
	if (!bdev_name) {
		SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n");
		return -ENOMEM;
	}

	bdev->internal.status = SPDK_BDEV_STATUS_READY;
	bdev->internal.measured_queue_depth = UINT64_MAX;
	bdev->internal.claim_module = NULL;
	bdev->internal.qd_poller = NULL;
	bdev->internal.qos = NULL;

	TAILQ_INIT(&bdev->internal.open_descs);
	TAILQ_INIT(&bdev->internal.locked_ranges);
	TAILQ_INIT(&bdev->internal.pending_locked_ranges);
	TAILQ_INIT(&bdev->aliases);

	ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name);
	if (ret != 0) {
		free(bdev_name);
		return ret;
	}

	/* If the user didn't specify a uuid, generate one. */
	if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) {
		spdk_uuid_generate(&bdev->uuid);
	}

	/* Add the UUID alias only if it's different from the name */
	spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
	if (strcmp(bdev->name, uuid) != 0) {
		ret = spdk_bdev_alias_add(bdev, uuid);
		if (ret != 0) {
			SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name);
			bdev_name_del(&bdev->internal.bdev_name);
			free(bdev_name);
			return ret;
		}
	}

	if (spdk_bdev_get_buf_align(bdev) > 1) {
		if (bdev->split_on_optimal_io_boundary) {
			bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary,
							     SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen);
		} else {
			bdev->split_on_optimal_io_boundary = true;
			bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen;
		}
	}

	/* If the user didn't specify a write unit size, set it to one.
*/ 5831 if (bdev->write_unit_size == 0) { 5832 bdev->write_unit_size = 1; 5833 } 5834 5835 /* Set ACWU value to 1 if bdev module did not set it (does not support it natively) */ 5836 if (bdev->acwu == 0) { 5837 bdev->acwu = 1; 5838 } 5839 5840 if (bdev->phys_blocklen == 0) { 5841 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 5842 } 5843 5844 bdev->internal.reset_in_progress = NULL; 5845 5846 spdk_io_device_register(__bdev_to_io_dev(bdev), 5847 bdev_channel_create, bdev_channel_destroy, 5848 sizeof(struct spdk_bdev_channel), 5849 bdev_name); 5850 5851 free(bdev_name); 5852 5853 pthread_mutex_init(&bdev->internal.mutex, NULL); 5854 5855 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 5856 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 5857 5858 return 0; 5859 } 5860 5861 static void 5862 bdev_destroy_cb(void *io_device) 5863 { 5864 int rc; 5865 struct spdk_bdev *bdev; 5866 spdk_bdev_unregister_cb cb_fn; 5867 void *cb_arg; 5868 5869 bdev = __bdev_from_io_dev(io_device); 5870 cb_fn = bdev->internal.unregister_cb; 5871 cb_arg = bdev->internal.unregister_ctx; 5872 5873 pthread_mutex_destroy(&bdev->internal.mutex); 5874 free(bdev->internal.qos); 5875 5876 rc = bdev->fn_table->destruct(bdev->ctxt); 5877 if (rc < 0) { 5878 SPDK_ERRLOG("destruct failed\n"); 5879 } 5880 if (rc <= 0 && cb_fn != NULL) { 5881 cb_fn(cb_arg, rc); 5882 } 5883 } 5884 5885 static void 5886 bdev_register_finished(void *arg) 5887 { 5888 struct spdk_bdev *bdev = arg; 5889 5890 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 5891 } 5892 5893 int 5894 spdk_bdev_register(struct spdk_bdev *bdev) 5895 { 5896 int rc = bdev_register(bdev); 5897 5898 if (rc == 0) { 5899 /* Examine configuration before initializing I/O */ 5900 bdev_examine(bdev); 5901 5902 spdk_bdev_wait_for_examine(bdev_register_finished, bdev); 5903 } 5904 5905 return rc; 5906 } 5907 5908 void 5909 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 5910 { 5911 if (bdev->internal.unregister_cb != NULL) { 5912 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 5913 } 5914 } 5915 5916 static void 5917 _remove_notify(void *arg) 5918 { 5919 struct spdk_bdev_desc *desc = arg; 5920 5921 pthread_mutex_lock(&desc->mutex); 5922 desc->refs--; 5923 5924 if (!desc->closed) { 5925 pthread_mutex_unlock(&desc->mutex); 5926 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 5927 return; 5928 } else if (0 == desc->refs) { 5929 /* This descriptor was closed after this remove_notify message was sent. 5930 * spdk_bdev_close() could not free the descriptor since this message was 5931 * in flight, so we free it now using bdev_desc_free(). 5932 */ 5933 pthread_mutex_unlock(&desc->mutex); 5934 bdev_desc_free(desc); 5935 return; 5936 } 5937 pthread_mutex_unlock(&desc->mutex); 5938 } 5939 5940 /* Must be called while holding g_bdev_mgr.mutex and bdev->internal.mutex. 5941 * returns: 0 - bdev removed and ready to be destructed. 5942 * -EBUSY - bdev can't be destructed yet. */ 5943 static int 5944 bdev_unregister_unsafe(struct spdk_bdev *bdev) 5945 { 5946 struct spdk_bdev_desc *desc, *tmp; 5947 int rc = 0; 5948 char uuid[SPDK_UUID_STRING_LEN]; 5949 5950 /* Notify each descriptor about hotremoval */ 5951 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 5952 rc = -EBUSY; 5953 pthread_mutex_lock(&desc->mutex); 5954 /* 5955 * Defer invocation of the event_cb to a separate message that will 5956 * run later on its thread. 
This ensures this context unwinds and 5957 * we don't recursively unregister this bdev again if the event_cb 5958 * immediately closes its descriptor. 5959 */ 5960 desc->refs++; 5961 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 5962 pthread_mutex_unlock(&desc->mutex); 5963 } 5964 5965 /* If there are no descriptors, proceed removing the bdev */ 5966 if (rc == 0) { 5967 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 5968 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 5969 5970 /* Delete the name and the UUID alias */ 5971 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 5972 bdev_name_del_unsafe(&bdev->internal.bdev_name); 5973 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 5974 5975 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 5976 } 5977 5978 return rc; 5979 } 5980 5981 void 5982 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 5983 { 5984 struct spdk_thread *thread; 5985 int rc; 5986 5987 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 5988 5989 thread = spdk_get_thread(); 5990 if (!thread) { 5991 /* The user called this from a non-SPDK thread. */ 5992 if (cb_fn != NULL) { 5993 cb_fn(cb_arg, -ENOTSUP); 5994 } 5995 return; 5996 } 5997 5998 pthread_mutex_lock(&g_bdev_mgr.mutex); 5999 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6000 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6001 if (cb_fn) { 6002 cb_fn(cb_arg, -EBUSY); 6003 } 6004 return; 6005 } 6006 6007 pthread_mutex_lock(&bdev->internal.mutex); 6008 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 6009 bdev->internal.unregister_cb = cb_fn; 6010 bdev->internal.unregister_ctx = cb_arg; 6011 6012 /* Call under lock. */ 6013 rc = bdev_unregister_unsafe(bdev); 6014 pthread_mutex_unlock(&bdev->internal.mutex); 6015 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6016 6017 if (rc == 0) { 6018 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6019 } 6020 } 6021 6022 static int 6023 bdev_start_qos(struct spdk_bdev *bdev) 6024 { 6025 struct set_qos_limit_ctx *ctx; 6026 6027 /* Enable QoS */ 6028 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 6029 ctx = calloc(1, sizeof(*ctx)); 6030 if (ctx == NULL) { 6031 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 6032 return -ENOMEM; 6033 } 6034 ctx->bdev = bdev; 6035 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6036 bdev_enable_qos_msg, ctx, 6037 bdev_enable_qos_done); 6038 } 6039 6040 return 0; 6041 } 6042 6043 static int 6044 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 6045 { 6046 struct spdk_thread *thread; 6047 int rc = 0; 6048 6049 thread = spdk_get_thread(); 6050 if (!thread) { 6051 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 6052 return -ENOTSUP; 6053 } 6054 6055 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6056 spdk_get_thread()); 6057 6058 desc->bdev = bdev; 6059 desc->thread = thread; 6060 desc->write = write; 6061 6062 pthread_mutex_lock(&bdev->internal.mutex); 6063 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6064 pthread_mutex_unlock(&bdev->internal.mutex); 6065 return -ENODEV; 6066 } 6067 6068 if (write && bdev->internal.claim_module) { 6069 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 6070 bdev->name, bdev->internal.claim_module->name); 6071 pthread_mutex_unlock(&bdev->internal.mutex); 6072 return -EPERM; 6073 } 6074 6075 rc = bdev_start_qos(bdev); 6076 if (rc != 0) { 6077 SPDK_ERRLOG("Failed 
to start QoS on bdev %s\n", bdev->name); 6078 pthread_mutex_unlock(&bdev->internal.mutex); 6079 return rc; 6080 } 6081 6082 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 6083 6084 pthread_mutex_unlock(&bdev->internal.mutex); 6085 6086 return 0; 6087 } 6088 6089 int 6090 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 6091 void *event_ctx, struct spdk_bdev_desc **_desc) 6092 { 6093 struct spdk_bdev_desc *desc; 6094 struct spdk_bdev *bdev; 6095 unsigned int event_id; 6096 int rc; 6097 6098 if (event_cb == NULL) { 6099 SPDK_ERRLOG("Missing event callback function\n"); 6100 return -EINVAL; 6101 } 6102 6103 pthread_mutex_lock(&g_bdev_mgr.mutex); 6104 6105 bdev = bdev_get_by_name(bdev_name); 6106 6107 if (bdev == NULL) { 6108 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 6109 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6110 return -ENODEV; 6111 } 6112 6113 desc = calloc(1, sizeof(*desc)); 6114 if (desc == NULL) { 6115 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 6116 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6117 return -ENOMEM; 6118 } 6119 6120 TAILQ_INIT(&desc->pending_media_events); 6121 TAILQ_INIT(&desc->free_media_events); 6122 6123 desc->callback.event_fn = event_cb; 6124 desc->callback.ctx = event_ctx; 6125 pthread_mutex_init(&desc->mutex, NULL); 6126 6127 if (bdev->media_events) { 6128 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 6129 sizeof(*desc->media_events_buffer)); 6130 if (desc->media_events_buffer == NULL) { 6131 SPDK_ERRLOG("Failed to initialize media event pool\n"); 6132 bdev_desc_free(desc); 6133 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6134 return -ENOMEM; 6135 } 6136 6137 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 6138 TAILQ_INSERT_TAIL(&desc->free_media_events, 6139 &desc->media_events_buffer[event_id], tailq); 6140 } 6141 } 6142 6143 rc = bdev_open(bdev, write, desc); 6144 if (rc != 0) { 6145 bdev_desc_free(desc); 6146 desc = NULL; 6147 } 6148 6149 *_desc = desc; 6150 6151 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6152 6153 return rc; 6154 } 6155 6156 void 6157 spdk_bdev_close(struct spdk_bdev_desc *desc) 6158 { 6159 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6160 int rc; 6161 6162 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6163 spdk_get_thread()); 6164 6165 assert(desc->thread == spdk_get_thread()); 6166 6167 spdk_poller_unregister(&desc->io_timeout_poller); 6168 6169 pthread_mutex_lock(&g_bdev_mgr.mutex); 6170 pthread_mutex_lock(&bdev->internal.mutex); 6171 pthread_mutex_lock(&desc->mutex); 6172 6173 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 6174 6175 desc->closed = true; 6176 6177 if (0 == desc->refs) { 6178 pthread_mutex_unlock(&desc->mutex); 6179 bdev_desc_free(desc); 6180 } else { 6181 pthread_mutex_unlock(&desc->mutex); 6182 } 6183 6184 /* If no more descriptors, kill QoS channel */ 6185 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6186 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 6187 bdev->name, spdk_get_thread()); 6188 6189 if (bdev_qos_destroy(bdev)) { 6190 /* There isn't anything we can do to recover here. Just let the 6191 * old QoS poller keep running. The QoS handling won't change 6192 * cores when the user allocates a new channel, but it won't break. */ 6193 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 6194 } 6195 } 6196 6197 spdk_bdev_set_qd_sampling_period(bdev, 0); 6198 6199 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6200 rc = bdev_unregister_unsafe(bdev); 6201 pthread_mutex_unlock(&bdev->internal.mutex); 6202 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6203 6204 if (rc == 0) { 6205 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6206 } 6207 } else { 6208 pthread_mutex_unlock(&bdev->internal.mutex); 6209 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6210 } 6211 } 6212 6213 int 6214 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 6215 struct spdk_bdev_module *module) 6216 { 6217 if (bdev->internal.claim_module != NULL) { 6218 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 6219 bdev->internal.claim_module->name); 6220 return -EPERM; 6221 } 6222 6223 if (desc && !desc->write) { 6224 desc->write = true; 6225 } 6226 6227 bdev->internal.claim_module = module; 6228 return 0; 6229 } 6230 6231 void 6232 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 6233 { 6234 assert(bdev->internal.claim_module != NULL); 6235 bdev->internal.claim_module = NULL; 6236 } 6237 6238 struct spdk_bdev * 6239 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 6240 { 6241 assert(desc != NULL); 6242 return desc->bdev; 6243 } 6244 6245 void 6246 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 6247 { 6248 struct iovec *iovs; 6249 int iovcnt; 6250 6251 if (bdev_io == NULL) { 6252 return; 6253 } 6254 6255 switch (bdev_io->type) { 6256 case SPDK_BDEV_IO_TYPE_READ: 6257 case SPDK_BDEV_IO_TYPE_WRITE: 6258 case SPDK_BDEV_IO_TYPE_ZCOPY: 6259 iovs = bdev_io->u.bdev.iovs; 6260 iovcnt = bdev_io->u.bdev.iovcnt; 6261 break; 6262 default: 6263 iovs = NULL; 6264 iovcnt = 0; 6265 break; 6266 } 6267 6268 if (iovp) { 6269 *iovp = iovs; 6270 } 6271 if (iovcntp) { 6272 *iovcntp = iovcnt; 6273 } 6274 } 6275 6276 void * 6277 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 6278 { 6279 if (bdev_io == NULL) { 6280 return NULL; 6281 } 6282 6283 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 6284 return NULL; 6285 } 6286 6287 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 6288 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 6289 return bdev_io->u.bdev.md_buf; 6290 } 6291 6292 return NULL; 6293 } 6294 6295 void * 6296 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 6297 { 6298 if (bdev_io == NULL) { 6299 assert(false); 6300 return NULL; 6301 } 6302 6303 return bdev_io->internal.caller_ctx; 6304 } 6305 6306 void 6307 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 6308 { 6309 6310 if (spdk_bdev_module_list_find(bdev_module->name)) { 6311 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 6312 assert(false); 6313 } 6314 6315 /* 6316 * Modules with examine callbacks must be initialized first, so they are 6317 * ready to handle examine callbacks from later modules that will 6318 * register physical bdevs. 
6319 */ 6320 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 6321 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 6322 } else { 6323 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 6324 } 6325 } 6326 6327 struct spdk_bdev_module * 6328 spdk_bdev_module_list_find(const char *name) 6329 { 6330 struct spdk_bdev_module *bdev_module; 6331 6332 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 6333 if (strcmp(name, bdev_module->name) == 0) { 6334 break; 6335 } 6336 } 6337 6338 return bdev_module; 6339 } 6340 6341 static void 6342 bdev_write_zero_buffer_next(void *_bdev_io) 6343 { 6344 struct spdk_bdev_io *bdev_io = _bdev_io; 6345 uint64_t num_bytes, num_blocks; 6346 void *md_buf = NULL; 6347 int rc; 6348 6349 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 6350 bdev_io->u.bdev.split_remaining_num_blocks, 6351 ZERO_BUFFER_SIZE); 6352 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 6353 6354 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 6355 md_buf = (char *)g_bdev_mgr.zero_buffer + 6356 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 6357 } 6358 6359 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 6360 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6361 g_bdev_mgr.zero_buffer, md_buf, 6362 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 6363 bdev_write_zero_buffer_done, bdev_io); 6364 if (rc == 0) { 6365 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 6366 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 6367 } else if (rc == -ENOMEM) { 6368 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 6369 } else { 6370 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6371 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6372 } 6373 } 6374 6375 static void 6376 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6377 { 6378 struct spdk_bdev_io *parent_io = cb_arg; 6379 6380 spdk_bdev_free_io(bdev_io); 6381 6382 if (!success) { 6383 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6384 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6385 return; 6386 } 6387 6388 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 6389 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6390 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 6391 return; 6392 } 6393 6394 bdev_write_zero_buffer_next(parent_io); 6395 } 6396 6397 static void 6398 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 6399 { 6400 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6401 ctx->bdev->internal.qos_mod_in_progress = false; 6402 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6403 6404 if (ctx->cb_fn) { 6405 ctx->cb_fn(ctx->cb_arg, status); 6406 } 6407 free(ctx); 6408 } 6409 6410 static void 6411 bdev_disable_qos_done(void *cb_arg) 6412 { 6413 struct set_qos_limit_ctx *ctx = cb_arg; 6414 struct spdk_bdev *bdev = ctx->bdev; 6415 struct spdk_bdev_io *bdev_io; 6416 struct spdk_bdev_qos *qos; 6417 6418 pthread_mutex_lock(&bdev->internal.mutex); 6419 qos = bdev->internal.qos; 6420 bdev->internal.qos = NULL; 6421 pthread_mutex_unlock(&bdev->internal.mutex); 6422 6423 while (!TAILQ_EMPTY(&qos->queued)) { 6424 /* Send queued I/O back to their original thread for resubmission. 
*/ 6425 bdev_io = TAILQ_FIRST(&qos->queued); 6426 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 6427 6428 if (bdev_io->internal.io_submit_ch) { 6429 /* 6430 * Channel was changed when sending it to the QoS thread - change it back 6431 * before sending it back to the original thread. 6432 */ 6433 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 6434 bdev_io->internal.io_submit_ch = NULL; 6435 } 6436 6437 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6438 _bdev_io_submit, bdev_io); 6439 } 6440 6441 if (qos->thread != NULL) { 6442 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 6443 spdk_poller_unregister(&qos->poller); 6444 } 6445 6446 free(qos); 6447 6448 bdev_set_qos_limit_done(ctx, 0); 6449 } 6450 6451 static void 6452 bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 6453 { 6454 void *io_device = spdk_io_channel_iter_get_io_device(i); 6455 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6456 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6457 struct spdk_thread *thread; 6458 6459 pthread_mutex_lock(&bdev->internal.mutex); 6460 thread = bdev->internal.qos->thread; 6461 pthread_mutex_unlock(&bdev->internal.mutex); 6462 6463 if (thread != NULL) { 6464 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 6465 } else { 6466 bdev_disable_qos_done(ctx); 6467 } 6468 } 6469 6470 static void 6471 bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 6472 { 6473 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6474 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6475 6476 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 6477 6478 spdk_for_each_channel_continue(i, 0); 6479 } 6480 6481 static void 6482 bdev_update_qos_rate_limit_msg(void *cb_arg) 6483 { 6484 struct set_qos_limit_ctx *ctx = cb_arg; 6485 struct spdk_bdev *bdev = ctx->bdev; 6486 6487 pthread_mutex_lock(&bdev->internal.mutex); 6488 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 6489 pthread_mutex_unlock(&bdev->internal.mutex); 6490 6491 bdev_set_qos_limit_done(ctx, 0); 6492 } 6493 6494 static void 6495 bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 6496 { 6497 void *io_device = spdk_io_channel_iter_get_io_device(i); 6498 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6499 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6500 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6501 6502 pthread_mutex_lock(&bdev->internal.mutex); 6503 bdev_enable_qos(bdev, bdev_ch); 6504 pthread_mutex_unlock(&bdev->internal.mutex); 6505 spdk_for_each_channel_continue(i, 0); 6506 } 6507 6508 static void 6509 bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 6510 { 6511 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6512 6513 bdev_set_qos_limit_done(ctx, status); 6514 } 6515 6516 static void 6517 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 6518 { 6519 int i; 6520 6521 assert(bdev->internal.qos != NULL); 6522 6523 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6524 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6525 bdev->internal.qos->rate_limits[i].limit = limits[i]; 6526 6527 if (limits[i] == 0) { 6528 bdev->internal.qos->rate_limits[i].limit = 6529 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 6530 } 6531 } 6532 } 6533 } 6534 6535 void 6536 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 6537 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 6538 { 6539 struct set_qos_limit_ctx *ctx; 6540 uint32_t 
limit_set_complement; 6541 uint64_t min_limit_per_sec; 6542 int i; 6543 bool disable_rate_limit = true; 6544 6545 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6546 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6547 continue; 6548 } 6549 6550 if (limits[i] > 0) { 6551 disable_rate_limit = false; 6552 } 6553 6554 if (bdev_qos_is_iops_rate_limit(i) == true) { 6555 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 6556 } else { 6557 /* Change from megabyte to byte rate limit */ 6558 limits[i] = limits[i] * 1024 * 1024; 6559 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 6560 } 6561 6562 limit_set_complement = limits[i] % min_limit_per_sec; 6563 if (limit_set_complement) { 6564 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 6565 limits[i], min_limit_per_sec); 6566 limits[i] += min_limit_per_sec - limit_set_complement; 6567 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 6568 } 6569 } 6570 6571 ctx = calloc(1, sizeof(*ctx)); 6572 if (ctx == NULL) { 6573 cb_fn(cb_arg, -ENOMEM); 6574 return; 6575 } 6576 6577 ctx->cb_fn = cb_fn; 6578 ctx->cb_arg = cb_arg; 6579 ctx->bdev = bdev; 6580 6581 pthread_mutex_lock(&bdev->internal.mutex); 6582 if (bdev->internal.qos_mod_in_progress) { 6583 pthread_mutex_unlock(&bdev->internal.mutex); 6584 free(ctx); 6585 cb_fn(cb_arg, -EAGAIN); 6586 return; 6587 } 6588 bdev->internal.qos_mod_in_progress = true; 6589 6590 if (disable_rate_limit == true && bdev->internal.qos) { 6591 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6592 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 6593 (bdev->internal.qos->rate_limits[i].limit > 0 && 6594 bdev->internal.qos->rate_limits[i].limit != 6595 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 6596 disable_rate_limit = false; 6597 break; 6598 } 6599 } 6600 } 6601 6602 if (disable_rate_limit == false) { 6603 if (bdev->internal.qos == NULL) { 6604 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 6605 if (!bdev->internal.qos) { 6606 pthread_mutex_unlock(&bdev->internal.mutex); 6607 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 6608 bdev_set_qos_limit_done(ctx, -ENOMEM); 6609 return; 6610 } 6611 } 6612 6613 if (bdev->internal.qos->thread == NULL) { 6614 /* Enabling */ 6615 bdev_set_qos_rate_limits(bdev, limits); 6616 6617 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6618 bdev_enable_qos_msg, ctx, 6619 bdev_enable_qos_done); 6620 } else { 6621 /* Updating */ 6622 bdev_set_qos_rate_limits(bdev, limits); 6623 6624 spdk_thread_send_msg(bdev->internal.qos->thread, 6625 bdev_update_qos_rate_limit_msg, ctx); 6626 } 6627 } else { 6628 if (bdev->internal.qos != NULL) { 6629 bdev_set_qos_rate_limits(bdev, limits); 6630 6631 /* Disabling */ 6632 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6633 bdev_disable_qos_msg, ctx, 6634 bdev_disable_qos_msg_done); 6635 } else { 6636 pthread_mutex_unlock(&bdev->internal.mutex); 6637 bdev_set_qos_limit_done(ctx, 0); 6638 return; 6639 } 6640 } 6641 6642 pthread_mutex_unlock(&bdev->internal.mutex); 6643 } 6644 6645 struct spdk_bdev_histogram_ctx { 6646 spdk_bdev_histogram_status_cb cb_fn; 6647 void *cb_arg; 6648 struct spdk_bdev *bdev; 6649 int status; 6650 }; 6651 6652 static void 6653 bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status) 6654 { 6655 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6656 6657 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6658 ctx->bdev->internal.histogram_in_progress = false; 6659 
pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6660 ctx->cb_fn(ctx->cb_arg, ctx->status); 6661 free(ctx); 6662 } 6663 6664 static void 6665 bdev_histogram_disable_channel(struct spdk_io_channel_iter *i) 6666 { 6667 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6668 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6669 6670 if (ch->histogram != NULL) { 6671 spdk_histogram_data_free(ch->histogram); 6672 ch->histogram = NULL; 6673 } 6674 spdk_for_each_channel_continue(i, 0); 6675 } 6676 6677 static void 6678 bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status) 6679 { 6680 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6681 6682 if (status != 0) { 6683 ctx->status = status; 6684 ctx->bdev->internal.histogram_enabled = false; 6685 spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx, 6686 bdev_histogram_disable_channel_cb); 6687 } else { 6688 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6689 ctx->bdev->internal.histogram_in_progress = false; 6690 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6691 ctx->cb_fn(ctx->cb_arg, ctx->status); 6692 free(ctx); 6693 } 6694 } 6695 6696 static void 6697 bdev_histogram_enable_channel(struct spdk_io_channel_iter *i) 6698 { 6699 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6700 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6701 int status = 0; 6702 6703 if (ch->histogram == NULL) { 6704 ch->histogram = spdk_histogram_data_alloc(); 6705 if (ch->histogram == NULL) { 6706 status = -ENOMEM; 6707 } 6708 } 6709 6710 spdk_for_each_channel_continue(i, status); 6711 } 6712 6713 void 6714 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 6715 void *cb_arg, bool enable) 6716 { 6717 struct spdk_bdev_histogram_ctx *ctx; 6718 6719 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 6720 if (ctx == NULL) { 6721 cb_fn(cb_arg, -ENOMEM); 6722 return; 6723 } 6724 6725 ctx->bdev = bdev; 6726 ctx->status = 0; 6727 ctx->cb_fn = cb_fn; 6728 ctx->cb_arg = cb_arg; 6729 6730 pthread_mutex_lock(&bdev->internal.mutex); 6731 if (bdev->internal.histogram_in_progress) { 6732 pthread_mutex_unlock(&bdev->internal.mutex); 6733 free(ctx); 6734 cb_fn(cb_arg, -EAGAIN); 6735 return; 6736 } 6737 6738 bdev->internal.histogram_in_progress = true; 6739 pthread_mutex_unlock(&bdev->internal.mutex); 6740 6741 bdev->internal.histogram_enabled = enable; 6742 6743 if (enable) { 6744 /* Allocate histogram for each channel */ 6745 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx, 6746 bdev_histogram_enable_channel_cb); 6747 } else { 6748 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx, 6749 bdev_histogram_disable_channel_cb); 6750 } 6751 } 6752 6753 struct spdk_bdev_histogram_data_ctx { 6754 spdk_bdev_histogram_data_cb cb_fn; 6755 void *cb_arg; 6756 struct spdk_bdev *bdev; 6757 /** merged histogram data from all channels */ 6758 struct spdk_histogram_data *histogram; 6759 }; 6760 6761 static void 6762 bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status) 6763 { 6764 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6765 6766 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 6767 free(ctx); 6768 } 6769 6770 static void 6771 bdev_histogram_get_channel(struct spdk_io_channel_iter *i) 6772 { 6773 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6774 struct spdk_bdev_channel *ch = 
spdk_io_channel_get_ctx(_ch); 6775 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6776 int status = 0; 6777 6778 if (ch->histogram == NULL) { 6779 status = -EFAULT; 6780 } else { 6781 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 6782 } 6783 6784 spdk_for_each_channel_continue(i, status); 6785 } 6786 6787 void 6788 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 6789 spdk_bdev_histogram_data_cb cb_fn, 6790 void *cb_arg) 6791 { 6792 struct spdk_bdev_histogram_data_ctx *ctx; 6793 6794 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 6795 if (ctx == NULL) { 6796 cb_fn(cb_arg, -ENOMEM, NULL); 6797 return; 6798 } 6799 6800 ctx->bdev = bdev; 6801 ctx->cb_fn = cb_fn; 6802 ctx->cb_arg = cb_arg; 6803 6804 ctx->histogram = histogram; 6805 6806 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx, 6807 bdev_histogram_get_channel_cb); 6808 } 6809 6810 size_t 6811 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 6812 size_t max_events) 6813 { 6814 struct media_event_entry *entry; 6815 size_t num_events = 0; 6816 6817 for (; num_events < max_events; ++num_events) { 6818 entry = TAILQ_FIRST(&desc->pending_media_events); 6819 if (entry == NULL) { 6820 break; 6821 } 6822 6823 events[num_events] = entry->event; 6824 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 6825 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 6826 } 6827 6828 return num_events; 6829 } 6830 6831 int 6832 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 6833 size_t num_events) 6834 { 6835 struct spdk_bdev_desc *desc; 6836 struct media_event_entry *entry; 6837 size_t event_id; 6838 int rc = 0; 6839 6840 assert(bdev->media_events); 6841 6842 pthread_mutex_lock(&bdev->internal.mutex); 6843 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 6844 if (desc->write) { 6845 break; 6846 } 6847 } 6848 6849 if (desc == NULL || desc->media_events_buffer == NULL) { 6850 rc = -ENODEV; 6851 goto out; 6852 } 6853 6854 for (event_id = 0; event_id < num_events; ++event_id) { 6855 entry = TAILQ_FIRST(&desc->free_media_events); 6856 if (entry == NULL) { 6857 break; 6858 } 6859 6860 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 6861 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 6862 entry->event = events[event_id]; 6863 } 6864 6865 rc = event_id; 6866 out: 6867 pthread_mutex_unlock(&bdev->internal.mutex); 6868 return rc; 6869 } 6870 6871 void 6872 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 6873 { 6874 struct spdk_bdev_desc *desc; 6875 6876 pthread_mutex_lock(&bdev->internal.mutex); 6877 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 6878 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 6879 desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, 6880 desc->callback.ctx); 6881 } 6882 } 6883 pthread_mutex_unlock(&bdev->internal.mutex); 6884 } 6885 6886 struct locked_lba_range_ctx { 6887 struct lba_range range; 6888 struct spdk_bdev *bdev; 6889 struct lba_range *current_range; 6890 struct lba_range *owner_range; 6891 struct spdk_poller *poller; 6892 lock_range_cb cb_fn; 6893 void *cb_arg; 6894 }; 6895 6896 static void 6897 bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status) 6898 { 6899 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6900 6901 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 6902 free(ctx); 6903 } 6904 6905 static void 6906 
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i); 6907 6908 static void 6909 bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 6910 { 6911 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6912 struct spdk_bdev *bdev = ctx->bdev; 6913 6914 if (status == -ENOMEM) { 6915 /* One of the channels could not allocate a range object. 6916 * So we have to go back and clean up any ranges that were 6917 * allocated successfully before we return error status to 6918 * the caller. We can reuse the unlock function to do that 6919 * clean up. 6920 */ 6921 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6922 bdev_unlock_lba_range_get_channel, ctx, 6923 bdev_lock_error_cleanup_cb); 6924 return; 6925 } 6926 6927 /* All channels have locked this range and no I/O overlapping the range 6928 * are outstanding! Set the owner_ch for the range object for the 6929 * locking channel, so that this channel will know that it is allowed 6930 * to write to this range. 6931 */ 6932 ctx->owner_range->owner_ch = ctx->range.owner_ch; 6933 ctx->cb_fn(ctx->cb_arg, status); 6934 6935 /* Don't free the ctx here. Its range is in the bdev's global list of 6936 * locked ranges still, and will be removed and freed when this range 6937 * is later unlocked. 6938 */ 6939 } 6940 6941 static int 6942 bdev_lock_lba_range_check_io(void *_i) 6943 { 6944 struct spdk_io_channel_iter *i = _i; 6945 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6946 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6947 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6948 struct lba_range *range = ctx->current_range; 6949 struct spdk_bdev_io *bdev_io; 6950 6951 spdk_poller_unregister(&ctx->poller); 6952 6953 /* The range is now in the locked_ranges, so no new IO can be submitted to this 6954 * range. But we need to wait until any outstanding IO overlapping with this range 6955 * are completed. 6956 */ 6957 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 6958 if (bdev_io_range_is_locked(bdev_io, range)) { 6959 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 6960 return SPDK_POLLER_BUSY; 6961 } 6962 } 6963 6964 spdk_for_each_channel_continue(i, 0); 6965 return SPDK_POLLER_BUSY; 6966 } 6967 6968 static void 6969 bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i) 6970 { 6971 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6972 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6973 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6974 struct lba_range *range; 6975 6976 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 6977 if (range->length == ctx->range.length && 6978 range->offset == ctx->range.offset && 6979 range->locked_ctx == ctx->range.locked_ctx) { 6980 /* This range already exists on this channel, so don't add 6981 * it again. This can happen when a new channel is created 6982 * while the for_each_channel operation is in progress. 6983 * Do not check for outstanding I/O in that case, since the 6984 * range was locked before any I/O could be submitted to the 6985 * new channel. 
6986 */ 6987 spdk_for_each_channel_continue(i, 0); 6988 return; 6989 } 6990 } 6991 6992 range = calloc(1, sizeof(*range)); 6993 if (range == NULL) { 6994 spdk_for_each_channel_continue(i, -ENOMEM); 6995 return; 6996 } 6997 6998 range->length = ctx->range.length; 6999 range->offset = ctx->range.offset; 7000 range->locked_ctx = ctx->range.locked_ctx; 7001 ctx->current_range = range; 7002 if (ctx->range.owner_ch == ch) { 7003 /* This is the range object for the channel that will hold 7004 * the lock. Store it in the ctx object so that we can easily 7005 * set its owner_ch after the lock is finally acquired. 7006 */ 7007 ctx->owner_range = range; 7008 } 7009 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 7010 bdev_lock_lba_range_check_io(i); 7011 } 7012 7013 static void 7014 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 7015 { 7016 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 7017 7018 /* We will add a copy of this range to each channel now. */ 7019 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx, 7020 bdev_lock_lba_range_cb); 7021 } 7022 7023 static bool 7024 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 7025 { 7026 struct lba_range *r; 7027 7028 TAILQ_FOREACH(r, tailq, tailq) { 7029 if (bdev_lba_range_overlapped(range, r)) { 7030 return true; 7031 } 7032 } 7033 return false; 7034 } 7035 7036 static int 7037 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 7038 uint64_t offset, uint64_t length, 7039 lock_range_cb cb_fn, void *cb_arg) 7040 { 7041 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7042 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7043 struct locked_lba_range_ctx *ctx; 7044 7045 if (cb_arg == NULL) { 7046 SPDK_ERRLOG("cb_arg must not be NULL\n"); 7047 return -EINVAL; 7048 } 7049 7050 ctx = calloc(1, sizeof(*ctx)); 7051 if (ctx == NULL) { 7052 return -ENOMEM; 7053 } 7054 7055 ctx->range.offset = offset; 7056 ctx->range.length = length; 7057 ctx->range.owner_ch = ch; 7058 ctx->range.locked_ctx = cb_arg; 7059 ctx->bdev = bdev; 7060 ctx->cb_fn = cb_fn; 7061 ctx->cb_arg = cb_arg; 7062 7063 pthread_mutex_lock(&bdev->internal.mutex); 7064 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 7065 /* There is an active lock overlapping with this range. 7066 * Put it on the pending list until this range no 7067 * longer overlaps with another. 7068 */ 7069 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 7070 } else { 7071 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 7072 bdev_lock_lba_range_ctx(bdev, ctx); 7073 } 7074 pthread_mutex_unlock(&bdev->internal.mutex); 7075 return 0; 7076 } 7077 7078 static void 7079 bdev_lock_lba_range_ctx_msg(void *_ctx) 7080 { 7081 struct locked_lba_range_ctx *ctx = _ctx; 7082 7083 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 7084 } 7085 7086 static void 7087 bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 7088 { 7089 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7090 struct locked_lba_range_ctx *pending_ctx; 7091 struct spdk_bdev_channel *ch = ctx->range.owner_ch; 7092 struct spdk_bdev *bdev = ch->bdev; 7093 struct lba_range *range, *tmp; 7094 7095 pthread_mutex_lock(&bdev->internal.mutex); 7096 /* Check if there are any pending locked ranges that overlap with this range 7097 * that was just unlocked. 
	 * If there are, check that it doesn't overlap with any
	 * other locked ranges before calling bdev_lock_lba_range_ctx which will start
	 * the lock process.
	 */
	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
		if (bdev_lba_range_overlapped(range, &ctx->range) &&
		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
			spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel),
					     bdev_lock_lba_range_ctx_msg, pending_ctx);
		}
	}
	pthread_mutex_unlock(&bdev->internal.mutex);

	ctx->cb_fn(ctx->cb_arg, status);
	free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	TAILQ_HEAD(, spdk_bdev_io) io_locked;
	struct spdk_bdev_io *bdev_io;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (ctx->range.offset == range->offset &&
		    ctx->range.length == range->length &&
		    ctx->range.locked_ctx == range->locked_ctx) {
			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
			free(range);
			break;
		}
	}

	/* Note: we should almost always be able to assert that the range specified
	 * was found. But there are some very rare corner cases where a new channel
	 * gets created simultaneously with a range unlock, where this function
	 * would execute on that new channel and wouldn't have the range.
	 * We also use this to clean up range allocations when a later allocation
	 * fails in the locking path.
	 * So we can't actually assert() here.
	 */

	/* Swap the locked IO into a temporary list, and then try to submit them again.
	 * We could hyper-optimize this to only resubmit locked I/O that overlap
	 * with the range that was just unlocked, but this isn't a performance path so
	 * we go for simplicity here.
	 */
	TAILQ_INIT(&io_locked);
	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
	while (!TAILQ_EMPTY(&io_locked)) {
		bdev_io = TAILQ_FIRST(&io_locked);
		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
		bdev_io_submit(bdev_io);
	}

	spdk_for_each_channel_continue(i, 0);
}
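/*
 * Illustrative sketch (not part of the original file): the expected calling
 * pattern for the internal bdev_lock_lba_range()/bdev_unlock_lba_range()
 * pair.  The callback names and the example_op context are hypothetical.
 * Note that the same cb_arg must be passed to both calls, since it doubles
 * as the locked_ctx used to match the range on unlock, and it must not be
 * NULL.
 *
 *	static void
 *	example_range_locked(void *cb_arg, int status)
 *	{
 *		struct example_op *op = cb_arg;
 *
 *		if (status != 0) {
 *			example_op_fail(op, status);
 *			return;
 *		}
 *
 *		(no overlapping I/O is outstanding here; modify the range, then)
 *		bdev_unlock_lba_range(op->desc, op->ch, op->offset, op->length,
 *				      example_range_unlocked, op);
 *	}
 *
 *	rc = bdev_lock_lba_range(desc, ch, offset, length, example_range_locked, op);
 */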
static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx;
	struct lba_range *range;
	bool range_found = false;

	/* Let's make sure the specified channel actually has a lock on
	 * the specified range. Note that the range must match exactly.
	 */
	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
			range_found = true;
			break;
		}
	}

	if (!range_found) {
		return -EINVAL;
	}

	pthread_mutex_lock(&bdev->internal.mutex);
	/* We confirmed that this channel has locked the specified range. To
	 * start the unlock process, we find the range in the bdev's locked_ranges
	 * and remove it. This ensures new channels don't inherit the locked range.
	 * Then we will send a message to each channel (including the one specified
	 * here) to remove the range from its per-channel list.
	 */
	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->locked_ctx == cb_arg) {
			break;
		}
	}
	if (range == NULL) {
		assert(false);
		pthread_mutex_unlock(&bdev->internal.mutex);
		return -EINVAL;
	}
	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
	pthread_mutex_unlock(&bdev->internal.mutex);

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx,
			      bdev_unlock_lba_range_cb);
	return 0;
}

int
spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
			     int array_size)
{
	if (!bdev) {
		return -EINVAL;
	}

	if (bdev->fn_table->get_memory_domains) {
		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
	}

	return 0;
}

SPDK_LOG_REGISTER_COMPONENT(bdev)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_IO_START", TRACE_BDEV_IO_START,
			OWNER_BDEV, OBJECT_BDEV_IO, 1,
			{
				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
		{
			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
			OWNER_BDEV, OBJECT_BDEV_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
			OWNER_BDEV, OBJECT_NONE, 1,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
			}
		},
		{
			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
			OWNER_BDEV, OBJECT_NONE, 0,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
			}
		},
	};

	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
}
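/*
 * Illustrative sketch (not part of the original file): querying which memory
 * domains a bdev can address, e.g. before deciding whether buffers owned by
 * a remote memory domain can be passed through without a local bounce copy.
 * The array size of 8 is arbitrary; per spdk_bdev_get_memory_domains() above,
 * a bdev whose module does not provide get_memory_domains simply reports 0.
 *
 *	struct spdk_memory_domain *domains[8];
 *	int rc;
 *
 *	rc = spdk_bdev_get_memory_domains(bdev, domains, SPDK_COUNTOF(domains));
 *	if (rc < 0) {
 *		(handle the error)
 *	} else if (rc == 0) {
 *		(the bdev only accepts locally accessible memory)
 *	}
 */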