/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

#define SPDK_BDEV_POOL_ALIGNMENT 512

/* The maximum number of child requests generated per round for a UNMAP or
 * WRITE ZEROES command when splitting it into child requests.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
	"rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
};

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	pthread_mutex_t mutex;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain	*domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.mutex = PTHREAD_MUTEX_INITIALIZER,
};

typedef void (*lock_range_cb)(void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	uint64_t			offset;
	uint64_t			length;
	void				*locked_ctx;
	struct spdk_bdev_channel	*owner_ch;
	TAILQ_ENTRY(lba_range)		tailq;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size =
SPDK_BDEV_IO_CACHE_SIZE, 122 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 123 .small_buf_pool_size = BUF_SMALL_POOL_SIZE, 124 .large_buf_pool_size = BUF_LARGE_POOL_SIZE, 125 }; 126 127 static spdk_bdev_init_cb g_init_cb_fn = NULL; 128 static void *g_init_cb_arg = NULL; 129 130 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 131 static void *g_fini_cb_arg = NULL; 132 static struct spdk_thread *g_fini_thread = NULL; 133 134 struct spdk_bdev_qos_limit { 135 /** IOs or bytes allowed per second (i.e., 1s). */ 136 uint64_t limit; 137 138 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 139 * For remaining bytes, allowed to run negative if an I/O is submitted when 140 * some bytes are remaining, but the I/O is bigger than that amount. The 141 * excess will be deducted from the next timeslice. 142 */ 143 int64_t remaining_this_timeslice; 144 145 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 146 uint32_t min_per_timeslice; 147 148 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 149 uint32_t max_per_timeslice; 150 151 /** Function to check whether to queue the IO. */ 152 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 153 154 /** Function to update for the submitted IO. */ 155 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 156 }; 157 158 struct spdk_bdev_qos { 159 /** Types of structure of rate limits. */ 160 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 161 162 /** The channel that all I/O are funneled through. */ 163 struct spdk_bdev_channel *ch; 164 165 /** The thread on which the poller is running. */ 166 struct spdk_thread *thread; 167 168 /** Queue of I/O waiting to be issued. */ 169 bdev_io_tailq_t queued; 170 171 /** Size of a timeslice in tsc ticks. */ 172 uint64_t timeslice_size; 173 174 /** Timestamp of start of last timeslice. */ 175 uint64_t last_timeslice; 176 177 /** Poller that processes queued I/O commands each time slice. */ 178 struct spdk_poller *poller; 179 }; 180 181 struct spdk_bdev_mgmt_channel { 182 bdev_io_stailq_t need_buf_small; 183 bdev_io_stailq_t need_buf_large; 184 185 /* 186 * Each thread keeps a cache of bdev_io - this allows 187 * bdev threads which are *not* DPDK threads to still 188 * benefit from a per-thread bdev_io cache. Without 189 * this, non-DPDK threads fetching from the mempool 190 * incur a cmpxchg on get and put. 191 */ 192 bdev_io_stailq_t per_thread_cache; 193 uint32_t per_thread_cache_count; 194 uint32_t bdev_io_cache_size; 195 196 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 197 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 198 }; 199 200 /* 201 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 202 * will queue here their IO that awaits retry. It makes it possible to retry sending 203 * IO to one bdev after IO from other bdev completes. 204 */ 205 struct spdk_bdev_shared_resource { 206 /* The bdev management channel */ 207 struct spdk_bdev_mgmt_channel *mgmt_ch; 208 209 /* 210 * Count of I/O submitted to bdev module and waiting for completion. 211 * Incremented before submit_request() is called on an spdk_bdev_io. 212 */ 213 uint64_t io_outstanding; 214 215 /* 216 * Queue of IO awaiting retry because of a previous NOMEM status returned 217 * on this channel. 218 */ 219 bdev_io_tailq_t nomem_io; 220 221 /* 222 * Threshold which io_outstanding must drop to before retrying nomem_io. 
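	 * The threshold is recalculated each time an I/O fails with NOMEM: it is
	 * set to the larger of half the outstanding I/O count and (outstanding
	 * count - NOMEM_THRESHOLD_COUNT), so deep queues drain by a fixed amount
	 * and shallow queues drain by half before bdev_ch_retry_io() resubmits
	 * anything.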
223 */ 224 uint64_t nomem_threshold; 225 226 /* I/O channel allocated by a bdev module */ 227 struct spdk_io_channel *shared_ch; 228 229 /* Refcount of bdev channels using this resource */ 230 uint32_t ref; 231 232 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 233 }; 234 235 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 236 #define BDEV_CH_QOS_ENABLED (1 << 1) 237 238 struct spdk_bdev_channel { 239 struct spdk_bdev *bdev; 240 241 /* The channel for the underlying device */ 242 struct spdk_io_channel *channel; 243 244 /* Per io_device per thread data */ 245 struct spdk_bdev_shared_resource *shared_resource; 246 247 struct spdk_bdev_io_stat stat; 248 249 /* 250 * Count of I/O submitted to the underlying dev module through this channel 251 * and waiting for completion. 252 */ 253 uint64_t io_outstanding; 254 255 /* 256 * List of all submitted I/Os including I/O that are generated via splitting. 257 */ 258 bdev_io_tailq_t io_submitted; 259 260 /* 261 * List of spdk_bdev_io that are currently queued because they write to a locked 262 * LBA range. 263 */ 264 bdev_io_tailq_t io_locked; 265 266 uint32_t flags; 267 268 struct spdk_histogram_data *histogram; 269 270 #ifdef SPDK_CONFIG_VTUNE 271 uint64_t start_tsc; 272 uint64_t interval_tsc; 273 __itt_string_handle *handle; 274 struct spdk_bdev_io_stat prev_stat; 275 #endif 276 277 bdev_io_tailq_t queued_resets; 278 279 lba_range_tailq_t locked_ranges; 280 }; 281 282 struct media_event_entry { 283 struct spdk_bdev_media_event event; 284 TAILQ_ENTRY(media_event_entry) tailq; 285 }; 286 287 #define MEDIA_EVENT_POOL_SIZE 64 288 289 struct spdk_bdev_desc { 290 struct spdk_bdev *bdev; 291 struct spdk_thread *thread; 292 struct { 293 spdk_bdev_event_cb_t event_fn; 294 void *ctx; 295 } callback; 296 bool closed; 297 bool write; 298 bool memory_domains_supported; 299 pthread_mutex_t mutex; 300 uint32_t refs; 301 TAILQ_HEAD(, media_event_entry) pending_media_events; 302 TAILQ_HEAD(, media_event_entry) free_media_events; 303 struct media_event_entry *media_events_buffer; 304 TAILQ_ENTRY(spdk_bdev_desc) link; 305 306 uint64_t timeout_in_sec; 307 spdk_bdev_io_timeout_cb cb_fn; 308 void *cb_arg; 309 struct spdk_poller *io_timeout_poller; 310 }; 311 312 struct spdk_bdev_iostat_ctx { 313 struct spdk_bdev_io_stat *stat; 314 spdk_bdev_get_device_stat_cb cb; 315 void *cb_arg; 316 }; 317 318 struct set_qos_limit_ctx { 319 void (*cb_fn)(void *cb_arg, int status); 320 void *cb_arg; 321 struct spdk_bdev *bdev; 322 }; 323 324 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 325 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 326 327 static inline void bdev_io_complete(void *ctx); 328 329 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 330 static void bdev_write_zero_buffer_next(void *_bdev_io); 331 332 static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i); 333 static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status); 334 335 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 336 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 337 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 338 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 339 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 340 struct iovec *iov, int iovcnt, void *md_buf, 341 uint64_t offset_blocks, uint64_t num_blocks, 342 spdk_bdev_io_completion_cb cb, void *cb_arg, 343 
struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 344 345 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 346 uint64_t offset, uint64_t length, 347 lock_range_cb cb_fn, void *cb_arg); 348 349 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 350 uint64_t offset, uint64_t length, 351 lock_range_cb cb_fn, void *cb_arg); 352 353 static inline void bdev_io_complete(void *ctx); 354 355 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 356 static bool bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort); 357 358 void 359 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 360 { 361 if (!opts) { 362 SPDK_ERRLOG("opts should not be NULL\n"); 363 return; 364 } 365 366 if (!opts_size) { 367 SPDK_ERRLOG("opts_size should not be zero value\n"); 368 return; 369 } 370 371 opts->opts_size = opts_size; 372 373 #define SET_FIELD(field) \ 374 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 375 opts->field = g_bdev_opts.field; \ 376 } \ 377 378 SET_FIELD(bdev_io_pool_size); 379 SET_FIELD(bdev_io_cache_size); 380 SET_FIELD(bdev_auto_examine); 381 SET_FIELD(small_buf_pool_size); 382 SET_FIELD(large_buf_pool_size); 383 384 /* Do not remove this statement, you should always update this statement when you adding a new field, 385 * and do not forget to add the SET_FIELD statement for your added field. */ 386 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 387 388 #undef SET_FIELD 389 } 390 391 int 392 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 393 { 394 uint32_t min_pool_size; 395 396 if (!opts) { 397 SPDK_ERRLOG("opts cannot be NULL\n"); 398 return -1; 399 } 400 401 if (!opts->opts_size) { 402 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 403 return -1; 404 } 405 406 /* 407 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 408 * initialization. A second mgmt_ch will be created on the same thread when the application starts 409 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
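	 * For example, with the default bdev_io_cache_size of 256 and an
	 * application running three SPDK threads, bdev_io_pool_size must be at
	 * least 256 * (3 + 1) = 1024 bdev_ios.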
410 */ 411 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 412 if (opts->bdev_io_pool_size < min_pool_size) { 413 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 414 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 415 spdk_thread_get_count()); 416 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 417 return -1; 418 } 419 420 if (opts->small_buf_pool_size < BUF_SMALL_POOL_SIZE) { 421 SPDK_ERRLOG("small_buf_pool_size must be at least %" PRIu32 "\n", BUF_SMALL_POOL_SIZE); 422 return -1; 423 } 424 425 if (opts->large_buf_pool_size < BUF_LARGE_POOL_SIZE) { 426 SPDK_ERRLOG("large_buf_pool_size must be at least %" PRIu32 "\n", BUF_LARGE_POOL_SIZE); 427 return -1; 428 } 429 430 #define SET_FIELD(field) \ 431 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 432 g_bdev_opts.field = opts->field; \ 433 } \ 434 435 SET_FIELD(bdev_io_pool_size); 436 SET_FIELD(bdev_io_cache_size); 437 SET_FIELD(bdev_auto_examine); 438 SET_FIELD(small_buf_pool_size); 439 SET_FIELD(large_buf_pool_size); 440 441 g_bdev_opts.opts_size = opts->opts_size; 442 443 #undef SET_FIELD 444 445 return 0; 446 } 447 448 static struct spdk_bdev * 449 bdev_get_by_name(const char *bdev_name) 450 { 451 struct spdk_bdev_name find; 452 struct spdk_bdev_name *res; 453 454 find.name = (char *)bdev_name; 455 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 456 if (res != NULL) { 457 return res->bdev; 458 } 459 460 return NULL; 461 } 462 463 struct spdk_bdev * 464 spdk_bdev_get_by_name(const char *bdev_name) 465 { 466 struct spdk_bdev *bdev; 467 468 pthread_mutex_lock(&g_bdev_mgr.mutex); 469 bdev = bdev_get_by_name(bdev_name); 470 pthread_mutex_unlock(&g_bdev_mgr.mutex); 471 472 return bdev; 473 } 474 475 struct spdk_bdev_wait_for_examine_ctx { 476 struct spdk_poller *poller; 477 spdk_bdev_wait_for_examine_cb cb_fn; 478 void *cb_arg; 479 }; 480 481 static bool bdev_module_all_actions_completed(void); 482 483 static int 484 bdev_wait_for_examine_cb(void *arg) 485 { 486 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 487 488 if (!bdev_module_all_actions_completed()) { 489 return SPDK_POLLER_IDLE; 490 } 491 492 spdk_poller_unregister(&ctx->poller); 493 ctx->cb_fn(ctx->cb_arg); 494 free(ctx); 495 496 return SPDK_POLLER_BUSY; 497 } 498 499 int 500 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 501 { 502 struct spdk_bdev_wait_for_examine_ctx *ctx; 503 504 ctx = calloc(1, sizeof(*ctx)); 505 if (ctx == NULL) { 506 return -ENOMEM; 507 } 508 ctx->cb_fn = cb_fn; 509 ctx->cb_arg = cb_arg; 510 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 511 512 return 0; 513 } 514 515 struct spdk_bdev_examine_item { 516 char *name; 517 TAILQ_ENTRY(spdk_bdev_examine_item) link; 518 }; 519 520 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 521 522 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 523 g_bdev_examine_allowlist); 524 525 static inline bool 526 bdev_examine_allowlist_check(const char *name) 527 { 528 struct spdk_bdev_examine_item *item; 529 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 530 if (strcmp(name, item->name) == 0) { 531 return true; 532 } 533 } 534 return false; 535 } 536 537 static inline void 538 bdev_examine_allowlist_free(void) 539 { 540 struct spdk_bdev_examine_item *item; 541 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 542 item = 
TAILQ_FIRST(&g_bdev_examine_allowlist); 543 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 544 free(item->name); 545 free(item); 546 } 547 } 548 549 static inline bool 550 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 551 { 552 struct spdk_bdev_alias *tmp; 553 if (bdev_examine_allowlist_check(bdev->name)) { 554 return true; 555 } 556 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 557 if (bdev_examine_allowlist_check(tmp->alias.name)) { 558 return true; 559 } 560 } 561 return false; 562 } 563 564 static inline bool 565 bdev_ok_to_examine(struct spdk_bdev *bdev) 566 { 567 if (g_bdev_opts.bdev_auto_examine) { 568 return true; 569 } else { 570 return bdev_in_examine_allowlist(bdev); 571 } 572 } 573 574 static void 575 bdev_examine(struct spdk_bdev *bdev) 576 { 577 struct spdk_bdev_module *module; 578 uint32_t action; 579 580 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 581 if (module->examine_config && bdev_ok_to_examine(bdev)) { 582 action = module->internal.action_in_progress; 583 module->internal.action_in_progress++; 584 module->examine_config(bdev); 585 if (action != module->internal.action_in_progress) { 586 SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", 587 module->name); 588 } 589 } 590 } 591 592 if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) { 593 if (bdev->internal.claim_module->examine_disk) { 594 bdev->internal.claim_module->internal.action_in_progress++; 595 bdev->internal.claim_module->examine_disk(bdev); 596 } 597 return; 598 } 599 600 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 601 if (module->examine_disk && bdev_ok_to_examine(bdev)) { 602 module->internal.action_in_progress++; 603 module->examine_disk(bdev); 604 } 605 } 606 } 607 608 int 609 spdk_bdev_examine(const char *name) 610 { 611 struct spdk_bdev *bdev; 612 struct spdk_bdev_examine_item *item; 613 614 if (g_bdev_opts.bdev_auto_examine) { 615 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 616 return -EINVAL; 617 } 618 619 if (bdev_examine_allowlist_check(name)) { 620 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 621 return -EEXIST; 622 } 623 624 item = calloc(1, sizeof(*item)); 625 if (!item) { 626 return -ENOMEM; 627 } 628 item->name = strdup(name); 629 if (!item->name) { 630 free(item); 631 return -ENOMEM; 632 } 633 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 634 635 bdev = spdk_bdev_get_by_name(name); 636 if (bdev) { 637 bdev_examine(bdev); 638 } 639 return 0; 640 } 641 642 static inline void 643 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 644 { 645 struct spdk_bdev_examine_item *item; 646 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 647 spdk_json_write_object_begin(w); 648 spdk_json_write_named_string(w, "method", "bdev_examine"); 649 spdk_json_write_named_object_begin(w, "params"); 650 spdk_json_write_named_string(w, "name", item->name); 651 spdk_json_write_object_end(w); 652 spdk_json_write_object_end(w); 653 } 654 } 655 656 struct spdk_bdev * 657 spdk_bdev_first(void) 658 { 659 struct spdk_bdev *bdev; 660 661 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 662 if (bdev) { 663 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 664 } 665 666 return bdev; 667 } 668 669 struct spdk_bdev * 670 spdk_bdev_next(struct spdk_bdev *prev) 671 { 672 struct spdk_bdev *bdev; 673 674 bdev = TAILQ_NEXT(prev, internal.link); 675 if (bdev) { 676 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 677 } 678 
679 return bdev; 680 } 681 682 static struct spdk_bdev * 683 _bdev_next_leaf(struct spdk_bdev *bdev) 684 { 685 while (bdev != NULL) { 686 if (bdev->internal.claim_module == NULL) { 687 return bdev; 688 } else { 689 bdev = TAILQ_NEXT(bdev, internal.link); 690 } 691 } 692 693 return bdev; 694 } 695 696 struct spdk_bdev * 697 spdk_bdev_first_leaf(void) 698 { 699 struct spdk_bdev *bdev; 700 701 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 702 703 if (bdev) { 704 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 705 } 706 707 return bdev; 708 } 709 710 struct spdk_bdev * 711 spdk_bdev_next_leaf(struct spdk_bdev *prev) 712 { 713 struct spdk_bdev *bdev; 714 715 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 716 717 if (bdev) { 718 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 719 } 720 721 return bdev; 722 } 723 724 static inline bool 725 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 726 { 727 return bdev_io->internal.ext_opts && bdev_io->internal.ext_opts->memory_domain; 728 } 729 730 void 731 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 732 { 733 struct iovec *iovs; 734 735 if (bdev_io->u.bdev.iovs == NULL) { 736 bdev_io->u.bdev.iovs = &bdev_io->iov; 737 bdev_io->u.bdev.iovcnt = 1; 738 } 739 740 iovs = bdev_io->u.bdev.iovs; 741 742 assert(iovs != NULL); 743 assert(bdev_io->u.bdev.iovcnt >= 1); 744 745 iovs[0].iov_base = buf; 746 iovs[0].iov_len = len; 747 } 748 749 void 750 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 751 { 752 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 753 bdev_io->u.bdev.md_buf = md_buf; 754 } 755 756 static bool 757 _is_buf_allocated(const struct iovec *iovs) 758 { 759 if (iovs == NULL) { 760 return false; 761 } 762 763 return iovs[0].iov_base != NULL; 764 } 765 766 static bool 767 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 768 { 769 int i; 770 uintptr_t iov_base; 771 772 if (spdk_likely(alignment == 1)) { 773 return true; 774 } 775 776 for (i = 0; i < iovcnt; i++) { 777 iov_base = (uintptr_t)iovs[i].iov_base; 778 if ((iov_base & (alignment - 1)) != 0) { 779 return false; 780 } 781 } 782 783 return true; 784 } 785 786 static void 787 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 788 { 789 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 790 void *buf; 791 792 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 793 buf = bdev_io->internal.buf; 794 bdev_io->internal.buf = NULL; 795 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 796 bdev_io->internal.get_aux_buf_cb = NULL; 797 } else { 798 assert(bdev_io->internal.get_buf_cb != NULL); 799 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 800 bdev_io->internal.get_buf_cb = NULL; 801 } 802 } 803 804 static void 805 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 806 { 807 struct spdk_bdev_io *bdev_io = ctx; 808 809 if (rc) { 810 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 811 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 812 } 813 bdev_io_get_buf_complete(bdev_io, !rc); 814 } 815 816 static void 817 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 818 { 819 int rc = 0; 820 821 /* save original md_buf */ 822 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 823 bdev_io->internal.orig_md_iov.iov_len = len; 824 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 825 bdev_io->internal.bounce_md_iov.iov_len = 
len; 826 /* set bounce md_buf */ 827 bdev_io->u.bdev.md_buf = md_buf; 828 829 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 830 if (bdev_io_use_memory_domain(bdev_io)) { 831 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 832 bdev_io->internal.ext_opts->memory_domain_ctx, 833 &bdev_io->internal.orig_md_iov, 1, 834 &bdev_io->internal.bounce_md_iov, 1, 835 bdev_io->internal.data_transfer_cpl, 836 bdev_io); 837 if (rc == 0) { 838 /* Continue to submit IO in completion callback */ 839 return; 840 } 841 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 842 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain), rc); 843 } else { 844 memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); 845 } 846 } 847 848 assert(bdev_io->internal.data_transfer_cpl); 849 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 850 } 851 852 static void 853 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 854 { 855 struct spdk_bdev *bdev = bdev_io->bdev; 856 uint64_t md_len; 857 void *buf; 858 859 if (spdk_bdev_is_md_separate(bdev)) { 860 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 861 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 862 863 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 864 865 if (bdev_io->u.bdev.md_buf != NULL) { 866 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 867 return; 868 } else { 869 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 870 } 871 } 872 873 bdev_io_get_buf_complete(bdev_io, true); 874 } 875 876 static void 877 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) 878 { 879 struct spdk_bdev_io *bdev_io = ctx; 880 881 if (rc) { 882 SPDK_ERRLOG("Failed to get data buffer\n"); 883 assert(bdev_io->internal.data_transfer_cpl); 884 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 885 return; 886 } 887 888 _bdev_io_set_md_buf(bdev_io); 889 } 890 891 static void 892 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 893 bdev_copy_bounce_buffer_cpl cpl_cb) 894 { 895 int rc = 0; 896 897 bdev_io->internal.data_transfer_cpl = cpl_cb; 898 /* save original iovec */ 899 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 900 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 901 /* set bounce iov */ 902 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 903 bdev_io->u.bdev.iovcnt = 1; 904 /* set bounce buffer for this operation */ 905 bdev_io->u.bdev.iovs[0].iov_base = buf; 906 bdev_io->u.bdev.iovs[0].iov_len = len; 907 /* if this is write path, copy data from original buffer to bounce buffer */ 908 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 909 if (bdev_io_use_memory_domain(bdev_io)) { 910 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 911 bdev_io->internal.ext_opts->memory_domain_ctx, 912 bdev_io->internal.orig_iovs, 913 (uint32_t) bdev_io->internal.orig_iovcnt, 914 bdev_io->u.bdev.iovs, 1, 915 _bdev_io_pull_bounce_data_buf_done, 916 bdev_io); 917 if (rc == 0) { 918 /* Continue to submit IO in completion callback */ 919 return; 920 } 921 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 922 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 923 } else { 924 spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 925 } 926 } 927 928 _bdev_io_pull_bounce_data_buf_done(bdev_io, rc); 929 } 930 931 static void 932 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, 
uint64_t len) 933 { 934 struct spdk_bdev *bdev = bdev_io->bdev; 935 bool buf_allocated; 936 uint64_t alignment; 937 void *aligned_buf; 938 939 bdev_io->internal.buf = buf; 940 941 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 942 bdev_io_get_buf_complete(bdev_io, true); 943 return; 944 } 945 946 alignment = spdk_bdev_get_buf_align(bdev); 947 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 948 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 949 950 if (buf_allocated) { 951 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 952 /* Continue in completion callback */ 953 return; 954 } else { 955 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 956 } 957 958 _bdev_io_set_md_buf(bdev_io); 959 } 960 961 static void 962 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 963 { 964 struct spdk_bdev *bdev = bdev_io->bdev; 965 struct spdk_mempool *pool; 966 struct spdk_bdev_io *tmp; 967 bdev_io_stailq_t *stailq; 968 struct spdk_bdev_mgmt_channel *ch; 969 uint64_t md_len, alignment; 970 971 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 972 alignment = spdk_bdev_get_buf_align(bdev); 973 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 974 975 if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 976 SPDK_BDEV_POOL_ALIGNMENT) { 977 pool = g_bdev_mgr.buf_small_pool; 978 stailq = &ch->need_buf_small; 979 } else { 980 pool = g_bdev_mgr.buf_large_pool; 981 stailq = &ch->need_buf_large; 982 } 983 984 if (STAILQ_EMPTY(stailq)) { 985 spdk_mempool_put(pool, buf); 986 } else { 987 tmp = STAILQ_FIRST(stailq); 988 STAILQ_REMOVE_HEAD(stailq, internal.buf_link); 989 _bdev_io_set_buf(tmp, buf, tmp->internal.buf_len); 990 } 991 } 992 993 static void 994 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 995 { 996 assert(bdev_io->internal.buf != NULL); 997 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 998 bdev_io->internal.buf = NULL; 999 } 1000 1001 void 1002 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1003 { 1004 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1005 1006 assert(buf != NULL); 1007 _bdev_io_put_buf(bdev_io, buf, len); 1008 } 1009 1010 static void 1011 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1012 { 1013 struct spdk_bdev *bdev = bdev_ch->bdev; 1014 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1015 struct spdk_bdev_io *bdev_io; 1016 1017 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1018 /* 1019 * Allow some more I/O to complete before retrying the nomem_io queue. 1020 * Some drivers (such as nvme) cannot immediately take a new I/O in 1021 * the context of a completion, because the resources for the I/O are 1022 * not released until control returns to the bdev poller. Also, we 1023 * may require several small I/O to complete before a larger I/O 1024 * (that requires splitting) can be submitted. 
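		 * Resubmission therefore waits until io_outstanding has dropped to
		 * nomem_threshold or below; the loop below then drains nomem_io from
		 * the head until a submission hits SPDK_BDEV_IO_STATUS_NOMEM again.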
1025 */ 1026 return; 1027 } 1028 1029 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1030 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1031 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1032 bdev_io->internal.ch->io_outstanding++; 1033 shared_resource->io_outstanding++; 1034 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1035 bdev_io->internal.error.nvme.cdw0 = 0; 1036 bdev_io->num_retries++; 1037 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1038 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 1039 break; 1040 } 1041 } 1042 } 1043 1044 static inline void 1045 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1046 struct spdk_bdev_shared_resource *shared_resource) 1047 { 1048 assert(bdev_ch->io_outstanding > 0); 1049 assert(shared_resource->io_outstanding > 0); 1050 bdev_ch->io_outstanding--; 1051 shared_resource->io_outstanding--; 1052 } 1053 1054 static inline bool 1055 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io) 1056 { 1057 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1058 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1059 1060 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1061 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 1062 /* 1063 * Wait for some of the outstanding I/O to complete before we 1064 * retry any of the nomem_io. Normally we will wait for 1065 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 1066 * depth channels we will instead wait for half to complete. 1067 */ 1068 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 1069 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 1070 return true; 1071 } 1072 1073 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1074 bdev_ch_retry_io(bdev_ch); 1075 } 1076 1077 return false; 1078 } 1079 1080 static void 1081 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1082 { 1083 struct spdk_bdev_io *bdev_io = ctx; 1084 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1085 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1086 1087 if (rc) { 1088 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1089 } 1090 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1091 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 
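	 * Returning it now also lets any bdev_io queued on the mgmt channel's
	 * need_buf_small/need_buf_large list pick the buffer up immediately in
	 * _bdev_io_put_buf().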
1092 */ 1093 bdev_io_put_buf(bdev_io); 1094 1095 /* Continue with IO completion flow */ 1096 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 1097 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 1098 return; 1099 } 1100 1101 bdev_io_complete(bdev_io); 1102 } 1103 1104 static inline void 1105 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) 1106 { 1107 int rc = 0; 1108 1109 /* do the same for metadata buffer */ 1110 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1111 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1112 1113 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1114 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1115 if (bdev_io_use_memory_domain(bdev_io)) { 1116 /* If memory domain is used then we need to call async push function */ 1117 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1118 bdev_io->internal.ext_opts->memory_domain_ctx, 1119 &bdev_io->internal.orig_md_iov, 1120 (uint32_t)bdev_io->internal.orig_iovcnt, 1121 &bdev_io->internal.bounce_md_iov, 1, 1122 bdev_io->internal.data_transfer_cpl, 1123 bdev_io); 1124 if (rc == 0) { 1125 /* Continue IO completion in async callback */ 1126 return; 1127 } 1128 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1129 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1130 } else { 1131 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1132 bdev_io->internal.orig_md_iov.iov_len); 1133 } 1134 } 1135 } 1136 1137 assert(bdev_io->internal.data_transfer_cpl); 1138 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1139 } 1140 1141 static void 1142 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) 1143 { 1144 struct spdk_bdev_io *bdev_io = ctx; 1145 1146 assert(bdev_io->internal.data_transfer_cpl); 1147 1148 if (rc) { 1149 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1150 return; 1151 } 1152 1153 /* set original buffer for this io */ 1154 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1155 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1156 /* disable bouncing buffer for this io */ 1157 bdev_io->internal.orig_iovcnt = 0; 1158 bdev_io->internal.orig_iovs = NULL; 1159 1160 _bdev_io_push_bounce_md_buffer(bdev_io); 1161 } 1162 1163 static inline void 1164 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1165 { 1166 int rc = 0; 1167 1168 bdev_io->internal.data_transfer_cpl = cpl_cb; 1169 1170 /* if this is read path, copy data from bounce buffer to original buffer */ 1171 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1172 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1173 if (bdev_io_use_memory_domain(bdev_io)) { 1174 /* If memory domain is used then we need to call async push function */ 1175 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1176 bdev_io->internal.ext_opts->memory_domain_ctx, 1177 bdev_io->internal.orig_iovs, 1178 (uint32_t)bdev_io->internal.orig_iovcnt, 1179 &bdev_io->internal.bounce_iov, 1, 1180 _bdev_io_push_bounce_data_buffer_done, 1181 bdev_io); 1182 if (rc == 0) { 1183 /* Continue IO completion in async callback */ 1184 return; 1185 } 1186 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1187 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1188 } else { 1189 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1190 bdev_io->internal.orig_iovcnt, 1191 bdev_io->internal.bounce_iov.iov_base, 1192 
bdev_io->internal.bounce_iov.iov_len); 1193 } 1194 } 1195 1196 _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); 1197 } 1198 1199 static void 1200 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1201 { 1202 struct spdk_bdev *bdev = bdev_io->bdev; 1203 struct spdk_mempool *pool; 1204 bdev_io_stailq_t *stailq; 1205 struct spdk_bdev_mgmt_channel *mgmt_ch; 1206 uint64_t alignment, md_len; 1207 void *buf; 1208 1209 alignment = spdk_bdev_get_buf_align(bdev); 1210 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1211 1212 if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + 1213 SPDK_BDEV_POOL_ALIGNMENT) { 1214 SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n", 1215 len + alignment); 1216 bdev_io_get_buf_complete(bdev_io, false); 1217 return; 1218 } 1219 1220 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1221 1222 bdev_io->internal.buf_len = len; 1223 1224 if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 1225 SPDK_BDEV_POOL_ALIGNMENT) { 1226 pool = g_bdev_mgr.buf_small_pool; 1227 stailq = &mgmt_ch->need_buf_small; 1228 } else { 1229 pool = g_bdev_mgr.buf_large_pool; 1230 stailq = &mgmt_ch->need_buf_large; 1231 } 1232 1233 buf = spdk_mempool_get(pool); 1234 if (!buf) { 1235 STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link); 1236 } else { 1237 _bdev_io_set_buf(bdev_io, buf, len); 1238 } 1239 } 1240 1241 void 1242 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1243 { 1244 struct spdk_bdev *bdev = bdev_io->bdev; 1245 uint64_t alignment; 1246 1247 assert(cb != NULL); 1248 bdev_io->internal.get_buf_cb = cb; 1249 1250 alignment = spdk_bdev_get_buf_align(bdev); 1251 1252 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1253 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1254 /* Buffer already present and aligned */ 1255 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1256 return; 1257 } 1258 1259 bdev_io_get_buf(bdev_io, len); 1260 } 1261 1262 static void 1263 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1264 bool success) 1265 { 1266 if (!success) { 1267 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1268 bdev_io_complete(bdev_io); 1269 } else { 1270 bdev_io_submit(bdev_io); 1271 } 1272 } 1273 1274 static void 1275 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1276 uint64_t len) 1277 { 1278 assert(cb != NULL); 1279 bdev_io->internal.get_buf_cb = cb; 1280 1281 bdev_io_get_buf(bdev_io, len); 1282 } 1283 1284 void 1285 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1286 { 1287 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1288 1289 assert(cb != NULL); 1290 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1291 bdev_io->internal.get_aux_buf_cb = cb; 1292 bdev_io_get_buf(bdev_io, len); 1293 } 1294 1295 static int 1296 bdev_module_get_max_ctx_size(void) 1297 { 1298 struct spdk_bdev_module *bdev_module; 1299 int max_bdev_module_size = 0; 1300 1301 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1302 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1303 max_bdev_module_size = bdev_module->get_ctx_size(); 1304 } 1305 } 1306 1307 return max_bdev_module_size; 1308 } 1309 1310 static void 1311 bdev_qos_config_json(struct spdk_bdev *bdev, struct 
spdk_json_write_ctx *w) 1312 { 1313 int i; 1314 struct spdk_bdev_qos *qos = bdev->internal.qos; 1315 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1316 1317 if (!qos) { 1318 return; 1319 } 1320 1321 spdk_bdev_get_qos_rate_limits(bdev, limits); 1322 1323 spdk_json_write_object_begin(w); 1324 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1325 1326 spdk_json_write_named_object_begin(w, "params"); 1327 spdk_json_write_named_string(w, "name", bdev->name); 1328 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1329 if (limits[i] > 0) { 1330 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1331 } 1332 } 1333 spdk_json_write_object_end(w); 1334 1335 spdk_json_write_object_end(w); 1336 } 1337 1338 void 1339 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1340 { 1341 struct spdk_bdev_module *bdev_module; 1342 struct spdk_bdev *bdev; 1343 1344 assert(w != NULL); 1345 1346 spdk_json_write_array_begin(w); 1347 1348 spdk_json_write_object_begin(w); 1349 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1350 spdk_json_write_named_object_begin(w, "params"); 1351 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1352 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1353 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1354 spdk_json_write_object_end(w); 1355 spdk_json_write_object_end(w); 1356 1357 bdev_examine_allowlist_config_json(w); 1358 1359 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1360 if (bdev_module->config_json) { 1361 bdev_module->config_json(w); 1362 } 1363 } 1364 1365 pthread_mutex_lock(&g_bdev_mgr.mutex); 1366 1367 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1368 if (bdev->fn_table->write_config_json) { 1369 bdev->fn_table->write_config_json(bdev, w); 1370 } 1371 1372 bdev_qos_config_json(bdev, w); 1373 } 1374 1375 pthread_mutex_unlock(&g_bdev_mgr.mutex); 1376 1377 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1378 spdk_json_write_object_begin(w); 1379 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1380 spdk_json_write_object_end(w); 1381 1382 spdk_json_write_array_end(w); 1383 } 1384 1385 static int 1386 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1387 { 1388 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1389 struct spdk_bdev_io *bdev_io; 1390 uint32_t i; 1391 1392 STAILQ_INIT(&ch->need_buf_small); 1393 STAILQ_INIT(&ch->need_buf_large); 1394 1395 STAILQ_INIT(&ch->per_thread_cache); 1396 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1397 1398 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. 
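	 * The cache entries come from the global bdev_io_pool, which
	 * spdk_bdev_set_opts() requires to be large enough to cover
	 * bdev_io_cache_size entries for every thread plus one extra mgmt channel.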
	 */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		assert(bdev_io != NULL);
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;

	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
	}

	if (!TAILQ_EMPTY(&ch->shared_resources)) {
		SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
	}

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static void
bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static bool
bdev_module_all_actions_completed(void)
{
	struct spdk_bdev_module *m;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return false;
		}
	}
	return true;
}

static void
bdev_module_action_complete(void)
{
	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	if (!bdev_module_all_actions_completed()) {
		return;
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
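	 * bdev_init_complete(0) then invokes the callback passed to
	 * spdk_bdev_initialize() and gives modules that implement
	 * init_complete() a final notification.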
1501 */ 1502 bdev_init_complete(0); 1503 } 1504 1505 static void 1506 bdev_module_action_done(struct spdk_bdev_module *module) 1507 { 1508 assert(module->internal.action_in_progress > 0); 1509 module->internal.action_in_progress--; 1510 bdev_module_action_complete(); 1511 } 1512 1513 void 1514 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1515 { 1516 bdev_module_action_done(module); 1517 } 1518 1519 void 1520 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1521 { 1522 bdev_module_action_done(module); 1523 } 1524 1525 /** The last initialized bdev module */ 1526 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1527 1528 static void 1529 bdev_init_failed(void *cb_arg) 1530 { 1531 struct spdk_bdev_module *module = cb_arg; 1532 1533 module->internal.action_in_progress--; 1534 bdev_init_complete(-1); 1535 } 1536 1537 static int 1538 bdev_modules_init(void) 1539 { 1540 struct spdk_bdev_module *module; 1541 int rc = 0; 1542 1543 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1544 g_resume_bdev_module = module; 1545 if (module->async_init) { 1546 module->internal.action_in_progress = 1; 1547 } 1548 rc = module->module_init(); 1549 if (rc != 0) { 1550 /* Bump action_in_progress to prevent other modules from completion of modules_init 1551 * Send message to defer application shutdown until resources are cleaned up */ 1552 module->internal.action_in_progress = 1; 1553 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1554 return rc; 1555 } 1556 } 1557 1558 g_resume_bdev_module = NULL; 1559 return 0; 1560 } 1561 1562 void 1563 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1564 { 1565 int cache_size; 1566 int rc = 0; 1567 char mempool_name[32]; 1568 1569 assert(cb_fn != NULL); 1570 1571 g_init_cb_fn = cb_fn; 1572 g_init_cb_arg = cb_arg; 1573 1574 spdk_notify_type_register("bdev_register"); 1575 spdk_notify_type_register("bdev_unregister"); 1576 1577 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1578 1579 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1580 g_bdev_opts.bdev_io_pool_size, 1581 sizeof(struct spdk_bdev_io) + 1582 bdev_module_get_max_ctx_size(), 1583 0, 1584 SPDK_ENV_SOCKET_ID_ANY); 1585 1586 if (g_bdev_mgr.bdev_io_pool == NULL) { 1587 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 1588 bdev_init_complete(-1); 1589 return; 1590 } 1591 1592 /** 1593 * Ensure no more than half of the total buffers end up local caches, by 1594 * using spdk_env_get_core_count() to determine how many local caches we need 1595 * to account for. 
1596 */ 1597 cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count()); 1598 snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid()); 1599 1600 g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name, 1601 g_bdev_opts.small_buf_pool_size, 1602 SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 1603 SPDK_BDEV_POOL_ALIGNMENT, 1604 cache_size, 1605 SPDK_ENV_SOCKET_ID_ANY); 1606 if (!g_bdev_mgr.buf_small_pool) { 1607 SPDK_ERRLOG("create rbuf small pool failed\n"); 1608 bdev_init_complete(-1); 1609 return; 1610 } 1611 1612 cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count()); 1613 snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid()); 1614 1615 g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name, 1616 g_bdev_opts.large_buf_pool_size, 1617 SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + 1618 SPDK_BDEV_POOL_ALIGNMENT, 1619 cache_size, 1620 SPDK_ENV_SOCKET_ID_ANY); 1621 if (!g_bdev_mgr.buf_large_pool) { 1622 SPDK_ERRLOG("create rbuf large pool failed\n"); 1623 bdev_init_complete(-1); 1624 return; 1625 } 1626 1627 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1628 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1629 if (!g_bdev_mgr.zero_buffer) { 1630 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1631 bdev_init_complete(-1); 1632 return; 1633 } 1634 1635 #ifdef SPDK_CONFIG_VTUNE 1636 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1637 #endif 1638 1639 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 1640 bdev_mgmt_channel_destroy, 1641 sizeof(struct spdk_bdev_mgmt_channel), 1642 "bdev_mgr"); 1643 1644 rc = bdev_modules_init(); 1645 g_bdev_mgr.module_init_complete = true; 1646 if (rc != 0) { 1647 SPDK_ERRLOG("bdev modules init failed\n"); 1648 return; 1649 } 1650 1651 bdev_module_action_complete(); 1652 } 1653 1654 static void 1655 bdev_mgr_unregister_cb(void *io_device) 1656 { 1657 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1658 1659 if (g_bdev_mgr.bdev_io_pool) { 1660 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1661 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1662 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1663 g_bdev_opts.bdev_io_pool_size); 1664 } 1665 1666 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1667 } 1668 1669 if (g_bdev_mgr.buf_small_pool) { 1670 if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != g_bdev_opts.small_buf_pool_size) { 1671 SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n", 1672 spdk_mempool_count(g_bdev_mgr.buf_small_pool), 1673 g_bdev_opts.small_buf_pool_size); 1674 assert(false); 1675 } 1676 1677 spdk_mempool_free(g_bdev_mgr.buf_small_pool); 1678 } 1679 1680 if (g_bdev_mgr.buf_large_pool) { 1681 if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != g_bdev_opts.large_buf_pool_size) { 1682 SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n", 1683 spdk_mempool_count(g_bdev_mgr.buf_large_pool), 1684 g_bdev_opts.large_buf_pool_size); 1685 assert(false); 1686 } 1687 1688 spdk_mempool_free(g_bdev_mgr.buf_large_pool); 1689 } 1690 1691 spdk_free(g_bdev_mgr.zero_buffer); 1692 1693 bdev_examine_allowlist_free(); 1694 1695 cb_fn(g_fini_cb_arg); 1696 g_fini_cb_fn = NULL; 1697 g_fini_cb_arg = NULL; 1698 g_bdev_mgr.init_complete = false; 1699 g_bdev_mgr.module_init_complete = false; 1700 } 1701 1702 static void 1703 bdev_module_fini_iter(void *arg) 1704 { 1705 struct spdk_bdev_module *bdev_module; 1706 1707 /* FIXME: Handling initialization failures is broken 
	 * now, so we won't even try cleaning up after successfully
	 * initialized modules. If module_init_complete is false,
	 * just call bdev_mgr_unregister_cb().
	 */
	if (!g_bdev_mgr.module_init_complete) {
		bdev_mgr_unregister_cb(NULL);
		return;
	}

	/* Start iterating from the last touched module */
	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
	} else {
		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
					 internal.tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling module_fini()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_fini_done() and re-enter
			 * this function to continue iterating. */
			g_resume_bdev_module = bdev_module;
		}

		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}

		if (bdev_module->async_fini) {
			return;
		}

		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
					 internal.tailq);
	}

	g_resume_bdev_module = NULL;
	spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
}

void
spdk_bdev_module_fini_done(void)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL);
	} else {
		bdev_module_fini_iter(NULL);
	}
}

static void
bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
{
	struct spdk_bdev *bdev = cb_arg;

	if (bdeverrno && bdev) {
		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
			     bdev->name);

		/*
		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
		 * bdev; try to continue by manually removing this bdev from the list and continue
		 * with the next bdev in the list.
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
		/*
		 * Bdev module finish needs to be deferred as we might be in the middle of some context
		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
		 * after returning.
		 */
		spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
		return;
	}

	/*
	 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
	 * base bdevs.
	 *
	 * Also, walk the list in the reverse order.
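	 * For example, a virtual bdev (such as a RAID or logical volume bdev)
	 * stacked on top of a base bdev is unregistered before that base bdev,
	 * so it observes an orderly shutdown rather than what looks like a hot
	 * remove of its base.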
1796 */ 1797 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1798 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1799 if (bdev->internal.claim_module != NULL) { 1800 SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n", 1801 bdev->name, bdev->internal.claim_module->name); 1802 continue; 1803 } 1804 1805 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 1806 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1807 return; 1808 } 1809 1810 /* 1811 * If any bdev fails to unclaim underlying bdev properly, we may face the 1812 * case of bdev list consisting of claimed bdevs only (if claims are managed 1813 * correctly, this would mean there's a loop in the claims graph which is 1814 * clearly impossible). Warn and unregister last bdev on the list then. 1815 */ 1816 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1817 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1818 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1819 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1820 return; 1821 } 1822 } 1823 1824 static void 1825 bdev_module_fini_start_iter(void *arg) 1826 { 1827 struct spdk_bdev_module *bdev_module; 1828 1829 if (!g_resume_bdev_module) { 1830 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1831 } else { 1832 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 1833 } 1834 1835 while (bdev_module) { 1836 if (bdev_module->async_fini_start) { 1837 /* Save our place so we can resume later. We must 1838 * save the variable here, before calling fini_start() 1839 * below, because in some cases the module may immediately 1840 * call spdk_bdev_module_fini_start_done() and re-enter 1841 * this function to continue iterating. 
*/ 1842 g_resume_bdev_module = bdev_module; 1843 } 1844 1845 if (bdev_module->fini_start) { 1846 bdev_module->fini_start(); 1847 } 1848 1849 if (bdev_module->async_fini_start) { 1850 return; 1851 } 1852 1853 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1854 } 1855 1856 g_resume_bdev_module = NULL; 1857 1858 bdev_finish_unregister_bdevs_iter(NULL, 0); 1859 } 1860 1861 void 1862 spdk_bdev_module_fini_start_done(void) 1863 { 1864 if (spdk_get_thread() != g_fini_thread) { 1865 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1866 } else { 1867 bdev_module_fini_start_iter(NULL); 1868 } 1869 } 1870 1871 static void 1872 bdev_finish_wait_for_examine_done(void *cb_arg) 1873 { 1874 bdev_module_fini_start_iter(NULL); 1875 } 1876 1877 void 1878 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1879 { 1880 int rc; 1881 1882 assert(cb_fn != NULL); 1883 1884 g_fini_thread = spdk_get_thread(); 1885 1886 g_fini_cb_fn = cb_fn; 1887 g_fini_cb_arg = cb_arg; 1888 1889 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 1890 if (rc != 0) { 1891 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 1892 bdev_finish_wait_for_examine_done(NULL); 1893 } 1894 } 1895 1896 struct spdk_bdev_io * 1897 bdev_channel_get_io(struct spdk_bdev_channel *channel) 1898 { 1899 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 1900 struct spdk_bdev_io *bdev_io; 1901 1902 if (ch->per_thread_cache_count > 0) { 1903 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1904 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1905 ch->per_thread_cache_count--; 1906 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 1907 /* 1908 * Don't try to look for bdev_ios in the global pool if there are 1909 * waiters on bdev_ios - we don't want this caller to jump the line. 1910 */ 1911 bdev_io = NULL; 1912 } else { 1913 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1914 } 1915 1916 return bdev_io; 1917 } 1918 1919 void 1920 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 1921 { 1922 struct spdk_bdev_mgmt_channel *ch; 1923 1924 assert(bdev_io != NULL); 1925 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 1926 1927 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1928 1929 if (bdev_io->internal.buf != NULL) { 1930 bdev_io_put_buf(bdev_io); 1931 } 1932 1933 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 1934 ch->per_thread_cache_count++; 1935 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1936 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 1937 struct spdk_bdev_io_wait_entry *entry; 1938 1939 entry = TAILQ_FIRST(&ch->io_wait_queue); 1940 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 1941 entry->cb_fn(entry->cb_arg); 1942 } 1943 } else { 1944 /* We should never have a full cache with entries on the io wait queue. 
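		 * Waiters are drained in the branch above whenever a bdev_io is
		 * returned to a cache that is not yet full, so by the time the
		 * cache fills up the wait queue must already be empty.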
*/ 1945 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 1946 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1947 } 1948 } 1949 1950 static bool 1951 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 1952 { 1953 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 1954 1955 switch (limit) { 1956 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1957 return true; 1958 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1959 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1960 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1961 return false; 1962 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 1963 default: 1964 return false; 1965 } 1966 } 1967 1968 static bool 1969 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 1970 { 1971 switch (bdev_io->type) { 1972 case SPDK_BDEV_IO_TYPE_NVME_IO: 1973 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1974 case SPDK_BDEV_IO_TYPE_READ: 1975 case SPDK_BDEV_IO_TYPE_WRITE: 1976 return true; 1977 case SPDK_BDEV_IO_TYPE_ZCOPY: 1978 if (bdev_io->u.bdev.zcopy.start) { 1979 return true; 1980 } else { 1981 return false; 1982 } 1983 default: 1984 return false; 1985 } 1986 } 1987 1988 static bool 1989 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 1990 { 1991 switch (bdev_io->type) { 1992 case SPDK_BDEV_IO_TYPE_NVME_IO: 1993 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1994 /* Bit 1 (0x2) set for read operation */ 1995 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 1996 return true; 1997 } else { 1998 return false; 1999 } 2000 case SPDK_BDEV_IO_TYPE_READ: 2001 return true; 2002 case SPDK_BDEV_IO_TYPE_ZCOPY: 2003 /* Populate to read from disk */ 2004 if (bdev_io->u.bdev.zcopy.populate) { 2005 return true; 2006 } else { 2007 return false; 2008 } 2009 default: 2010 return false; 2011 } 2012 } 2013 2014 static uint64_t 2015 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2016 { 2017 struct spdk_bdev *bdev = bdev_io->bdev; 2018 2019 switch (bdev_io->type) { 2020 case SPDK_BDEV_IO_TYPE_NVME_IO: 2021 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2022 return bdev_io->u.nvme_passthru.nbytes; 2023 case SPDK_BDEV_IO_TYPE_READ: 2024 case SPDK_BDEV_IO_TYPE_WRITE: 2025 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2026 case SPDK_BDEV_IO_TYPE_ZCOPY: 2027 /* Track the data in the start phase only */ 2028 if (bdev_io->u.bdev.zcopy.start) { 2029 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2030 } else { 2031 return 0; 2032 } 2033 default: 2034 return 0; 2035 } 2036 } 2037 2038 static bool 2039 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2040 { 2041 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2042 return true; 2043 } else { 2044 return false; 2045 } 2046 } 2047 2048 static bool 2049 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2050 { 2051 if (bdev_is_read_io(io) == false) { 2052 return false; 2053 } 2054 2055 return bdev_qos_rw_queue_io(limit, io); 2056 } 2057 2058 static bool 2059 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2060 { 2061 if (bdev_is_read_io(io) == true) { 2062 return false; 2063 } 2064 2065 return bdev_qos_rw_queue_io(limit, io); 2066 } 2067 2068 static void 2069 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2070 { 2071 limit->remaining_this_timeslice--; 2072 } 2073 2074 static void 2075 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2076 { 2077 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2078 } 2079 2080 static void 2081 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2082 { 2083 if (bdev_is_read_io(io) == false) { 2084 return; 2085 } 2086 2087 return bdev_qos_rw_bps_update_quota(limit, io); 2088 } 2089 2090 static void 2091 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2092 { 2093 if (bdev_is_read_io(io) == true) { 2094 return; 2095 } 2096 2097 return bdev_qos_rw_bps_update_quota(limit, io); 2098 } 2099 2100 static void 2101 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2102 { 2103 int i; 2104 2105 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2106 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2107 qos->rate_limits[i].queue_io = NULL; 2108 qos->rate_limits[i].update_quota = NULL; 2109 continue; 2110 } 2111 2112 switch (i) { 2113 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2114 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2115 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2116 break; 2117 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2118 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2119 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2120 break; 2121 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2122 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2123 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2124 break; 2125 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2126 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2127 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2128 break; 2129 default: 2130 break; 2131 } 2132 } 2133 } 2134 2135 static void 2136 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2137 struct spdk_bdev_io *bdev_io, 2138 enum spdk_bdev_io_status status) 2139 { 2140 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2141 2142 bdev_io->internal.in_submit_request = true; 2143 bdev_ch->io_outstanding++; 2144 shared_resource->io_outstanding++; 2145 spdk_bdev_io_complete(bdev_io, status); 2146 bdev_io->internal.in_submit_request = false; 2147 } 2148 2149 static inline void 2150 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2151 { 2152 struct spdk_bdev *bdev = bdev_io->bdev; 2153 struct spdk_io_channel *ch = bdev_ch->channel; 2154 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2155 2156 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2157 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2158 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2159 2160 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2161 bdev_abort_buf_io(&mgmt_channel->need_buf_small, bio_to_abort) || 2162 bdev_abort_buf_io(&mgmt_channel->need_buf_large, bio_to_abort)) { 2163 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2164 SPDK_BDEV_IO_STATUS_SUCCESS); 2165 return; 2166 } 2167 } 2168 2169 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2170 bdev_ch->io_outstanding++; 2171 shared_resource->io_outstanding++; 2172 bdev_io->internal.in_submit_request = true; 2173 bdev->fn_table->submit_request(ch, bdev_io); 2174 bdev_io->internal.in_submit_request = false; 2175 } else { 2176 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2177 } 2178 } 2179 2180 static bool 2181 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2182 { 2183 int i; 2184 2185 if (bdev_qos_io_to_limit(bdev_io) == true) { 2186 for (i = 0; i < 
SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2187 if (!qos->rate_limits[i].queue_io) { 2188 continue; 2189 } 2190 2191 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2192 bdev_io) == true) { 2193 return true; 2194 } 2195 } 2196 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2197 if (!qos->rate_limits[i].update_quota) { 2198 continue; 2199 } 2200 2201 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2202 } 2203 } 2204 2205 return false; 2206 } 2207 2208 static int 2209 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2210 { 2211 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2212 int submitted_ios = 0; 2213 2214 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2215 if (!bdev_qos_queue_io(qos, bdev_io)) { 2216 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2217 bdev_io_do_submit(ch, bdev_io); 2218 submitted_ios++; 2219 } 2220 } 2221 2222 return submitted_ios; 2223 } 2224 2225 static void 2226 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2227 { 2228 int rc; 2229 2230 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2231 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2232 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2233 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2234 &bdev_io->internal.waitq_entry); 2235 if (rc != 0) { 2236 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2237 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2238 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2239 } 2240 } 2241 2242 static bool 2243 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2244 { 2245 uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary; 2246 uint32_t max_size = bdev_io->bdev->max_segment_size; 2247 int max_segs = bdev_io->bdev->max_num_segments; 2248 2249 io_boundary = bdev_io->bdev->split_on_optimal_io_boundary ? io_boundary : 0; 2250 2251 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2252 return false; 2253 } 2254 2255 if (io_boundary) { 2256 uint64_t start_stripe, end_stripe; 2257 2258 start_stripe = bdev_io->u.bdev.offset_blocks; 2259 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2260 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
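For example (editor's illustration, not in the original comment): with io_boundary = 8, offset_blocks = 13 and num_blocks = 6, spdk_u32log2(8) == 3, so start_stripe = 13 >> 3 = 1 and end_stripe = 18 >> 3 = 2; the stripes differ, so the I/O crosses a boundary and must be split.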
*/ 2261 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2262 start_stripe >>= spdk_u32log2(io_boundary); 2263 end_stripe >>= spdk_u32log2(io_boundary); 2264 } else { 2265 start_stripe /= io_boundary; 2266 end_stripe /= io_boundary; 2267 } 2268 2269 if (start_stripe != end_stripe) { 2270 return true; 2271 } 2272 } 2273 2274 if (max_segs) { 2275 if (bdev_io->u.bdev.iovcnt > max_segs) { 2276 return true; 2277 } 2278 } 2279 2280 if (max_size) { 2281 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2282 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2283 return true; 2284 } 2285 } 2286 } 2287 2288 return false; 2289 } 2290 2291 static bool 2292 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2293 { 2294 uint32_t num_unmap_segments; 2295 2296 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2297 return false; 2298 } 2299 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2300 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2301 return true; 2302 } 2303 2304 return false; 2305 } 2306 2307 static bool 2308 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2309 { 2310 if (!bdev_io->bdev->max_write_zeroes) { 2311 return false; 2312 } 2313 2314 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2315 return true; 2316 } 2317 2318 return false; 2319 } 2320 2321 static bool 2322 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2323 { 2324 switch (bdev_io->type) { 2325 case SPDK_BDEV_IO_TYPE_READ: 2326 case SPDK_BDEV_IO_TYPE_WRITE: 2327 return bdev_rw_should_split(bdev_io); 2328 case SPDK_BDEV_IO_TYPE_UNMAP: 2329 return bdev_unmap_should_split(bdev_io); 2330 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2331 return bdev_write_zeroes_should_split(bdev_io); 2332 default: 2333 return false; 2334 } 2335 } 2336 2337 static uint32_t 2338 _to_next_boundary(uint64_t offset, uint32_t boundary) 2339 { 2340 return (boundary - (offset % boundary)); 2341 } 2342 2343 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2344 2345 static void _bdev_rw_split(void *_bdev_io); 2346 2347 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2348 2349 static void 2350 _bdev_unmap_split(void *_bdev_io) 2351 { 2352 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2353 } 2354 2355 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2356 2357 static void 2358 _bdev_write_zeroes_split(void *_bdev_io) 2359 { 2360 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2361 } 2362 2363 static int 2364 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2365 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2366 { 2367 int rc; 2368 uint64_t current_offset, current_remaining; 2369 spdk_bdev_io_wait_cb io_wait_fn; 2370 2371 current_offset = *offset; 2372 current_remaining = *remaining; 2373 2374 bdev_io->u.bdev.split_outstanding++; 2375 2376 io_wait_fn = _bdev_rw_split; 2377 switch (bdev_io->type) { 2378 case SPDK_BDEV_IO_TYPE_READ: 2379 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2380 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2381 iov, iovcnt, md_buf, current_offset, 2382 num_blocks, 2383 bdev_io_split_done, bdev_io, 2384 bdev_io->internal.ext_opts, true); 2385 break; 2386 case SPDK_BDEV_IO_TYPE_WRITE: 2387 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2388 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2389 iov, iovcnt, md_buf, current_offset, 2390 num_blocks, 
2391 bdev_io_split_done, bdev_io, 2392 bdev_io->internal.ext_opts, true); 2393 break; 2394 case SPDK_BDEV_IO_TYPE_UNMAP: 2395 io_wait_fn = _bdev_unmap_split; 2396 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2397 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2398 current_offset, num_blocks, 2399 bdev_io_split_done, bdev_io); 2400 break; 2401 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2402 io_wait_fn = _bdev_write_zeroes_split; 2403 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2404 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2405 current_offset, num_blocks, 2406 bdev_io_split_done, bdev_io); 2407 break; 2408 default: 2409 assert(false); 2410 rc = -EINVAL; 2411 break; 2412 } 2413 2414 if (rc == 0) { 2415 current_offset += num_blocks; 2416 current_remaining -= num_blocks; 2417 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2418 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2419 *offset = current_offset; 2420 *remaining = current_remaining; 2421 } else { 2422 bdev_io->u.bdev.split_outstanding--; 2423 if (rc == -ENOMEM) { 2424 if (bdev_io->u.bdev.split_outstanding == 0) { 2425 /* No I/O is outstanding. Hence we should wait here. */ 2426 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2427 } 2428 } else { 2429 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2430 if (bdev_io->u.bdev.split_outstanding == 0) { 2431 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2432 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2433 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2434 } 2435 } 2436 } 2437 2438 return rc; 2439 } 2440 2441 static void 2442 _bdev_rw_split(void *_bdev_io) 2443 { 2444 struct iovec *parent_iov, *iov; 2445 struct spdk_bdev_io *bdev_io = _bdev_io; 2446 struct spdk_bdev *bdev = bdev_io->bdev; 2447 uint64_t parent_offset, current_offset, remaining; 2448 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2449 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2450 uint32_t iovcnt, iov_len, child_iovsize; 2451 uint32_t blocklen = bdev->blocklen; 2452 uint32_t io_boundary = bdev->optimal_io_boundary; 2453 uint32_t max_segment_size = bdev->max_segment_size; 2454 uint32_t max_child_iovcnt = bdev->max_num_segments; 2455 void *md_buf = NULL; 2456 int rc; 2457 2458 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2459 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, BDEV_IO_NUM_CHILD_IOV) : 2460 BDEV_IO_NUM_CHILD_IOV; 2461 io_boundary = bdev->split_on_optimal_io_boundary ? 
io_boundary : UINT32_MAX; 2462 2463 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2464 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2465 parent_offset = bdev_io->u.bdev.offset_blocks; 2466 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2467 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2468 2469 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2470 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2471 if (parent_iov_offset < parent_iov->iov_len) { 2472 break; 2473 } 2474 parent_iov_offset -= parent_iov->iov_len; 2475 } 2476 2477 child_iovcnt = 0; 2478 while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 2479 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2480 to_next_boundary = spdk_min(remaining, to_next_boundary); 2481 to_next_boundary_bytes = to_next_boundary * blocklen; 2482 2483 iov = &bdev_io->child_iov[child_iovcnt]; 2484 iovcnt = 0; 2485 2486 if (bdev_io->u.bdev.md_buf) { 2487 md_buf = (char *)bdev_io->u.bdev.md_buf + 2488 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2489 } 2490 2491 child_iovsize = spdk_min(BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2492 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2493 iovcnt < child_iovsize) { 2494 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2495 iov_len = parent_iov->iov_len - parent_iov_offset; 2496 2497 iov_len = spdk_min(iov_len, max_segment_size); 2498 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2499 to_next_boundary_bytes -= iov_len; 2500 2501 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2502 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2503 2504 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2505 parent_iov_offset += iov_len; 2506 } else { 2507 parent_iovpos++; 2508 parent_iov_offset = 0; 2509 } 2510 child_iovcnt++; 2511 iovcnt++; 2512 } 2513 2514 if (to_next_boundary_bytes > 0) { 2515 /* We had to stop this child I/O early because we ran out of 2516 * child_iov space or were limited by max_num_segments. 2517 * Ensure the iovs to be aligned with block size and 2518 * then adjust to_next_boundary before starting the 2519 * child I/O. 2520 */ 2521 assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV || 2522 iovcnt == child_iovsize); 2523 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2524 if (to_last_block_bytes != 0) { 2525 uint32_t child_iovpos = child_iovcnt - 1; 2526 /* don't decrease child_iovcnt when it equals to BDEV_IO_NUM_CHILD_IOV 2527 * so the loop will naturally end 2528 */ 2529 2530 to_last_block_bytes = blocklen - to_last_block_bytes; 2531 to_next_boundary_bytes += to_last_block_bytes; 2532 while (to_last_block_bytes > 0 && iovcnt > 0) { 2533 iov_len = spdk_min(to_last_block_bytes, 2534 bdev_io->child_iov[child_iovpos].iov_len); 2535 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2536 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2537 child_iovpos--; 2538 if (--iovcnt == 0) { 2539 /* If the child IO is less than a block size just return. 2540 * If the first child IO of any split round is less than 2541 * a block size, an error exit. 
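* (Editor's worked example of the normal alignment fix-up above: with blocklen = 512 and 248 bytes of the boundary left unfilled, to_last_block_bytes starts at 248 and becomes 512 - 248 = 264; those 264 bytes are trimmed from the tail of the child iovs so the filled data ends on a block boundary, to_next_boundary_bytes grows to 512, and to_next_boundary is reduced by exactly one block.)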
2542 */ 2543 if (bdev_io->u.bdev.split_outstanding == 0) { 2544 SPDK_ERRLOG("The first child io was less than a block size\n"); 2545 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2546 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2547 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2548 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2549 } 2550 2551 return; 2552 } 2553 } 2554 2555 to_last_block_bytes -= iov_len; 2556 2557 if (parent_iov_offset == 0) { 2558 parent_iovpos--; 2559 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2560 } 2561 parent_iov_offset -= iov_len; 2562 } 2563 2564 assert(to_last_block_bytes == 0); 2565 } 2566 to_next_boundary -= to_next_boundary_bytes / blocklen; 2567 } 2568 2569 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 2570 &current_offset, &remaining); 2571 if (spdk_unlikely(rc)) { 2572 return; 2573 } 2574 } 2575 } 2576 2577 static void 2578 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 2579 { 2580 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 2581 uint32_t num_children_reqs = 0; 2582 int rc; 2583 2584 offset = bdev_io->u.bdev.split_current_offset_blocks; 2585 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2586 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 2587 2588 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2589 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 2590 2591 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 2592 &offset, &remaining); 2593 if (spdk_likely(rc == 0)) { 2594 num_children_reqs++; 2595 } else { 2596 return; 2597 } 2598 } 2599 } 2600 2601 static void 2602 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 2603 { 2604 uint64_t offset, write_zeroes_blocks, remaining; 2605 uint32_t num_children_reqs = 0; 2606 int rc; 2607 2608 offset = bdev_io->u.bdev.split_current_offset_blocks; 2609 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2610 2611 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2612 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 2613 2614 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 2615 &offset, &remaining); 2616 if (spdk_likely(rc == 0)) { 2617 num_children_reqs++; 2618 } else { 2619 return; 2620 } 2621 } 2622 } 2623 2624 static void 2625 parent_bdev_io_complete(void *ctx, int rc) 2626 { 2627 struct spdk_bdev_io *parent_io = ctx; 2628 2629 if (rc) { 2630 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2631 } 2632 2633 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2634 parent_io->internal.caller_ctx); 2635 } 2636 2637 static void 2638 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2639 { 2640 struct spdk_bdev_io *parent_io = cb_arg; 2641 2642 spdk_bdev_free_io(bdev_io); 2643 2644 if (!success) { 2645 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2646 /* If any child I/O failed, stop further splitting process. */ 2647 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2648 parent_io->u.bdev.split_remaining_num_blocks = 0; 2649 } 2650 parent_io->u.bdev.split_outstanding--; 2651 if (parent_io->u.bdev.split_outstanding != 0) { 2652 return; 2653 } 2654 2655 /* 2656 * Parent I/O finishes when all blocks are consumed.
2657 */ 2658 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2659 assert(parent_io->internal.cb != bdev_io_split_done); 2660 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2661 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2662 2663 if (parent_io->internal.orig_iovcnt != 0) { 2664 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 2665 /* bdev IO will be completed in the callback */ 2666 } else { 2667 parent_bdev_io_complete(parent_io, 0); 2668 } 2669 return; 2670 } 2671 2672 /* 2673 * Continue with the splitting process. This function will complete the parent I/O if the 2674 * splitting is done. 2675 */ 2676 switch (parent_io->type) { 2677 case SPDK_BDEV_IO_TYPE_READ: 2678 case SPDK_BDEV_IO_TYPE_WRITE: 2679 _bdev_rw_split(parent_io); 2680 break; 2681 case SPDK_BDEV_IO_TYPE_UNMAP: 2682 bdev_unmap_split(parent_io); 2683 break; 2684 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2685 bdev_write_zeroes_split(parent_io); 2686 break; 2687 default: 2688 assert(false); 2689 break; 2690 } 2691 } 2692 2693 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2694 bool success); 2695 2696 static void 2697 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2698 { 2699 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2700 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2701 bdev_io->u.bdev.split_outstanding = 0; 2702 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2703 2704 switch (bdev_io->type) { 2705 case SPDK_BDEV_IO_TYPE_READ: 2706 case SPDK_BDEV_IO_TYPE_WRITE: 2707 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2708 _bdev_rw_split(bdev_io); 2709 } else { 2710 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2711 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2712 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2713 } 2714 break; 2715 case SPDK_BDEV_IO_TYPE_UNMAP: 2716 bdev_unmap_split(bdev_io); 2717 break; 2718 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2719 bdev_write_zeroes_split(bdev_io); 2720 break; 2721 default: 2722 assert(false); 2723 break; 2724 } 2725 } 2726 2727 static void 2728 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2729 { 2730 if (!success) { 2731 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2732 return; 2733 } 2734 2735 _bdev_rw_split(bdev_io); 2736 } 2737 2738 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 2739 * be inlined, at least on some compilers. 
2740 */ 2741 static inline void 2742 _bdev_io_submit(void *ctx) 2743 { 2744 struct spdk_bdev_io *bdev_io = ctx; 2745 struct spdk_bdev *bdev = bdev_io->bdev; 2746 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2747 uint64_t tsc; 2748 2749 tsc = spdk_get_ticks(); 2750 bdev_io->internal.submit_tsc = tsc; 2751 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, 2752 (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 2753 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks); 2754 2755 if (spdk_likely(bdev_ch->flags == 0)) { 2756 bdev_io_do_submit(bdev_ch, bdev_io); 2757 return; 2758 } 2759 2760 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2761 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2762 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2763 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2764 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2765 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2766 } else { 2767 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2768 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2769 } 2770 } else { 2771 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2772 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2773 } 2774 } 2775 2776 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2777 2778 bool 2779 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2780 { 2781 if (range1->length == 0 || range2->length == 0) { 2782 return false; 2783 } 2784 2785 if (range1->offset + range1->length <= range2->offset) { 2786 return false; 2787 } 2788 2789 if (range2->offset + range2->length <= range1->offset) { 2790 return false; 2791 } 2792 2793 return true; 2794 } 2795 2796 static bool 2797 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2798 { 2799 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2800 struct lba_range r; 2801 2802 switch (bdev_io->type) { 2803 case SPDK_BDEV_IO_TYPE_NVME_IO: 2804 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2805 /* Don't try to decode the NVMe command - just assume worst-case and that 2806 * it overlaps a locked range. 2807 */ 2808 return true; 2809 case SPDK_BDEV_IO_TYPE_WRITE: 2810 case SPDK_BDEV_IO_TYPE_UNMAP: 2811 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2812 case SPDK_BDEV_IO_TYPE_ZCOPY: 2813 r.offset = bdev_io->u.bdev.offset_blocks; 2814 r.length = bdev_io->u.bdev.num_blocks; 2815 if (!bdev_lba_range_overlapped(range, &r)) { 2816 /* This I/O doesn't overlap the specified LBA range. */ 2817 return false; 2818 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2819 /* This I/O overlaps, but the I/O is on the same channel that locked this 2820 * range, and the caller_ctx is the same as the locked_ctx. This means 2821 * that this I/O is associated with the lock, and is allowed to execute. 
2822 */ 2823 return false; 2824 } else { 2825 return true; 2826 } 2827 default: 2828 return false; 2829 } 2830 } 2831 2832 void 2833 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2834 { 2835 struct spdk_bdev *bdev = bdev_io->bdev; 2836 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2837 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2838 2839 assert(thread != NULL); 2840 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2841 2842 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2843 struct lba_range *range; 2844 2845 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2846 if (bdev_io_range_is_locked(bdev_io, range)) { 2847 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2848 return; 2849 } 2850 } 2851 } 2852 2853 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2854 2855 if (bdev_io_should_split(bdev_io)) { 2856 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2857 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2858 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 2859 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks); 2860 bdev_io_split(NULL, bdev_io); 2861 return; 2862 } 2863 2864 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2865 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2866 _bdev_io_submit(bdev_io); 2867 } else { 2868 bdev_io->internal.io_submit_ch = ch; 2869 bdev_io->internal.ch = bdev->internal.qos->ch; 2870 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2871 } 2872 } else { 2873 _bdev_io_submit(bdev_io); 2874 } 2875 } 2876 2877 static inline void 2878 _bdev_io_copy_ext_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts) 2879 { 2880 struct spdk_bdev_ext_io_opts *opts_copy = &bdev_io->internal.ext_opts_copy; 2881 2882 /* Zero part we don't copy */ 2883 memset(((char *)opts_copy) + opts->size, 0, sizeof(*opts) - opts->size); 2884 memcpy(opts_copy, opts, opts->size); 2885 opts_copy->size = sizeof(*opts_copy); 2886 opts_copy->metadata = bdev_io->u.bdev.md_buf; 2887 /* Save pointer to the copied ext_opts which will be used by bdev modules */ 2888 bdev_io->u.bdev.ext_opts = opts_copy; 2889 } 2890 2891 static inline void 2892 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 2893 { 2894 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 2895 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 2896 * For write operation we need to pull buffers from memory domain before submitting IO. 
2897 * Once read operation completes, we need to use memory_domain push functionality to 2898 * update data in original memory domain IO buffer 2899 * This IO request will go through a regular IO flow, so clear memory domains pointers in 2900 * the copied ext_opts */ 2901 bdev_io->internal.ext_opts_copy.memory_domain = NULL; 2902 bdev_io->internal.ext_opts_copy.memory_domain_ctx = NULL; 2903 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 2904 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2905 } 2906 2907 static inline void 2908 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io, 2909 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 2910 { 2911 if (opts) { 2912 bool use_pull_push = opts->memory_domain && !desc->memory_domains_supported; 2913 assert(opts->size <= sizeof(*opts)); 2914 /* 2915 * copy if size is smaller than opts struct to avoid having to check size 2916 * on every access to bdev_io->u.bdev.ext_opts 2917 */ 2918 if (copy_opts || use_pull_push || opts->size < sizeof(*opts)) { 2919 _bdev_io_copy_ext_opts(bdev_io, opts); 2920 if (use_pull_push) { 2921 _bdev_io_ext_use_bounce_buffer(bdev_io); 2922 return; 2923 } 2924 } 2925 } 2926 bdev_io_submit(bdev_io); 2927 } 2928 2929 static void 2930 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 2931 { 2932 struct spdk_bdev *bdev = bdev_io->bdev; 2933 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2934 struct spdk_io_channel *ch = bdev_ch->channel; 2935 2936 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2937 2938 bdev_io->internal.in_submit_request = true; 2939 bdev->fn_table->submit_request(ch, bdev_io); 2940 bdev_io->internal.in_submit_request = false; 2941 } 2942 2943 void 2944 bdev_io_init(struct spdk_bdev_io *bdev_io, 2945 struct spdk_bdev *bdev, void *cb_arg, 2946 spdk_bdev_io_completion_cb cb) 2947 { 2948 bdev_io->bdev = bdev; 2949 bdev_io->internal.caller_ctx = cb_arg; 2950 bdev_io->internal.cb = cb; 2951 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 2952 bdev_io->internal.in_submit_request = false; 2953 bdev_io->internal.buf = NULL; 2954 bdev_io->internal.io_submit_ch = NULL; 2955 bdev_io->internal.orig_iovs = NULL; 2956 bdev_io->internal.orig_iovcnt = 0; 2957 bdev_io->internal.orig_md_iov.iov_base = NULL; 2958 bdev_io->internal.error.nvme.cdw0 = 0; 2959 bdev_io->num_retries = 0; 2960 bdev_io->internal.get_buf_cb = NULL; 2961 bdev_io->internal.get_aux_buf_cb = NULL; 2962 bdev_io->internal.ext_opts = NULL; 2963 bdev_io->internal.data_transfer_cpl = NULL; 2964 } 2965 2966 static bool 2967 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2968 { 2969 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 2970 } 2971 2972 bool 2973 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2974 { 2975 bool supported; 2976 2977 supported = bdev_io_type_supported(bdev, io_type); 2978 2979 if (!supported) { 2980 switch (io_type) { 2981 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2982 /* The bdev layer will emulate write zeroes as long as write is supported. 
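For example (editor's sketch, with desc, ch, offset_blocks, num_blocks, cb and cb_arg as hypothetical caller-side variables): a caller that does if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); } will work even when the backing module only implements WRITE.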
*/ 2983 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 2984 break; 2985 default: 2986 break; 2987 } 2988 } 2989 2990 return supported; 2991 } 2992 2993 int 2994 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2995 { 2996 if (bdev->fn_table->dump_info_json) { 2997 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 2998 } 2999 3000 return 0; 3001 } 3002 3003 static void 3004 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3005 { 3006 uint32_t max_per_timeslice = 0; 3007 int i; 3008 3009 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3010 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3011 qos->rate_limits[i].max_per_timeslice = 0; 3012 continue; 3013 } 3014 3015 max_per_timeslice = qos->rate_limits[i].limit * 3016 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3017 3018 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3019 qos->rate_limits[i].min_per_timeslice); 3020 3021 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3022 } 3023 3024 bdev_qos_set_ops(qos); 3025 } 3026 3027 static int 3028 bdev_channel_poll_qos(void *arg) 3029 { 3030 struct spdk_bdev_qos *qos = arg; 3031 uint64_t now = spdk_get_ticks(); 3032 int i; 3033 3034 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3035 /* We received our callback earlier than expected - return 3036 * immediately and wait to do accounting until at least one 3037 * timeslice has actually expired. This should never happen 3038 * with a well-behaved timer implementation. 3039 */ 3040 return SPDK_POLLER_IDLE; 3041 } 3042 3043 /* Reset for next round of rate limiting */ 3044 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3045 /* We may have allowed the IOs or bytes to slightly overrun in the last 3046 * timeslice. remaining_this_timeslice is signed, so if it's negative 3047 * here, we'll account for the overrun so that the next timeslice will 3048 * be appropriately reduced. 3049 */ 3050 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3051 qos->rate_limits[i].remaining_this_timeslice = 0; 3052 } 3053 } 3054 3055 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3056 qos->last_timeslice += qos->timeslice_size; 3057 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3058 qos->rate_limits[i].remaining_this_timeslice += 3059 qos->rate_limits[i].max_per_timeslice; 3060 } 3061 } 3062 3063 return bdev_qos_io_submit(qos->ch, qos); 3064 } 3065 3066 static void 3067 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3068 { 3069 struct spdk_bdev_shared_resource *shared_resource; 3070 struct lba_range *range; 3071 3072 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3073 range = TAILQ_FIRST(&ch->locked_ranges); 3074 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3075 free(range); 3076 } 3077 3078 spdk_put_io_channel(ch->channel); 3079 3080 shared_resource = ch->shared_resource; 3081 3082 assert(TAILQ_EMPTY(&ch->io_locked)); 3083 assert(TAILQ_EMPTY(&ch->io_submitted)); 3084 assert(ch->io_outstanding == 0); 3085 assert(shared_resource->ref > 0); 3086 shared_resource->ref--; 3087 if (shared_resource->ref == 0) { 3088 assert(shared_resource->io_outstanding == 0); 3089 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3090 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3091 free(shared_resource); 3092 } 3093 } 3094 3095 /* Caller must hold bdev->internal.mutex. 
*/ 3096 static void 3097 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3098 { 3099 struct spdk_bdev_qos *qos = bdev->internal.qos; 3100 int i; 3101 3102 /* Rate limiting on this bdev enabled */ 3103 if (qos) { 3104 if (qos->ch == NULL) { 3105 struct spdk_io_channel *io_ch; 3106 3107 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3108 bdev->name, spdk_get_thread()); 3109 3110 /* No qos channel has been selected, so set one up */ 3111 3112 /* Take another reference to ch */ 3113 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3114 assert(io_ch != NULL); 3115 qos->ch = ch; 3116 3117 qos->thread = spdk_io_channel_get_thread(io_ch); 3118 3119 TAILQ_INIT(&qos->queued); 3120 3121 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3122 if (bdev_qos_is_iops_rate_limit(i) == true) { 3123 qos->rate_limits[i].min_per_timeslice = 3124 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3125 } else { 3126 qos->rate_limits[i].min_per_timeslice = 3127 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3128 } 3129 3130 if (qos->rate_limits[i].limit == 0) { 3131 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3132 } 3133 } 3134 bdev_qos_update_max_quota_per_timeslice(qos); 3135 qos->timeslice_size = 3136 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3137 qos->last_timeslice = spdk_get_ticks(); 3138 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3139 qos, 3140 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3141 } 3142 3143 ch->flags |= BDEV_CH_QOS_ENABLED; 3144 } 3145 } 3146 3147 struct poll_timeout_ctx { 3148 struct spdk_bdev_desc *desc; 3149 uint64_t timeout_in_sec; 3150 spdk_bdev_io_timeout_cb cb_fn; 3151 void *cb_arg; 3152 }; 3153 3154 static void 3155 bdev_desc_free(struct spdk_bdev_desc *desc) 3156 { 3157 pthread_mutex_destroy(&desc->mutex); 3158 free(desc->media_events_buffer); 3159 free(desc); 3160 } 3161 3162 static void 3163 bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status) 3164 { 3165 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3166 struct spdk_bdev_desc *desc = ctx->desc; 3167 3168 free(ctx); 3169 3170 pthread_mutex_lock(&desc->mutex); 3171 desc->refs--; 3172 if (desc->closed == true && desc->refs == 0) { 3173 pthread_mutex_unlock(&desc->mutex); 3174 bdev_desc_free(desc); 3175 return; 3176 } 3177 pthread_mutex_unlock(&desc->mutex); 3178 } 3179 3180 static void 3181 bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i) 3182 { 3183 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3184 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 3185 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 3186 struct spdk_bdev_desc *desc = ctx->desc; 3187 struct spdk_bdev_io *bdev_io; 3188 uint64_t now; 3189 3190 pthread_mutex_lock(&desc->mutex); 3191 if (desc->closed == true) { 3192 pthread_mutex_unlock(&desc->mutex); 3193 spdk_for_each_channel_continue(i, -1); 3194 return; 3195 } 3196 pthread_mutex_unlock(&desc->mutex); 3197 3198 now = spdk_get_ticks(); 3199 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3200 /* Exclude any I/O that are generated via splitting. */ 3201 if (bdev_io->internal.cb == bdev_io_split_done) { 3202 continue; 3203 } 3204 3205 /* Once we find an I/O that has not timed out, we can immediately 3206 * exit the loop. 
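* (Editor's note: this early exit is safe because io_submitted is a TAILQ that bdev_io_submit() appends to, so the list is ordered by submission time and every later entry has a newer submit_tsc.)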
3207 */ 3208 if (now < (bdev_io->internal.submit_tsc + 3209 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3210 goto end; 3211 } 3212 3213 if (bdev_io->internal.desc == desc) { 3214 ctx->cb_fn(ctx->cb_arg, bdev_io); 3215 } 3216 } 3217 3218 end: 3219 spdk_for_each_channel_continue(i, 0); 3220 } 3221 3222 static int 3223 bdev_poll_timeout_io(void *arg) 3224 { 3225 struct spdk_bdev_desc *desc = arg; 3226 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3227 struct poll_timeout_ctx *ctx; 3228 3229 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3230 if (!ctx) { 3231 SPDK_ERRLOG("failed to allocate memory\n"); 3232 return SPDK_POLLER_BUSY; 3233 } 3234 ctx->desc = desc; 3235 ctx->cb_arg = desc->cb_arg; 3236 ctx->cb_fn = desc->cb_fn; 3237 ctx->timeout_in_sec = desc->timeout_in_sec; 3238 3239 /* Take a ref on the descriptor in case it gets closed while we are checking 3240 * all of the channels. 3241 */ 3242 pthread_mutex_lock(&desc->mutex); 3243 desc->refs++; 3244 pthread_mutex_unlock(&desc->mutex); 3245 3246 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3247 bdev_channel_poll_timeout_io, 3248 ctx, 3249 bdev_channel_poll_timeout_io_done); 3250 3251 return SPDK_POLLER_BUSY; 3252 } 3253 3254 int 3255 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3256 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3257 { 3258 assert(desc->thread == spdk_get_thread()); 3259 3260 spdk_poller_unregister(&desc->io_timeout_poller); 3261 3262 if (timeout_in_sec) { 3263 assert(cb_fn != NULL); 3264 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3265 desc, 3266 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3267 1000); 3268 if (desc->io_timeout_poller == NULL) { 3269 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3270 return -1; 3271 } 3272 } 3273 3274 desc->cb_fn = cb_fn; 3275 desc->cb_arg = cb_arg; 3276 desc->timeout_in_sec = timeout_in_sec; 3277 3278 return 0; 3279 } 3280 3281 static int 3282 bdev_channel_create(void *io_device, void *ctx_buf) 3283 { 3284 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3285 struct spdk_bdev_channel *ch = ctx_buf; 3286 struct spdk_io_channel *mgmt_io_ch; 3287 struct spdk_bdev_mgmt_channel *mgmt_ch; 3288 struct spdk_bdev_shared_resource *shared_resource; 3289 struct lba_range *range; 3290 3291 ch->bdev = bdev; 3292 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3293 if (!ch->channel) { 3294 return -1; 3295 } 3296 3297 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3298 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3299 3300 assert(ch->histogram == NULL); 3301 if (bdev->internal.histogram_enabled) { 3302 ch->histogram = spdk_histogram_data_alloc(); 3303 if (ch->histogram == NULL) { 3304 SPDK_ERRLOG("Could not allocate histogram\n"); 3305 } 3306 } 3307 3308 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3309 if (!mgmt_io_ch) { 3310 spdk_put_io_channel(ch->channel); 3311 return -1; 3312 } 3313 3314 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 3315 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3316 if (shared_resource->shared_ch == ch->channel) { 3317 spdk_put_io_channel(mgmt_io_ch); 3318 shared_resource->ref++; 3319 break; 3320 } 3321 } 3322 3323 if (shared_resource == NULL) { 3324 shared_resource = calloc(1, sizeof(*shared_resource)); 3325 if (shared_resource == NULL) { 3326 spdk_put_io_channel(ch->channel); 3327 spdk_put_io_channel(mgmt_io_ch); 3328 return -1; 3329 } 3330 3331 shared_resource->mgmt_ch = mgmt_ch; 3332 
shared_resource->io_outstanding = 0; 3333 TAILQ_INIT(&shared_resource->nomem_io); 3334 shared_resource->nomem_threshold = 0; 3335 shared_resource->shared_ch = ch->channel; 3336 shared_resource->ref = 1; 3337 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3338 } 3339 3340 memset(&ch->stat, 0, sizeof(ch->stat)); 3341 ch->stat.ticks_rate = spdk_get_ticks_hz(); 3342 ch->io_outstanding = 0; 3343 TAILQ_INIT(&ch->queued_resets); 3344 TAILQ_INIT(&ch->locked_ranges); 3345 ch->flags = 0; 3346 ch->shared_resource = shared_resource; 3347 3348 TAILQ_INIT(&ch->io_submitted); 3349 TAILQ_INIT(&ch->io_locked); 3350 3351 #ifdef SPDK_CONFIG_VTUNE 3352 { 3353 char *name; 3354 __itt_init_ittlib(NULL, 0); 3355 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3356 if (!name) { 3357 bdev_channel_destroy_resource(ch); 3358 return -1; 3359 } 3360 ch->handle = __itt_string_handle_create(name); 3361 free(name); 3362 ch->start_tsc = spdk_get_ticks(); 3363 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3364 memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); 3365 } 3366 #endif 3367 3368 pthread_mutex_lock(&bdev->internal.mutex); 3369 bdev_enable_qos(bdev, ch); 3370 3371 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3372 struct lba_range *new_range; 3373 3374 new_range = calloc(1, sizeof(*new_range)); 3375 if (new_range == NULL) { 3376 pthread_mutex_unlock(&bdev->internal.mutex); 3377 bdev_channel_destroy_resource(ch); 3378 return -1; 3379 } 3380 new_range->length = range->length; 3381 new_range->offset = range->offset; 3382 new_range->locked_ctx = range->locked_ctx; 3383 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3384 } 3385 3386 pthread_mutex_unlock(&bdev->internal.mutex); 3387 3388 return 0; 3389 } 3390 3391 /* 3392 * Abort I/O that are waiting on a data buffer. These types of I/O are 3393 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 3394 */ 3395 static void 3396 bdev_abort_all_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 3397 { 3398 bdev_io_stailq_t tmp; 3399 struct spdk_bdev_io *bdev_io; 3400 3401 STAILQ_INIT(&tmp); 3402 3403 while (!STAILQ_EMPTY(queue)) { 3404 bdev_io = STAILQ_FIRST(queue); 3405 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 3406 if (bdev_io->internal.ch == ch) { 3407 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3408 } else { 3409 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 3410 } 3411 } 3412 3413 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 3414 } 3415 3416 /* 3417 * Abort I/O that are queued waiting for submission. These types of I/O are 3418 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3419 */ 3420 static void 3421 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3422 { 3423 struct spdk_bdev_io *bdev_io, *tmp; 3424 3425 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3426 if (bdev_io->internal.ch == ch) { 3427 TAILQ_REMOVE(queue, bdev_io, internal.link); 3428 /* 3429 * spdk_bdev_io_complete() assumes that the completed I/O had 3430 * been submitted to the bdev module. Since in this case it 3431 * hadn't, bump io_outstanding to account for the decrement 3432 * that spdk_bdev_io_complete() will do. 
3433 */ 3434 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3435 ch->io_outstanding++; 3436 ch->shared_resource->io_outstanding++; 3437 } 3438 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3439 } 3440 } 3441 } 3442 3443 static bool 3444 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3445 { 3446 struct spdk_bdev_io *bdev_io; 3447 3448 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3449 if (bdev_io == bio_to_abort) { 3450 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3451 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3452 return true; 3453 } 3454 } 3455 3456 return false; 3457 } 3458 3459 static bool 3460 bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3461 { 3462 struct spdk_bdev_io *bdev_io; 3463 3464 STAILQ_FOREACH(bdev_io, queue, internal.buf_link) { 3465 if (bdev_io == bio_to_abort) { 3466 STAILQ_REMOVE(queue, bio_to_abort, spdk_bdev_io, internal.buf_link); 3467 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3468 return true; 3469 } 3470 } 3471 3472 return false; 3473 } 3474 3475 static void 3476 bdev_qos_channel_destroy(void *cb_arg) 3477 { 3478 struct spdk_bdev_qos *qos = cb_arg; 3479 3480 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3481 spdk_poller_unregister(&qos->poller); 3482 3483 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3484 3485 free(qos); 3486 } 3487 3488 static int 3489 bdev_qos_destroy(struct spdk_bdev *bdev) 3490 { 3491 int i; 3492 3493 /* 3494 * Cleanly shutting down the QoS poller is tricky, because 3495 * during the asynchronous operation the user could open 3496 * a new descriptor and create a new channel, spawning 3497 * a new QoS poller. 3498 * 3499 * The strategy is to create a new QoS structure here and swap it 3500 * in. The shutdown path then continues to refer to the old one 3501 * until it completes and then releases it. 3502 */ 3503 struct spdk_bdev_qos *new_qos, *old_qos; 3504 3505 old_qos = bdev->internal.qos; 3506 3507 new_qos = calloc(1, sizeof(*new_qos)); 3508 if (!new_qos) { 3509 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3510 return -ENOMEM; 3511 } 3512 3513 /* Copy the old QoS data into the newly allocated structure */ 3514 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3515 3516 /* Zero out the key parts of the QoS structure */ 3517 new_qos->ch = NULL; 3518 new_qos->thread = NULL; 3519 new_qos->poller = NULL; 3520 TAILQ_INIT(&new_qos->queued); 3521 /* 3522 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3523 * It will be used later for the new QoS structure. 3524 */ 3525 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3526 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3527 new_qos->rate_limits[i].min_per_timeslice = 0; 3528 new_qos->rate_limits[i].max_per_timeslice = 0; 3529 } 3530 3531 bdev->internal.qos = new_qos; 3532 3533 if (old_qos->thread == NULL) { 3534 free(old_qos); 3535 } else { 3536 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3537 } 3538 3539 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3540 * been destroyed yet. The destruction path will end up waiting for the final 3541 * channel to be put before it releases resources. 
*/ 3542 3543 return 0; 3544 } 3545 3546 static void 3547 bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3548 { 3549 total->bytes_read += add->bytes_read; 3550 total->num_read_ops += add->num_read_ops; 3551 total->bytes_written += add->bytes_written; 3552 total->num_write_ops += add->num_write_ops; 3553 total->bytes_unmapped += add->bytes_unmapped; 3554 total->num_unmap_ops += add->num_unmap_ops; 3555 total->read_latency_ticks += add->read_latency_ticks; 3556 total->write_latency_ticks += add->write_latency_ticks; 3557 total->unmap_latency_ticks += add->unmap_latency_ticks; 3558 } 3559 3560 static void 3561 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 3562 { 3563 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3564 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 3565 3566 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3567 bdev_abort_all_buf_io(&mgmt_ch->need_buf_small, ch); 3568 bdev_abort_all_buf_io(&mgmt_ch->need_buf_large, ch); 3569 } 3570 3571 static void 3572 bdev_channel_destroy(void *io_device, void *ctx_buf) 3573 { 3574 struct spdk_bdev_channel *ch = ctx_buf; 3575 3576 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3577 spdk_get_thread()); 3578 3579 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 3580 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3581 3582 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3583 pthread_mutex_lock(&ch->bdev->internal.mutex); 3584 bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); 3585 pthread_mutex_unlock(&ch->bdev->internal.mutex); 3586 3587 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3588 3589 bdev_channel_abort_queued_ios(ch); 3590 3591 if (ch->histogram) { 3592 spdk_histogram_data_free(ch->histogram); 3593 } 3594 3595 bdev_channel_destroy_resource(ch); 3596 } 3597 3598 /* 3599 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 3600 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
3601 */ 3602 static int 3603 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3604 { 3605 struct spdk_bdev_name *tmp; 3606 3607 bdev_name->name = strdup(name); 3608 if (bdev_name->name == NULL) { 3609 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3610 return -ENOMEM; 3611 } 3612 3613 bdev_name->bdev = bdev; 3614 3615 pthread_mutex_lock(&g_bdev_mgr.mutex); 3616 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3617 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3618 3619 if (tmp != NULL) { 3620 SPDK_ERRLOG("Bdev name %s already exists\n", name); 3621 free(bdev_name->name); 3622 return -EEXIST; 3623 } 3624 3625 return 0; 3626 } 3627 3628 static void 3629 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 3630 { 3631 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3632 free(bdev_name->name); 3633 } 3634 3635 static void 3636 bdev_name_del(struct spdk_bdev_name *bdev_name) 3637 { 3638 pthread_mutex_lock(&g_bdev_mgr.mutex); 3639 bdev_name_del_unsafe(bdev_name); 3640 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3641 } 3642 3643 int 3644 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 3645 { 3646 struct spdk_bdev_alias *tmp; 3647 int ret; 3648 3649 if (alias == NULL) { 3650 SPDK_ERRLOG("Empty alias passed\n"); 3651 return -EINVAL; 3652 } 3653 3654 tmp = calloc(1, sizeof(*tmp)); 3655 if (tmp == NULL) { 3656 SPDK_ERRLOG("Unable to allocate alias\n"); 3657 return -ENOMEM; 3658 } 3659 3660 ret = bdev_name_add(&tmp->alias, bdev, alias); 3661 if (ret != 0) { 3662 free(tmp); 3663 return ret; 3664 } 3665 3666 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 3667 3668 return 0; 3669 } 3670 3671 static int 3672 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 3673 void (*alias_del_fn)(struct spdk_bdev_name *n)) 3674 { 3675 struct spdk_bdev_alias *tmp; 3676 3677 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 3678 if (strcmp(alias, tmp->alias.name) == 0) { 3679 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 3680 alias_del_fn(&tmp->alias); 3681 free(tmp); 3682 return 0; 3683 } 3684 } 3685 3686 return -ENOENT; 3687 } 3688 3689 int 3690 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 3691 { 3692 int rc; 3693 3694 rc = bdev_alias_del(bdev, alias, bdev_name_del); 3695 if (rc == -ENOENT) { 3696 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 3697 } 3698 3699 return rc; 3700 } 3701 3702 void 3703 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 3704 { 3705 struct spdk_bdev_alias *p, *tmp; 3706 3707 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 3708 TAILQ_REMOVE(&bdev->aliases, p, tailq); 3709 bdev_name_del(&p->alias); 3710 free(p); 3711 } 3712 } 3713 3714 struct spdk_io_channel * 3715 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 3716 { 3717 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 3718 } 3719 3720 void * 3721 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 3722 { 3723 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3724 void *ctx = NULL; 3725 3726 if (bdev->fn_table->get_module_ctx) { 3727 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 3728 } 3729 3730 return ctx; 3731 } 3732 3733 const char * 3734 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 3735 { 3736 return bdev->module->name; 3737 } 3738 3739 const char * 3740 spdk_bdev_get_name(const struct spdk_bdev *bdev) 3741 { 3742 return bdev->name; 3743 } 3744 3745 const char * 3746 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 3747 { 3748 return bdev->product_name; 3749 } 3750 
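/*
 * Editor's illustration (not part of the original source): a minimal sketch of how an
 * application typically consumes spdk_bdev_open_ext() and the getter APIs in this file.
 * It assumes a bdev named "Malloc0" exists and that my_event_cb is a hypothetical
 * caller-defined event callback.
 *
 *	struct spdk_bdev_desc *desc;
 *	struct spdk_io_channel *ch;
 *
 *	if (spdk_bdev_open_ext("Malloc0", true, my_event_cb, NULL, &desc) == 0) {
 *		struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
 *
 *		SPDK_NOTICELOG("%s (%s): %u-byte blocks\n",
 *			       spdk_bdev_get_name(bdev),
 *			       spdk_bdev_get_product_name(bdev),
 *			       spdk_bdev_get_block_size(bdev));
 *
 *		ch = spdk_bdev_get_io_channel(desc);
 *		... submit I/O on ch ...
 *		spdk_put_io_channel(ch);
 *		spdk_bdev_close(desc);
 *	}
 */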
3751 const struct spdk_bdev_aliases_list * 3752 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 3753 { 3754 return &bdev->aliases; 3755 } 3756 3757 uint32_t 3758 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 3759 { 3760 return bdev->blocklen; 3761 } 3762 3763 uint32_t 3764 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 3765 { 3766 return bdev->write_unit_size; 3767 } 3768 3769 uint64_t 3770 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 3771 { 3772 return bdev->blockcnt; 3773 } 3774 3775 const char * 3776 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 3777 { 3778 return qos_rpc_type[type]; 3779 } 3780 3781 void 3782 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 3783 { 3784 int i; 3785 3786 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 3787 3788 pthread_mutex_lock(&bdev->internal.mutex); 3789 if (bdev->internal.qos) { 3790 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3791 if (bdev->internal.qos->rate_limits[i].limit != 3792 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3793 limits[i] = bdev->internal.qos->rate_limits[i].limit; 3794 if (bdev_qos_is_iops_rate_limit(i) == false) { 3795 /* Change from Byte to Megabyte which is user visible. */ 3796 limits[i] = limits[i] / 1024 / 1024; 3797 } 3798 } 3799 } 3800 } 3801 pthread_mutex_unlock(&bdev->internal.mutex); 3802 } 3803 3804 size_t 3805 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 3806 { 3807 return 1 << bdev->required_alignment; 3808 } 3809 3810 uint32_t 3811 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 3812 { 3813 return bdev->optimal_io_boundary; 3814 } 3815 3816 bool 3817 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 3818 { 3819 return bdev->write_cache; 3820 } 3821 3822 const struct spdk_uuid * 3823 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 3824 { 3825 return &bdev->uuid; 3826 } 3827 3828 uint16_t 3829 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 3830 { 3831 return bdev->acwu; 3832 } 3833 3834 uint32_t 3835 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 3836 { 3837 return bdev->md_len; 3838 } 3839 3840 bool 3841 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 3842 { 3843 return (bdev->md_len != 0) && bdev->md_interleave; 3844 } 3845 3846 bool 3847 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 3848 { 3849 return (bdev->md_len != 0) && !bdev->md_interleave; 3850 } 3851 3852 bool 3853 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 3854 { 3855 return bdev->zoned; 3856 } 3857 3858 uint32_t 3859 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 3860 { 3861 if (spdk_bdev_is_md_interleaved(bdev)) { 3862 return bdev->blocklen - bdev->md_len; 3863 } else { 3864 return bdev->blocklen; 3865 } 3866 } 3867 3868 uint32_t 3869 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 3870 { 3871 return bdev->phys_blocklen; 3872 } 3873 3874 static uint32_t 3875 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 3876 { 3877 if (!spdk_bdev_is_md_interleaved(bdev)) { 3878 return bdev->blocklen + bdev->md_len; 3879 } else { 3880 return bdev->blocklen; 3881 } 3882 } 3883 3884 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 3885 typedef enum spdk_dif_type spdk_dif_type_t; 3886 3887 spdk_dif_type_t 3888 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 3889 { 3890 if (bdev->md_len != 0) { 3891 return bdev->dif_type; 3892 } else { 3893 return SPDK_DIF_DISABLE; 3894 } 3895 } 3896 3897 bool 3898 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 3899 { 3900 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 3901 return bdev->dif_is_head_of_md; 3902 } else { 3903 return false; 3904 } 3905 } 3906 3907 bool 3908 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 3909 enum spdk_dif_check_type check_type) 3910 { 3911 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 3912 return false; 3913 } 3914 3915 switch (check_type) { 3916 case SPDK_DIF_CHECK_TYPE_REFTAG: 3917 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 3918 case SPDK_DIF_CHECK_TYPE_APPTAG: 3919 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 3920 case SPDK_DIF_CHECK_TYPE_GUARD: 3921 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 3922 default: 3923 return false; 3924 } 3925 } 3926 3927 uint64_t 3928 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 3929 { 3930 return bdev->internal.measured_queue_depth; 3931 } 3932 3933 uint64_t 3934 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 3935 { 3936 return bdev->internal.period; 3937 } 3938 3939 uint64_t 3940 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 3941 { 3942 return bdev->internal.weighted_io_time; 3943 } 3944 3945 uint64_t 3946 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 3947 { 3948 return bdev->internal.io_time; 3949 } 3950 3951 static void bdev_update_qd_sampling_period(void *ctx); 3952 3953 static void 3954 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 3955 { 3956 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3957 3958 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 3959 3960 if (bdev->internal.measured_queue_depth) { 3961 bdev->internal.io_time += bdev->internal.period; 3962 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 3963 } 3964 3965 bdev->internal.qd_poll_in_progress = false; 3966 3967 bdev_update_qd_sampling_period(bdev); 3968 } 3969 3970 static void 3971 _calculate_measured_qd(struct spdk_io_channel_iter *i) 3972 { 3973 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3974 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 3975 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 3976 3977 bdev->internal.temporary_queue_depth += ch->io_outstanding; 3978 spdk_for_each_channel_continue(i, 0); 3979 } 3980 3981 static int 3982 bdev_calculate_measured_queue_depth(void *ctx) 3983 { 3984 struct spdk_bdev *bdev = ctx; 3985 3986 bdev->internal.qd_poll_in_progress = true; 3987 bdev->internal.temporary_queue_depth = 0; 3988 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 3989 _calculate_measured_qd_cpl); 3990 return SPDK_POLLER_BUSY; 3991 } 3992 3993 static void 3994 bdev_update_qd_sampling_period(void *ctx) 3995 { 3996 struct spdk_bdev *bdev = ctx; 3997 3998 if (bdev->internal.period == bdev->internal.new_period) { 3999 return; 4000 } 4001 4002 if (bdev->internal.qd_poll_in_progress) { 4003 return; 4004 } 4005 4006 bdev->internal.period = bdev->internal.new_period; 4007 4008 spdk_poller_unregister(&bdev->internal.qd_poller); 4009 if (bdev->internal.period != 0) { 4010 bdev->internal.qd_poller = 
SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4011 bdev, bdev->internal.period); 4012 } else { 4013 spdk_bdev_close(bdev->internal.qd_desc); 4014 bdev->internal.qd_desc = NULL; 4015 } 4016 } 4017 4018 static void 4019 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4020 { 4021 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4022 } 4023 4024 void 4025 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4026 { 4027 int rc; 4028 4029 if (bdev->internal.new_period == period) { 4030 return; 4031 } 4032 4033 bdev->internal.new_period = period; 4034 4035 if (bdev->internal.qd_desc != NULL) { 4036 assert(bdev->internal.period != 0); 4037 4038 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4039 bdev_update_qd_sampling_period, bdev); 4040 return; 4041 } 4042 4043 assert(bdev->internal.period == 0); 4044 4045 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4046 NULL, &bdev->internal.qd_desc); 4047 if (rc != 0) { 4048 return; 4049 } 4050 4051 bdev->internal.period = period; 4052 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4053 bdev, period); 4054 } 4055 4056 struct bdev_get_current_qd_ctx { 4057 uint64_t current_qd; 4058 spdk_bdev_get_current_qd_cb cb_fn; 4059 void *cb_arg; 4060 }; 4061 4062 static void 4063 bdev_get_current_qd_done(struct spdk_io_channel_iter *i, int status) 4064 { 4065 struct bdev_get_current_qd_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4066 void *io_dev = spdk_io_channel_iter_get_io_device(i); 4067 4068 ctx->cb_fn(__bdev_from_io_dev(io_dev), ctx->current_qd, ctx->cb_arg, 0); 4069 4070 free(ctx); 4071 } 4072 4073 static void 4074 bdev_get_current_qd(struct spdk_io_channel_iter *i) 4075 { 4076 struct bdev_get_current_qd_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4077 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 4078 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 4079 4080 ctx->current_qd += bdev_ch->io_outstanding; 4081 4082 spdk_for_each_channel_continue(i, 0); 4083 } 4084 4085 void 4086 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4087 void *cb_arg) 4088 { 4089 struct bdev_get_current_qd_ctx *ctx; 4090 4091 assert(cb_fn != NULL); 4092 4093 ctx = calloc(1, sizeof(*ctx)); 4094 if (ctx == NULL) { 4095 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4096 return; 4097 } 4098 4099 ctx->cb_fn = cb_fn; 4100 ctx->cb_arg = cb_arg; 4101 4102 spdk_for_each_channel(__bdev_to_io_dev(bdev), 4103 bdev_get_current_qd, 4104 ctx, 4105 bdev_get_current_qd_done); 4106 } 4107 4108 static void 4109 _resize_notify(void *arg) 4110 { 4111 struct spdk_bdev_desc *desc = arg; 4112 4113 pthread_mutex_lock(&desc->mutex); 4114 desc->refs--; 4115 if (!desc->closed) { 4116 pthread_mutex_unlock(&desc->mutex); 4117 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 4118 desc->bdev, 4119 desc->callback.ctx); 4120 return; 4121 } else if (0 == desc->refs) { 4122 /* This descriptor was closed after this resize_notify message was sent. 4123 * spdk_bdev_close() could not free the descriptor since this message was 4124 * in flight, so we free it now using bdev_desc_free(). 
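		 *
		 * For reference, the application observes the resize through the
		 * spdk_bdev_event_cb_t it registered with spdk_bdev_open_ext(). A minimal
		 * handler might look like the following (illustrative only; the handler
		 * name and the printf-based reporting are placeholders):
		 *
		 *	static void
		 *	my_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
		 *	{
		 *		if (type == SPDK_BDEV_EVENT_RESIZE) {
		 *			printf("bdev %s resized to %" PRIu64 " blocks\n",
		 *			       spdk_bdev_get_name(bdev), spdk_bdev_get_num_blocks(bdev));
		 *		}
		 *	}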
4125 */ 4126 pthread_mutex_unlock(&desc->mutex); 4127 bdev_desc_free(desc); 4128 return; 4129 } 4130 pthread_mutex_unlock(&desc->mutex); 4131 } 4132 4133 int 4134 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4135 { 4136 struct spdk_bdev_desc *desc; 4137 int ret; 4138 4139 if (size == bdev->blockcnt) { 4140 return 0; 4141 } 4142 4143 pthread_mutex_lock(&bdev->internal.mutex); 4144 4145 /* bdev has open descriptors */ 4146 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4147 bdev->blockcnt > size) { 4148 ret = -EBUSY; 4149 } else { 4150 bdev->blockcnt = size; 4151 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4152 pthread_mutex_lock(&desc->mutex); 4153 if (!desc->closed) { 4154 desc->refs++; 4155 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 4156 } 4157 pthread_mutex_unlock(&desc->mutex); 4158 } 4159 ret = 0; 4160 } 4161 4162 pthread_mutex_unlock(&bdev->internal.mutex); 4163 4164 return ret; 4165 } 4166 4167 /* 4168 * Convert I/O offset and length from bytes to blocks. 4169 * 4170 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4171 */ 4172 static uint64_t 4173 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4174 uint64_t num_bytes, uint64_t *num_blocks) 4175 { 4176 uint32_t block_size = bdev->blocklen; 4177 uint8_t shift_cnt; 4178 4179 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 4180 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4181 shift_cnt = spdk_u32log2(block_size); 4182 *offset_blocks = offset_bytes >> shift_cnt; 4183 *num_blocks = num_bytes >> shift_cnt; 4184 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4185 (num_bytes - (*num_blocks << shift_cnt)); 4186 } else { 4187 *offset_blocks = offset_bytes / block_size; 4188 *num_blocks = num_bytes / block_size; 4189 return (offset_bytes % block_size) | (num_bytes % block_size); 4190 } 4191 } 4192 4193 static bool 4194 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4195 { 4196 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4197 * has been an overflow and hence the offset has been wrapped around */ 4198 if (offset_blocks + num_blocks < offset_blocks) { 4199 return false; 4200 } 4201 4202 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4203 if (offset_blocks + num_blocks > bdev->blockcnt) { 4204 return false; 4205 } 4206 4207 return true; 4208 } 4209 4210 static void 4211 bdev_seek_complete_cb(void *ctx) 4212 { 4213 struct spdk_bdev_io *bdev_io = ctx; 4214 4215 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4216 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4217 } 4218 4219 static int 4220 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4221 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 4222 spdk_bdev_io_completion_cb cb, void *cb_arg) 4223 { 4224 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4225 struct spdk_bdev_io *bdev_io; 4226 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4227 4228 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 4229 4230 /* Check if offset_blocks is valid looking at the validity of one block */ 4231 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 4232 return -EINVAL; 4233 } 4234 4235 bdev_io = bdev_channel_get_io(channel); 4236 if (!bdev_io) { 4237 return -ENOMEM; 4238 } 
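	/* A NULL return from bdev_channel_get_io() above surfaces to the caller as -ENOMEM.
	 * Callers of spdk_bdev_seek_data()/spdk_bdev_seek_hole() (like callers of the other
	 * submission APIs in this file) typically handle that by registering a wait entry and
	 * retrying once an spdk_bdev_io becomes available, roughly as sketched below
	 * (illustrative; my_ctx and retry_seek are placeholders):
	 *
	 *	struct spdk_bdev_io_wait_entry *entry = &my_ctx->bdev_io_wait;
	 *
	 *	entry->bdev = spdk_bdev_desc_get_bdev(desc);
	 *	entry->cb_fn = retry_seek;
	 *	entry->cb_arg = my_ctx;
	 *	spdk_bdev_queue_io_wait(entry->bdev, ch, entry);
	 */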
4239 4240 bdev_io->internal.ch = channel; 4241 bdev_io->internal.desc = desc; 4242 bdev_io->type = io_type; 4243 bdev_io->u.bdev.offset_blocks = offset_blocks; 4244 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4245 4246 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 4247 /* In case bdev doesn't support seek to next data/hole offset, 4248 * it is assumed that only data and no holes are present */ 4249 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 4250 bdev_io->u.bdev.seek.offset = offset_blocks; 4251 } else { 4252 bdev_io->u.bdev.seek.offset = UINT64_MAX; 4253 } 4254 4255 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 4256 return 0; 4257 } 4258 4259 bdev_io_submit(bdev_io); 4260 return 0; 4261 } 4262 4263 int 4264 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4265 uint64_t offset_blocks, 4266 spdk_bdev_io_completion_cb cb, void *cb_arg) 4267 { 4268 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 4269 } 4270 4271 int 4272 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4273 uint64_t offset_blocks, 4274 spdk_bdev_io_completion_cb cb, void *cb_arg) 4275 { 4276 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 4277 } 4278 4279 uint64_t 4280 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 4281 { 4282 return bdev_io->u.bdev.seek.offset; 4283 } 4284 4285 static int 4286 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4287 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4288 spdk_bdev_io_completion_cb cb, void *cb_arg) 4289 { 4290 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4291 struct spdk_bdev_io *bdev_io; 4292 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4293 4294 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4295 return -EINVAL; 4296 } 4297 4298 bdev_io = bdev_channel_get_io(channel); 4299 if (!bdev_io) { 4300 return -ENOMEM; 4301 } 4302 4303 bdev_io->internal.ch = channel; 4304 bdev_io->internal.desc = desc; 4305 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4306 bdev_io->u.bdev.iovs = &bdev_io->iov; 4307 bdev_io->u.bdev.iovs[0].iov_base = buf; 4308 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4309 bdev_io->u.bdev.iovcnt = 1; 4310 bdev_io->u.bdev.md_buf = md_buf; 4311 bdev_io->u.bdev.num_blocks = num_blocks; 4312 bdev_io->u.bdev.offset_blocks = offset_blocks; 4313 bdev_io->u.bdev.ext_opts = NULL; 4314 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4315 4316 bdev_io_submit(bdev_io); 4317 return 0; 4318 } 4319 4320 int 4321 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4322 void *buf, uint64_t offset, uint64_t nbytes, 4323 spdk_bdev_io_completion_cb cb, void *cb_arg) 4324 { 4325 uint64_t offset_blocks, num_blocks; 4326 4327 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4328 nbytes, &num_blocks) != 0) { 4329 return -EINVAL; 4330 } 4331 4332 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4333 } 4334 4335 int 4336 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4337 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4338 spdk_bdev_io_completion_cb cb, void *cb_arg) 4339 { 4340 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4341 } 4342 4343 int 4344 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4345 void 
*buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4346 spdk_bdev_io_completion_cb cb, void *cb_arg) 4347 { 4348 struct iovec iov = { 4349 .iov_base = buf, 4350 }; 4351 4352 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4353 return -EINVAL; 4354 } 4355 4356 if (md_buf && !_is_buf_allocated(&iov)) { 4357 return -EINVAL; 4358 } 4359 4360 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4361 cb, cb_arg); 4362 } 4363 4364 int 4365 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4366 struct iovec *iov, int iovcnt, 4367 uint64_t offset, uint64_t nbytes, 4368 spdk_bdev_io_completion_cb cb, void *cb_arg) 4369 { 4370 uint64_t offset_blocks, num_blocks; 4371 4372 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4373 nbytes, &num_blocks) != 0) { 4374 return -EINVAL; 4375 } 4376 4377 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4378 } 4379 4380 static int 4381 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4382 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4383 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 4384 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4385 { 4386 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4387 struct spdk_bdev_io *bdev_io; 4388 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4389 4390 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4391 return -EINVAL; 4392 } 4393 4394 bdev_io = bdev_channel_get_io(channel); 4395 if (!bdev_io) { 4396 return -ENOMEM; 4397 } 4398 4399 bdev_io->internal.ch = channel; 4400 bdev_io->internal.desc = desc; 4401 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4402 bdev_io->u.bdev.iovs = iov; 4403 bdev_io->u.bdev.iovcnt = iovcnt; 4404 bdev_io->u.bdev.md_buf = md_buf; 4405 bdev_io->u.bdev.num_blocks = num_blocks; 4406 bdev_io->u.bdev.offset_blocks = offset_blocks; 4407 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4408 bdev_io->internal.ext_opts = opts; 4409 bdev_io->u.bdev.ext_opts = opts; 4410 4411 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4412 4413 return 0; 4414 } 4415 4416 int 4417 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4418 struct iovec *iov, int iovcnt, 4419 uint64_t offset_blocks, uint64_t num_blocks, 4420 spdk_bdev_io_completion_cb cb, void *cb_arg) 4421 { 4422 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4423 num_blocks, cb, cb_arg, NULL, false); 4424 } 4425 4426 int 4427 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4428 struct iovec *iov, int iovcnt, void *md_buf, 4429 uint64_t offset_blocks, uint64_t num_blocks, 4430 spdk_bdev_io_completion_cb cb, void *cb_arg) 4431 { 4432 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4433 return -EINVAL; 4434 } 4435 4436 if (md_buf && !_is_buf_allocated(iov)) { 4437 return -EINVAL; 4438 } 4439 4440 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4441 num_blocks, cb, cb_arg, NULL, false); 4442 } 4443 4444 static inline bool 4445 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 4446 { 4447 /* 4448 * We check if opts size is at least of size when we first introduced 4449 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 4450 * are not checked internal. 
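	 *
	 * Callers are expected to set opts->size themselves, typically to sizeof() of their
	 * compiled-in view of the structure. A minimal ext read could be set up roughly as
	 * follows (illustrative; iovs, my_md_buf, read_done_cb and cb_ctx are placeholders,
	 * and the metadata pointer is only meaningful for separate-metadata bdevs):
	 *
	 *	struct spdk_bdev_ext_io_opts opts = {};
	 *
	 *	opts.size = sizeof(opts);
	 *	opts.metadata = my_md_buf;
	 *	rc = spdk_bdev_readv_blocks_ext(desc, ch, iovs, iovcnt, offset_blocks,
	 *					num_blocks, read_done_cb, cb_ctx, &opts);
	 *
	 * Passing a NULL opts pointer is also valid and behaves like the non-ext variants.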
4451 */ 4452 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 4453 sizeof(opts->metadata) && 4454 opts->size <= sizeof(*opts) && 4455 /* When memory domain is used, the user must provide data buffers */ 4456 (!opts->memory_domain || (iov && iov[0].iov_base)); 4457 } 4458 4459 int 4460 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4461 struct iovec *iov, int iovcnt, 4462 uint64_t offset_blocks, uint64_t num_blocks, 4463 spdk_bdev_io_completion_cb cb, void *cb_arg, 4464 struct spdk_bdev_ext_io_opts *opts) 4465 { 4466 void *md = NULL; 4467 4468 if (opts) { 4469 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4470 return -EINVAL; 4471 } 4472 md = opts->metadata; 4473 } 4474 4475 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4476 return -EINVAL; 4477 } 4478 4479 if (md && !_is_buf_allocated(iov)) { 4480 return -EINVAL; 4481 } 4482 4483 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4484 num_blocks, cb, cb_arg, opts, false); 4485 } 4486 4487 static int 4488 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4489 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4490 spdk_bdev_io_completion_cb cb, void *cb_arg) 4491 { 4492 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4493 struct spdk_bdev_io *bdev_io; 4494 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4495 4496 if (!desc->write) { 4497 return -EBADF; 4498 } 4499 4500 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4501 return -EINVAL; 4502 } 4503 4504 bdev_io = bdev_channel_get_io(channel); 4505 if (!bdev_io) { 4506 return -ENOMEM; 4507 } 4508 4509 bdev_io->internal.ch = channel; 4510 bdev_io->internal.desc = desc; 4511 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4512 bdev_io->u.bdev.iovs = &bdev_io->iov; 4513 bdev_io->u.bdev.iovs[0].iov_base = buf; 4514 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4515 bdev_io->u.bdev.iovcnt = 1; 4516 bdev_io->u.bdev.md_buf = md_buf; 4517 bdev_io->u.bdev.num_blocks = num_blocks; 4518 bdev_io->u.bdev.offset_blocks = offset_blocks; 4519 bdev_io->u.bdev.ext_opts = NULL; 4520 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4521 4522 bdev_io_submit(bdev_io); 4523 return 0; 4524 } 4525 4526 int 4527 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4528 void *buf, uint64_t offset, uint64_t nbytes, 4529 spdk_bdev_io_completion_cb cb, void *cb_arg) 4530 { 4531 uint64_t offset_blocks, num_blocks; 4532 4533 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4534 nbytes, &num_blocks) != 0) { 4535 return -EINVAL; 4536 } 4537 4538 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4539 } 4540 4541 int 4542 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4543 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4544 spdk_bdev_io_completion_cb cb, void *cb_arg) 4545 { 4546 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4547 cb, cb_arg); 4548 } 4549 4550 int 4551 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4552 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4553 spdk_bdev_io_completion_cb cb, void *cb_arg) 4554 { 4555 struct iovec iov = { 4556 .iov_base = buf, 4557 }; 4558 4559 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4560 return -EINVAL; 4561 } 4562 4563 if 
(md_buf && !_is_buf_allocated(&iov)) { 4564 return -EINVAL; 4565 } 4566 4567 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4568 cb, cb_arg); 4569 } 4570 4571 static int 4572 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4573 struct iovec *iov, int iovcnt, void *md_buf, 4574 uint64_t offset_blocks, uint64_t num_blocks, 4575 spdk_bdev_io_completion_cb cb, void *cb_arg, 4576 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4577 { 4578 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4579 struct spdk_bdev_io *bdev_io; 4580 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4581 4582 if (!desc->write) { 4583 return -EBADF; 4584 } 4585 4586 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4587 return -EINVAL; 4588 } 4589 4590 bdev_io = bdev_channel_get_io(channel); 4591 if (!bdev_io) { 4592 return -ENOMEM; 4593 } 4594 4595 bdev_io->internal.ch = channel; 4596 bdev_io->internal.desc = desc; 4597 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4598 bdev_io->u.bdev.iovs = iov; 4599 bdev_io->u.bdev.iovcnt = iovcnt; 4600 bdev_io->u.bdev.md_buf = md_buf; 4601 bdev_io->u.bdev.num_blocks = num_blocks; 4602 bdev_io->u.bdev.offset_blocks = offset_blocks; 4603 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4604 bdev_io->internal.ext_opts = opts; 4605 bdev_io->u.bdev.ext_opts = opts; 4606 4607 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4608 4609 return 0; 4610 } 4611 4612 int 4613 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4614 struct iovec *iov, int iovcnt, 4615 uint64_t offset, uint64_t len, 4616 spdk_bdev_io_completion_cb cb, void *cb_arg) 4617 { 4618 uint64_t offset_blocks, num_blocks; 4619 4620 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4621 len, &num_blocks) != 0) { 4622 return -EINVAL; 4623 } 4624 4625 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4626 } 4627 4628 int 4629 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4630 struct iovec *iov, int iovcnt, 4631 uint64_t offset_blocks, uint64_t num_blocks, 4632 spdk_bdev_io_completion_cb cb, void *cb_arg) 4633 { 4634 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4635 num_blocks, cb, cb_arg, NULL, false); 4636 } 4637 4638 int 4639 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4640 struct iovec *iov, int iovcnt, void *md_buf, 4641 uint64_t offset_blocks, uint64_t num_blocks, 4642 spdk_bdev_io_completion_cb cb, void *cb_arg) 4643 { 4644 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4645 return -EINVAL; 4646 } 4647 4648 if (md_buf && !_is_buf_allocated(iov)) { 4649 return -EINVAL; 4650 } 4651 4652 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4653 num_blocks, cb, cb_arg, NULL, false); 4654 } 4655 4656 int 4657 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4658 struct iovec *iov, int iovcnt, 4659 uint64_t offset_blocks, uint64_t num_blocks, 4660 spdk_bdev_io_completion_cb cb, void *cb_arg, 4661 struct spdk_bdev_ext_io_opts *opts) 4662 { 4663 void *md = NULL; 4664 4665 if (opts) { 4666 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4667 return -EINVAL; 4668 } 4669 md = opts->metadata; 4670 } 4671 4672 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4673 return -EINVAL; 4674 } 4675 4676 if (md && 
!_is_buf_allocated(iov)) { 4677 return -EINVAL; 4678 } 4679 4680 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4681 num_blocks, cb, cb_arg, opts, false); 4682 } 4683 4684 static void 4685 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4686 { 4687 struct spdk_bdev_io *parent_io = cb_arg; 4688 struct spdk_bdev *bdev = parent_io->bdev; 4689 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 4690 int i, rc = 0; 4691 4692 if (!success) { 4693 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4694 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4695 spdk_bdev_free_io(bdev_io); 4696 return; 4697 } 4698 4699 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 4700 rc = memcmp(read_buf, 4701 parent_io->u.bdev.iovs[i].iov_base, 4702 parent_io->u.bdev.iovs[i].iov_len); 4703 if (rc) { 4704 break; 4705 } 4706 read_buf += parent_io->u.bdev.iovs[i].iov_len; 4707 } 4708 4709 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 4710 rc = memcmp(bdev_io->u.bdev.md_buf, 4711 parent_io->u.bdev.md_buf, 4712 spdk_bdev_get_md_size(bdev)); 4713 } 4714 4715 spdk_bdev_free_io(bdev_io); 4716 4717 if (rc == 0) { 4718 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4719 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 4720 } else { 4721 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 4722 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4723 } 4724 } 4725 4726 static void 4727 bdev_compare_do_read(void *_bdev_io) 4728 { 4729 struct spdk_bdev_io *bdev_io = _bdev_io; 4730 int rc; 4731 4732 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 4733 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 4734 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4735 bdev_compare_do_read_done, bdev_io); 4736 4737 if (rc == -ENOMEM) { 4738 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 4739 } else if (rc != 0) { 4740 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4741 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4742 } 4743 } 4744 4745 static int 4746 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4747 struct iovec *iov, int iovcnt, void *md_buf, 4748 uint64_t offset_blocks, uint64_t num_blocks, 4749 spdk_bdev_io_completion_cb cb, void *cb_arg) 4750 { 4751 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4752 struct spdk_bdev_io *bdev_io; 4753 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4754 4755 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4756 return -EINVAL; 4757 } 4758 4759 bdev_io = bdev_channel_get_io(channel); 4760 if (!bdev_io) { 4761 return -ENOMEM; 4762 } 4763 4764 bdev_io->internal.ch = channel; 4765 bdev_io->internal.desc = desc; 4766 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4767 bdev_io->u.bdev.iovs = iov; 4768 bdev_io->u.bdev.iovcnt = iovcnt; 4769 bdev_io->u.bdev.md_buf = md_buf; 4770 bdev_io->u.bdev.num_blocks = num_blocks; 4771 bdev_io->u.bdev.offset_blocks = offset_blocks; 4772 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4773 bdev_io->u.bdev.ext_opts = NULL; 4774 4775 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4776 bdev_io_submit(bdev_io); 4777 return 0; 4778 } 4779 4780 bdev_compare_do_read(bdev_io); 4781 4782 return 0; 4783 } 4784 4785 int 4786 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4787 struct iovec *iov, int iovcnt, 4788 
uint64_t offset_blocks, uint64_t num_blocks, 4789 spdk_bdev_io_completion_cb cb, void *cb_arg) 4790 { 4791 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4792 num_blocks, cb, cb_arg); 4793 } 4794 4795 int 4796 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4797 struct iovec *iov, int iovcnt, void *md_buf, 4798 uint64_t offset_blocks, uint64_t num_blocks, 4799 spdk_bdev_io_completion_cb cb, void *cb_arg) 4800 { 4801 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4802 return -EINVAL; 4803 } 4804 4805 if (md_buf && !_is_buf_allocated(iov)) { 4806 return -EINVAL; 4807 } 4808 4809 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4810 num_blocks, cb, cb_arg); 4811 } 4812 4813 static int 4814 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4815 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4816 spdk_bdev_io_completion_cb cb, void *cb_arg) 4817 { 4818 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4819 struct spdk_bdev_io *bdev_io; 4820 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4821 4822 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4823 return -EINVAL; 4824 } 4825 4826 bdev_io = bdev_channel_get_io(channel); 4827 if (!bdev_io) { 4828 return -ENOMEM; 4829 } 4830 4831 bdev_io->internal.ch = channel; 4832 bdev_io->internal.desc = desc; 4833 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4834 bdev_io->u.bdev.iovs = &bdev_io->iov; 4835 bdev_io->u.bdev.iovs[0].iov_base = buf; 4836 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4837 bdev_io->u.bdev.iovcnt = 1; 4838 bdev_io->u.bdev.md_buf = md_buf; 4839 bdev_io->u.bdev.num_blocks = num_blocks; 4840 bdev_io->u.bdev.offset_blocks = offset_blocks; 4841 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4842 bdev_io->u.bdev.ext_opts = NULL; 4843 4844 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4845 bdev_io_submit(bdev_io); 4846 return 0; 4847 } 4848 4849 bdev_compare_do_read(bdev_io); 4850 4851 return 0; 4852 } 4853 4854 int 4855 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4856 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4857 spdk_bdev_io_completion_cb cb, void *cb_arg) 4858 { 4859 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4860 cb, cb_arg); 4861 } 4862 4863 int 4864 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4865 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4866 spdk_bdev_io_completion_cb cb, void *cb_arg) 4867 { 4868 struct iovec iov = { 4869 .iov_base = buf, 4870 }; 4871 4872 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4873 return -EINVAL; 4874 } 4875 4876 if (md_buf && !_is_buf_allocated(&iov)) { 4877 return -EINVAL; 4878 } 4879 4880 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4881 cb, cb_arg); 4882 } 4883 4884 static void 4885 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 4886 { 4887 struct spdk_bdev_io *bdev_io = ctx; 4888 4889 if (unlock_status) { 4890 SPDK_ERRLOG("LBA range unlock failed\n"); 4891 } 4892 4893 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
true : 4894 false, bdev_io->internal.caller_ctx); 4895 } 4896 4897 static void 4898 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 4899 { 4900 bdev_io->internal.status = status; 4901 4902 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 4903 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4904 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 4905 } 4906 4907 static void 4908 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4909 { 4910 struct spdk_bdev_io *parent_io = cb_arg; 4911 4912 if (!success) { 4913 SPDK_ERRLOG("Compare and write operation failed\n"); 4914 } 4915 4916 spdk_bdev_free_io(bdev_io); 4917 4918 bdev_comparev_and_writev_blocks_unlock(parent_io, 4919 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 4920 } 4921 4922 static void 4923 bdev_compare_and_write_do_write(void *_bdev_io) 4924 { 4925 struct spdk_bdev_io *bdev_io = _bdev_io; 4926 int rc; 4927 4928 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 4929 spdk_io_channel_from_ctx(bdev_io->internal.ch), 4930 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 4931 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4932 bdev_compare_and_write_do_write_done, bdev_io); 4933 4934 4935 if (rc == -ENOMEM) { 4936 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 4937 } else if (rc != 0) { 4938 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 4939 } 4940 } 4941 4942 static void 4943 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4944 { 4945 struct spdk_bdev_io *parent_io = cb_arg; 4946 4947 spdk_bdev_free_io(bdev_io); 4948 4949 if (!success) { 4950 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 4951 return; 4952 } 4953 4954 bdev_compare_and_write_do_write(parent_io); 4955 } 4956 4957 static void 4958 bdev_compare_and_write_do_compare(void *_bdev_io) 4959 { 4960 struct spdk_bdev_io *bdev_io = _bdev_io; 4961 int rc; 4962 4963 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 4964 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 4965 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4966 bdev_compare_and_write_do_compare_done, bdev_io); 4967 4968 if (rc == -ENOMEM) { 4969 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 4970 } else if (rc != 0) { 4971 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 4972 } 4973 } 4974 4975 static void 4976 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 4977 { 4978 struct spdk_bdev_io *bdev_io = ctx; 4979 4980 if (status) { 4981 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 4982 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4983 return; 4984 } 4985 4986 bdev_compare_and_write_do_compare(bdev_io); 4987 } 4988 4989 int 4990 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4991 struct iovec *compare_iov, int compare_iovcnt, 4992 struct iovec *write_iov, int write_iovcnt, 4993 uint64_t offset_blocks, uint64_t num_blocks, 4994 spdk_bdev_io_completion_cb cb, void *cb_arg) 4995 { 4996 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4997 struct spdk_bdev_io *bdev_io; 4998 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4999 5000 if (!desc->write) { 5001 return 
-EBADF; 5002 } 5003 5004 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5005 return -EINVAL; 5006 } 5007 5008 if (num_blocks > bdev->acwu) { 5009 return -EINVAL; 5010 } 5011 5012 bdev_io = bdev_channel_get_io(channel); 5013 if (!bdev_io) { 5014 return -ENOMEM; 5015 } 5016 5017 bdev_io->internal.ch = channel; 5018 bdev_io->internal.desc = desc; 5019 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5020 bdev_io->u.bdev.iovs = compare_iov; 5021 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5022 bdev_io->u.bdev.fused_iovs = write_iov; 5023 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5024 bdev_io->u.bdev.md_buf = NULL; 5025 bdev_io->u.bdev.num_blocks = num_blocks; 5026 bdev_io->u.bdev.offset_blocks = offset_blocks; 5027 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5028 bdev_io->u.bdev.ext_opts = NULL; 5029 5030 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5031 bdev_io_submit(bdev_io); 5032 return 0; 5033 } 5034 5035 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5036 bdev_comparev_and_writev_blocks_locked, bdev_io); 5037 } 5038 5039 int 5040 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5041 struct iovec *iov, int iovcnt, 5042 uint64_t offset_blocks, uint64_t num_blocks, 5043 bool populate, 5044 spdk_bdev_io_completion_cb cb, void *cb_arg) 5045 { 5046 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5047 struct spdk_bdev_io *bdev_io; 5048 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5049 5050 if (!desc->write) { 5051 return -EBADF; 5052 } 5053 5054 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5055 return -EINVAL; 5056 } 5057 5058 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5059 return -ENOTSUP; 5060 } 5061 5062 bdev_io = bdev_channel_get_io(channel); 5063 if (!bdev_io) { 5064 return -ENOMEM; 5065 } 5066 5067 bdev_io->internal.ch = channel; 5068 bdev_io->internal.desc = desc; 5069 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5070 bdev_io->u.bdev.num_blocks = num_blocks; 5071 bdev_io->u.bdev.offset_blocks = offset_blocks; 5072 bdev_io->u.bdev.iovs = iov; 5073 bdev_io->u.bdev.iovcnt = iovcnt; 5074 bdev_io->u.bdev.md_buf = NULL; 5075 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5076 bdev_io->u.bdev.zcopy.commit = 0; 5077 bdev_io->u.bdev.zcopy.start = 1; 5078 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5079 bdev_io->u.bdev.ext_opts = NULL; 5080 5081 bdev_io_submit(bdev_io); 5082 5083 return 0; 5084 } 5085 5086 int 5087 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5088 spdk_bdev_io_completion_cb cb, void *cb_arg) 5089 { 5090 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5091 return -EINVAL; 5092 } 5093 5094 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 5095 bdev_io->u.bdev.zcopy.start = 0; 5096 bdev_io->internal.caller_ctx = cb_arg; 5097 bdev_io->internal.cb = cb; 5098 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5099 5100 bdev_io_submit(bdev_io); 5101 5102 return 0; 5103 } 5104 5105 int 5106 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5107 uint64_t offset, uint64_t len, 5108 spdk_bdev_io_completion_cb cb, void *cb_arg) 5109 { 5110 uint64_t offset_blocks, num_blocks; 5111 5112 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5113 len, &num_blocks) != 0) { 5114 return -EINVAL; 5115 } 5116 5117 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5118 } 5119 5120 int 5121 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5122 uint64_t offset_blocks, uint64_t num_blocks, 5123 spdk_bdev_io_completion_cb cb, void *cb_arg) 5124 { 5125 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5126 struct spdk_bdev_io *bdev_io; 5127 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5128 5129 if (!desc->write) { 5130 return -EBADF; 5131 } 5132 5133 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5134 return -EINVAL; 5135 } 5136 5137 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5138 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5139 return -ENOTSUP; 5140 } 5141 5142 bdev_io = bdev_channel_get_io(channel); 5143 5144 if (!bdev_io) { 5145 return -ENOMEM; 5146 } 5147 5148 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5149 bdev_io->internal.ch = channel; 5150 bdev_io->internal.desc = desc; 5151 bdev_io->u.bdev.offset_blocks = offset_blocks; 5152 bdev_io->u.bdev.num_blocks = num_blocks; 5153 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5154 bdev_io->u.bdev.ext_opts = NULL; 5155 5156 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 5157 bdev_io_submit(bdev_io); 5158 return 0; 5159 } 5160 5161 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 5162 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 5163 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 5164 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 5165 bdev_write_zero_buffer_next(bdev_io); 5166 5167 return 0; 5168 } 5169 5170 int 5171 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5172 uint64_t offset, uint64_t nbytes, 5173 spdk_bdev_io_completion_cb cb, void *cb_arg) 5174 { 5175 uint64_t offset_blocks, num_blocks; 5176 5177 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5178 nbytes, &num_blocks) != 0) { 5179 return -EINVAL; 5180 } 5181 5182 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5183 } 5184 5185 int 5186 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5187 uint64_t offset_blocks, uint64_t num_blocks, 5188 spdk_bdev_io_completion_cb cb, void *cb_arg) 5189 { 5190 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5191 struct spdk_bdev_io *bdev_io; 5192 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5193 5194 if (!desc->write) { 5195 return -EBADF; 5196 } 5197 5198 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5199 return -EINVAL; 5200 } 5201 5202 if (num_blocks == 0) { 5203 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5204 return -EINVAL; 5205 } 5206 5207 bdev_io = bdev_channel_get_io(channel); 5208 if (!bdev_io) { 5209 return -ENOMEM; 5210 } 5211 5212 
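	/* The request below carries a single zero-length iovec even though UNMAP transfers no
	 * data; this appears to be so that code which inspects u.bdev.iovs (splitting logic,
	 * bdev modules) always sees a well-defined vector. From the caller's perspective the
	 * API is simply (illustrative; unmap_done and ctx are placeholders):
	 *
	 *	rc = spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, unmap_done, ctx);
	 *
	 * with -ENOMEM handled via spdk_bdev_queue_io_wait() as for the other submission APIs.
	 */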
bdev_io->internal.ch = channel; 5213 bdev_io->internal.desc = desc; 5214 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5215 5216 bdev_io->u.bdev.iovs = &bdev_io->iov; 5217 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5218 bdev_io->u.bdev.iovs[0].iov_len = 0; 5219 bdev_io->u.bdev.iovcnt = 1; 5220 5221 bdev_io->u.bdev.offset_blocks = offset_blocks; 5222 bdev_io->u.bdev.num_blocks = num_blocks; 5223 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5224 bdev_io->u.bdev.ext_opts = NULL; 5225 5226 bdev_io_submit(bdev_io); 5227 return 0; 5228 } 5229 5230 int 5231 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5232 uint64_t offset, uint64_t length, 5233 spdk_bdev_io_completion_cb cb, void *cb_arg) 5234 { 5235 uint64_t offset_blocks, num_blocks; 5236 5237 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5238 length, &num_blocks) != 0) { 5239 return -EINVAL; 5240 } 5241 5242 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5243 } 5244 5245 int 5246 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5247 uint64_t offset_blocks, uint64_t num_blocks, 5248 spdk_bdev_io_completion_cb cb, void *cb_arg) 5249 { 5250 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5251 struct spdk_bdev_io *bdev_io; 5252 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5253 5254 if (!desc->write) { 5255 return -EBADF; 5256 } 5257 5258 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5259 return -EINVAL; 5260 } 5261 5262 bdev_io = bdev_channel_get_io(channel); 5263 if (!bdev_io) { 5264 return -ENOMEM; 5265 } 5266 5267 bdev_io->internal.ch = channel; 5268 bdev_io->internal.desc = desc; 5269 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5270 bdev_io->u.bdev.iovs = NULL; 5271 bdev_io->u.bdev.iovcnt = 0; 5272 bdev_io->u.bdev.offset_blocks = offset_blocks; 5273 bdev_io->u.bdev.num_blocks = num_blocks; 5274 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5275 5276 bdev_io_submit(bdev_io); 5277 return 0; 5278 } 5279 5280 static int bdev_reset_poll_for_outstanding_io(void *ctx); 5281 5282 static void 5283 bdev_reset_check_outstanding_io_done(struct spdk_io_channel_iter *i, int status) 5284 { 5285 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 5286 struct spdk_bdev_io *bdev_io; 5287 5288 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5289 5290 if (status == -EBUSY) { 5291 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 5292 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 5293 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 5294 } else { 5295 /* If outstanding IOs are still present and reset_io_drain_timeout seconds passed, 5296 * start the reset. */ 5297 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5298 bdev_io_submit_reset(bdev_io); 5299 } 5300 } else { 5301 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5302 SPDK_DEBUGLOG(bdev, 5303 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 5304 ch->bdev->name); 5305 /* Mark the completion status as a SUCCESS and complete the reset. 
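		 *
		 * (Note: this callback only runs for bdevs that advertise a non-zero
		 * reset_io_drain_timeout; when that value is zero,
		 * bdev_reset_freeze_channel_done() below submits the reset to the module
		 * without first checking for outstanding I/O.)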
*/ 5306 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 5307 } 5308 } 5309 5310 static void 5311 bdev_reset_check_outstanding_io(struct spdk_io_channel_iter *i) 5312 { 5313 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 5314 struct spdk_bdev_channel *cur_ch = spdk_io_channel_get_ctx(io_ch); 5315 int status = 0; 5316 5317 if (cur_ch->io_outstanding > 0) { 5318 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 5319 * further iteration over the rest of the channels and pass non-zero status 5320 * to the callback function. */ 5321 status = -EBUSY; 5322 } 5323 spdk_for_each_channel_continue(i, status); 5324 } 5325 5326 static int 5327 bdev_reset_poll_for_outstanding_io(void *ctx) 5328 { 5329 struct spdk_bdev_channel *ch = ctx; 5330 struct spdk_bdev_io *bdev_io; 5331 5332 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5333 5334 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 5335 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_check_outstanding_io, 5336 ch, bdev_reset_check_outstanding_io_done); 5337 5338 return SPDK_POLLER_BUSY; 5339 } 5340 5341 static void 5342 bdev_reset_freeze_channel_done(struct spdk_io_channel_iter *i, int status) 5343 { 5344 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 5345 struct spdk_bdev *bdev = ch->bdev; 5346 struct spdk_bdev_io *bdev_io; 5347 5348 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5349 5350 if (bdev->reset_io_drain_timeout == 0) { 5351 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5352 5353 bdev_io_submit_reset(bdev_io); 5354 return; 5355 } 5356 5357 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 5358 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 5359 5360 /* In case bdev->reset_io_drain_timeout is not equal to zero, 5361 * submit the reset to the underlying module only if outstanding I/O 5362 * remain after reset_io_drain_timeout seconds have passed. */ 5363 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_check_outstanding_io, 5364 ch, bdev_reset_check_outstanding_io_done); 5365 } 5366 5367 static void 5368 bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 5369 { 5370 struct spdk_io_channel *ch; 5371 struct spdk_bdev_channel *channel; 5372 struct spdk_bdev_mgmt_channel *mgmt_channel; 5373 struct spdk_bdev_shared_resource *shared_resource; 5374 bdev_io_tailq_t tmp_queued; 5375 5376 TAILQ_INIT(&tmp_queued); 5377 5378 ch = spdk_io_channel_iter_get_channel(i); 5379 channel = spdk_io_channel_get_ctx(ch); 5380 shared_resource = channel->shared_resource; 5381 mgmt_channel = shared_resource->mgmt_ch; 5382 5383 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 5384 5385 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 5386 /* The QoS object is always valid and readable while 5387 * the channel flag is set, so the lock here should not 5388 * be necessary. We're not in the fast path though, so 5389 * just take it anyway. 
*/ 5390 pthread_mutex_lock(&channel->bdev->internal.mutex); 5391 if (channel->bdev->internal.qos->ch == channel) { 5392 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 5393 } 5394 pthread_mutex_unlock(&channel->bdev->internal.mutex); 5395 } 5396 5397 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 5398 bdev_abort_all_buf_io(&mgmt_channel->need_buf_small, channel); 5399 bdev_abort_all_buf_io(&mgmt_channel->need_buf_large, channel); 5400 bdev_abort_all_queued_io(&tmp_queued, channel); 5401 5402 spdk_for_each_channel_continue(i, 0); 5403 } 5404 5405 static void 5406 bdev_start_reset(void *ctx) 5407 { 5408 struct spdk_bdev_channel *ch = ctx; 5409 5410 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel, 5411 ch, bdev_reset_freeze_channel_done); 5412 } 5413 5414 static void 5415 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 5416 { 5417 struct spdk_bdev *bdev = ch->bdev; 5418 5419 assert(!TAILQ_EMPTY(&ch->queued_resets)); 5420 5421 pthread_mutex_lock(&bdev->internal.mutex); 5422 if (bdev->internal.reset_in_progress == NULL) { 5423 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 5424 /* 5425 * Take a channel reference for the target bdev for the life of this 5426 * reset. This guards against the channel getting destroyed while 5427 * spdk_for_each_channel() calls related to this reset IO are in 5428 * progress. We will release the reference when this reset is 5429 * completed. 5430 */ 5431 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 5432 bdev_start_reset(ch); 5433 } 5434 pthread_mutex_unlock(&bdev->internal.mutex); 5435 } 5436 5437 int 5438 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5439 spdk_bdev_io_completion_cb cb, void *cb_arg) 5440 { 5441 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5442 struct spdk_bdev_io *bdev_io; 5443 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5444 5445 bdev_io = bdev_channel_get_io(channel); 5446 if (!bdev_io) { 5447 return -ENOMEM; 5448 } 5449 5450 bdev_io->internal.ch = channel; 5451 bdev_io->internal.desc = desc; 5452 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5453 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 5454 bdev_io->u.reset.ch_ref = NULL; 5455 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5456 5457 pthread_mutex_lock(&bdev->internal.mutex); 5458 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 5459 pthread_mutex_unlock(&bdev->internal.mutex); 5460 5461 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 5462 internal.ch_link); 5463 5464 bdev_channel_start_reset(channel); 5465 5466 return 0; 5467 } 5468 5469 void 5470 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5471 struct spdk_bdev_io_stat *stat) 5472 { 5473 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5474 5475 *stat = channel->stat; 5476 } 5477 5478 static void 5479 bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 5480 { 5481 void *io_device = spdk_io_channel_iter_get_io_device(i); 5482 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 5483 5484 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 5485 bdev_iostat_ctx->cb_arg, 0); 5486 free(bdev_iostat_ctx); 5487 } 5488 5489 static void 5490 bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 5491 { 5492 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = 
spdk_io_channel_iter_get_ctx(i); 5493 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 5494 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5495 5496 bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 5497 spdk_for_each_channel_continue(i, 0); 5498 } 5499 5500 void 5501 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5502 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5503 { 5504 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5505 5506 assert(bdev != NULL); 5507 assert(stat != NULL); 5508 assert(cb != NULL); 5509 5510 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5511 if (bdev_iostat_ctx == NULL) { 5512 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5513 cb(bdev, stat, cb_arg, -ENOMEM); 5514 return; 5515 } 5516 5517 bdev_iostat_ctx->stat = stat; 5518 bdev_iostat_ctx->cb = cb; 5519 bdev_iostat_ctx->cb_arg = cb_arg; 5520 5521 /* Start with the statistics from previously deleted channels. */ 5522 pthread_mutex_lock(&bdev->internal.mutex); 5523 bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 5524 pthread_mutex_unlock(&bdev->internal.mutex); 5525 5526 /* Then iterate and add the statistics from each existing channel. */ 5527 spdk_for_each_channel(__bdev_to_io_dev(bdev), 5528 bdev_get_each_channel_stat, 5529 bdev_iostat_ctx, 5530 bdev_get_device_stat_done); 5531 } 5532 5533 int 5534 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5535 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5536 spdk_bdev_io_completion_cb cb, void *cb_arg) 5537 { 5538 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5539 struct spdk_bdev_io *bdev_io; 5540 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5541 5542 if (!desc->write) { 5543 return -EBADF; 5544 } 5545 5546 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 5547 return -ENOTSUP; 5548 } 5549 5550 bdev_io = bdev_channel_get_io(channel); 5551 if (!bdev_io) { 5552 return -ENOMEM; 5553 } 5554 5555 bdev_io->internal.ch = channel; 5556 bdev_io->internal.desc = desc; 5557 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 5558 bdev_io->u.nvme_passthru.cmd = *cmd; 5559 bdev_io->u.nvme_passthru.buf = buf; 5560 bdev_io->u.nvme_passthru.nbytes = nbytes; 5561 bdev_io->u.nvme_passthru.md_buf = NULL; 5562 bdev_io->u.nvme_passthru.md_len = 0; 5563 5564 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5565 5566 bdev_io_submit(bdev_io); 5567 return 0; 5568 } 5569 5570 int 5571 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5572 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5573 spdk_bdev_io_completion_cb cb, void *cb_arg) 5574 { 5575 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5576 struct spdk_bdev_io *bdev_io; 5577 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5578 5579 if (!desc->write) { 5580 /* 5581 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5582 * to easily determine if the command is a read or write, but for now just 5583 * do not allow io_passthru with a read-only descriptor. 
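		 *
		 * In practice this means passthru users must open the bdev writable, e.g.
		 * (illustrative; "Nvme0n1", my_event_cb and ctx are placeholders):
		 *
		 *	spdk_bdev_open_ext("Nvme0n1", true, my_event_cb, ctx, &desc);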
5584 */ 5585 return -EBADF; 5586 } 5587 5588 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 5589 return -ENOTSUP; 5590 } 5591 5592 bdev_io = bdev_channel_get_io(channel); 5593 if (!bdev_io) { 5594 return -ENOMEM; 5595 } 5596 5597 bdev_io->internal.ch = channel; 5598 bdev_io->internal.desc = desc; 5599 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 5600 bdev_io->u.nvme_passthru.cmd = *cmd; 5601 bdev_io->u.nvme_passthru.buf = buf; 5602 bdev_io->u.nvme_passthru.nbytes = nbytes; 5603 bdev_io->u.nvme_passthru.md_buf = NULL; 5604 bdev_io->u.nvme_passthru.md_len = 0; 5605 5606 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5607 5608 bdev_io_submit(bdev_io); 5609 return 0; 5610 } 5611 5612 int 5613 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5614 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 5615 spdk_bdev_io_completion_cb cb, void *cb_arg) 5616 { 5617 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5618 struct spdk_bdev_io *bdev_io; 5619 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5620 5621 if (!desc->write) { 5622 /* 5623 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5624 * to easily determine if the command is a read or write, but for now just 5625 * do not allow io_passthru with a read-only descriptor. 5626 */ 5627 return -EBADF; 5628 } 5629 5630 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 5631 return -ENOTSUP; 5632 } 5633 5634 bdev_io = bdev_channel_get_io(channel); 5635 if (!bdev_io) { 5636 return -ENOMEM; 5637 } 5638 5639 bdev_io->internal.ch = channel; 5640 bdev_io->internal.desc = desc; 5641 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 5642 bdev_io->u.nvme_passthru.cmd = *cmd; 5643 bdev_io->u.nvme_passthru.buf = buf; 5644 bdev_io->u.nvme_passthru.nbytes = nbytes; 5645 bdev_io->u.nvme_passthru.md_buf = md_buf; 5646 bdev_io->u.nvme_passthru.md_len = md_len; 5647 5648 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5649 5650 bdev_io_submit(bdev_io); 5651 return 0; 5652 } 5653 5654 static void bdev_abort_retry(void *ctx); 5655 static void bdev_abort(struct spdk_bdev_io *parent_io); 5656 5657 static void 5658 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5659 { 5660 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 5661 struct spdk_bdev_io *parent_io = cb_arg; 5662 struct spdk_bdev_io *bio_to_abort, *tmp_io; 5663 5664 bio_to_abort = bdev_io->u.abort.bio_to_abort; 5665 5666 spdk_bdev_free_io(bdev_io); 5667 5668 if (!success) { 5669 /* Check if the target I/O completed in the meantime. */ 5670 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 5671 if (tmp_io == bio_to_abort) { 5672 break; 5673 } 5674 } 5675 5676 /* If the target I/O still exists, set the parent to failed. 
 */
		if (tmp_io != NULL) {
			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}

	parent_io->u.bdev.split_outstanding--;
	if (parent_io->u.bdev.split_outstanding == 0) {
		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			bdev_abort_retry(parent_io);
		} else {
			bdev_io_complete(parent_io);
		}
	}
}

static int
bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel,
	      struct spdk_bdev_io *bio_to_abort,
	      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;

	if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT ||
	    bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) {
		/* TODO: Abort reset or abort request. */
		return -ENOTSUP;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (bdev_io == NULL) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) {
		bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort;

		/* Parent abort request is not submitted directly, but to manage its
		 * execution add it to the submitted list here.
		 */
		bdev_io->internal.submit_tsc = spdk_get_ticks();
		TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);

		bdev_abort(bdev_io);

		return 0;
	}

	bdev_io->u.abort.bio_to_abort = bio_to_abort;

	/* Submit the abort request to the underlying bdev module. */
	bdev_io_submit(bdev_io);

	return 0;
}

static uint32_t
_bdev_abort(struct spdk_bdev_io *parent_io)
{
	struct spdk_bdev_desc *desc = parent_io->internal.desc;
	struct spdk_bdev_channel *channel = parent_io->internal.ch;
	void *bio_cb_arg;
	struct spdk_bdev_io *bio_to_abort;
	uint32_t matched_ios;
	int rc;

	bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;

	/* matched_ios is returned and will be kept by the caller.
	 *
	 * This function is used for two cases: 1) the same cb_arg is used for
	 * multiple I/Os, and 2) a single large I/O is split into smaller ones.
	 * Incrementing split_outstanding directly here may confuse readers especially
	 * for the 1st case.
	 *
	 * Completion of I/O abort is processed after stack unwinding. Hence this trick
	 * works as expected.
	 */
	matched_ios = 0;
	parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;

	TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
		if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
			continue;
		}

		if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
			/* Any I/O which was submitted after this abort command should be excluded. */
			continue;
		}

		rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
		if (rc != 0) {
			if (rc == -ENOMEM) {
				parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
			} else {
				parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
			}
			break;
		}
		matched_ios++;
	}

	return matched_ios;
}

static void
bdev_abort_retry(void *ctx)
{
	struct spdk_bdev_io *parent_io = ctx;
	uint32_t matched_ios;

	matched_ios = _bdev_abort(parent_io);

	if (matched_ios == 0) {
		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
		} else {
			/* For retry, the case that no target I/O was found is success
			 * because it means target I/Os completed in the meantime.
			 */
			bdev_io_complete(parent_io);
		}
		return;
	}

	/* Use split_outstanding to manage the progress of aborting I/Os. */
	parent_io->u.bdev.split_outstanding = matched_ios;
}

static void
bdev_abort(struct spdk_bdev_io *parent_io)
{
	uint32_t matched_ios;

	matched_ios = _bdev_abort(parent_io);

	if (matched_ios == 0) {
		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
		} else {
			/* The case that no target I/O was found is a failure. */
			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
			bdev_io_complete(parent_io);
		}
		return;
	}

	/* Use split_outstanding to manage the progress of aborting I/Os. */
	parent_io->u.bdev.split_outstanding = matched_ios;
}

int
spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		void *bio_cb_arg,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	struct spdk_bdev_io *bdev_io;

	if (bio_cb_arg == NULL) {
		return -EINVAL;
	}

	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
		return -ENOTSUP;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (bdev_io == NULL) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->internal.submit_tsc = spdk_get_ticks();
	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;

	/* Parent abort request is not submitted directly, but to manage its execution,
	 * add it to the submitted list here.
5866 */ 5867 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5868 5869 bdev_abort(bdev_io); 5870 5871 return 0; 5872 } 5873 5874 int 5875 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5876 struct spdk_bdev_io_wait_entry *entry) 5877 { 5878 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5879 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 5880 5881 if (bdev != entry->bdev) { 5882 SPDK_ERRLOG("bdevs do not match\n"); 5883 return -EINVAL; 5884 } 5885 5886 if (mgmt_ch->per_thread_cache_count > 0) { 5887 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 5888 return -EINVAL; 5889 } 5890 5891 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 5892 return 0; 5893 } 5894 5895 static inline void 5896 bdev_io_complete(void *ctx) 5897 { 5898 struct spdk_bdev_io *bdev_io = ctx; 5899 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5900 uint64_t tsc, tsc_diff; 5901 5902 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 5903 /* 5904 * Send the completion to the thread that originally submitted the I/O, 5905 * which may not be the current thread in the case of QoS. 5906 */ 5907 if (bdev_io->internal.io_submit_ch) { 5908 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 5909 bdev_io->internal.io_submit_ch = NULL; 5910 } 5911 5912 /* 5913 * Defer completion to avoid potential infinite recursion if the 5914 * user's completion callback issues a new I/O. 5915 */ 5916 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 5917 bdev_io_complete, bdev_io); 5918 return; 5919 } 5920 5921 tsc = spdk_get_ticks(); 5922 tsc_diff = tsc - bdev_io->internal.submit_tsc; 5923 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 5924 bdev_io->internal.caller_ctx); 5925 5926 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 5927 5928 if (bdev_io->internal.ch->histogram) { 5929 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 5930 } 5931 5932 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5933 switch (bdev_io->type) { 5934 case SPDK_BDEV_IO_TYPE_READ: 5935 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5936 bdev_io->internal.ch->stat.num_read_ops++; 5937 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5938 break; 5939 case SPDK_BDEV_IO_TYPE_WRITE: 5940 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5941 bdev_io->internal.ch->stat.num_write_ops++; 5942 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5943 break; 5944 case SPDK_BDEV_IO_TYPE_UNMAP: 5945 bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5946 bdev_io->internal.ch->stat.num_unmap_ops++; 5947 bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; 5948 break; 5949 case SPDK_BDEV_IO_TYPE_ZCOPY: 5950 /* Track the data in the start phase only */ 5951 if (bdev_io->u.bdev.zcopy.start) { 5952 if (bdev_io->u.bdev.zcopy.populate) { 5953 bdev_io->internal.ch->stat.bytes_read += 5954 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5955 bdev_io->internal.ch->stat.num_read_ops++; 5956 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5957 } else { 5958 bdev_io->internal.ch->stat.bytes_written += 5959 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5960 bdev_io->internal.ch->stat.num_write_ops++; 5961 
bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5962 } 5963 } 5964 break; 5965 default: 5966 break; 5967 } 5968 } 5969 5970 #ifdef SPDK_CONFIG_VTUNE 5971 uint64_t now_tsc = spdk_get_ticks(); 5972 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 5973 uint64_t data[5]; 5974 5975 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 5976 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 5977 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 5978 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 5979 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 5980 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 5981 5982 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 5983 __itt_metadata_u64, 5, data); 5984 5985 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 5986 bdev_io->internal.ch->start_tsc = now_tsc; 5987 } 5988 #endif 5989 5990 assert(bdev_io->internal.cb != NULL); 5991 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 5992 5993 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 5994 bdev_io->internal.caller_ctx); 5995 } 5996 5997 static void bdev_destroy_cb(void *io_device); 5998 5999 static void 6000 bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 6001 { 6002 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 6003 struct spdk_bdev *bdev = bdev_io->bdev; 6004 6005 if (bdev_io->u.reset.ch_ref != NULL) { 6006 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 6007 bdev_io->u.reset.ch_ref = NULL; 6008 } 6009 6010 bdev_io_complete(bdev_io); 6011 6012 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 6013 TAILQ_EMPTY(&bdev->internal.open_descs)) { 6014 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6015 } 6016 } 6017 6018 static void 6019 bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 6020 { 6021 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 6022 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6023 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6024 struct spdk_bdev_io *queued_reset; 6025 6026 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 6027 while (!TAILQ_EMPTY(&ch->queued_resets)) { 6028 queued_reset = TAILQ_FIRST(&ch->queued_resets); 6029 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 6030 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 6031 } 6032 6033 spdk_for_each_channel_continue(i, 0); 6034 } 6035 6036 void 6037 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 6038 { 6039 struct spdk_bdev *bdev = bdev_io->bdev; 6040 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6041 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6042 6043 bdev_io->internal.status = status; 6044 6045 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 6046 bool unlock_channels = false; 6047 6048 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 6049 SPDK_ERRLOG("NOMEM returned for reset\n"); 6050 } 6051 pthread_mutex_lock(&bdev->internal.mutex); 6052 if (bdev_io == bdev->internal.reset_in_progress) { 6053 bdev->internal.reset_in_progress = NULL; 6054 unlock_channels = true; 6055 } 6056 pthread_mutex_unlock(&bdev->internal.mutex); 6057 6058 if 
(unlock_channels) { 6059 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel, 6060 bdev_io, bdev_reset_complete); 6061 return; 6062 } 6063 } else { 6064 if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 6065 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 6066 /* bdev IO will be completed in the callback */ 6067 return; 6068 } 6069 6070 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6071 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6072 return; 6073 } 6074 } 6075 6076 bdev_io_complete(bdev_io); 6077 } 6078 6079 void 6080 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 6081 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 6082 { 6083 if (sc == SPDK_SCSI_STATUS_GOOD) { 6084 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6085 } else { 6086 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 6087 bdev_io->internal.error.scsi.sc = sc; 6088 bdev_io->internal.error.scsi.sk = sk; 6089 bdev_io->internal.error.scsi.asc = asc; 6090 bdev_io->internal.error.scsi.ascq = ascq; 6091 } 6092 6093 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6094 } 6095 6096 void 6097 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 6098 int *sc, int *sk, int *asc, int *ascq) 6099 { 6100 assert(sc != NULL); 6101 assert(sk != NULL); 6102 assert(asc != NULL); 6103 assert(ascq != NULL); 6104 6105 switch (bdev_io->internal.status) { 6106 case SPDK_BDEV_IO_STATUS_SUCCESS: 6107 *sc = SPDK_SCSI_STATUS_GOOD; 6108 *sk = SPDK_SCSI_SENSE_NO_SENSE; 6109 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6110 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6111 break; 6112 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 6113 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 6114 break; 6115 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 6116 *sc = bdev_io->internal.error.scsi.sc; 6117 *sk = bdev_io->internal.error.scsi.sk; 6118 *asc = bdev_io->internal.error.scsi.asc; 6119 *ascq = bdev_io->internal.error.scsi.ascq; 6120 break; 6121 default: 6122 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 6123 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 6124 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6125 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6126 break; 6127 } 6128 } 6129 6130 void 6131 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 6132 { 6133 if (aio_result == 0) { 6134 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6135 } else { 6136 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 6137 } 6138 6139 bdev_io->internal.error.aio_result = aio_result; 6140 6141 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6142 } 6143 6144 void 6145 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 6146 { 6147 assert(aio_result != NULL); 6148 6149 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 6150 *aio_result = bdev_io->internal.error.aio_result; 6151 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6152 *aio_result = 0; 6153 } else { 6154 *aio_result = -EIO; 6155 } 6156 } 6157 6158 void 6159 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 6160 { 6161 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 6162 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6163 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 6164 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 6165 } else { 6166 
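		/* All other (sct, sc) combinations are recorded as a generic NVMe error status. */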
bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 6167 } 6168 6169 bdev_io->internal.error.nvme.cdw0 = cdw0; 6170 bdev_io->internal.error.nvme.sct = sct; 6171 bdev_io->internal.error.nvme.sc = sc; 6172 6173 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6174 } 6175 6176 void 6177 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 6178 { 6179 assert(sct != NULL); 6180 assert(sc != NULL); 6181 assert(cdw0 != NULL); 6182 6183 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 6184 *sct = SPDK_NVME_SCT_GENERIC; 6185 *sc = SPDK_NVME_SC_SUCCESS; 6186 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6187 *cdw0 = 0; 6188 } else { 6189 *cdw0 = 1U; 6190 } 6191 return; 6192 } 6193 6194 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6195 *sct = bdev_io->internal.error.nvme.sct; 6196 *sc = bdev_io->internal.error.nvme.sc; 6197 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6198 *sct = SPDK_NVME_SCT_GENERIC; 6199 *sc = SPDK_NVME_SC_SUCCESS; 6200 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6201 *sct = SPDK_NVME_SCT_GENERIC; 6202 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6203 } else { 6204 *sct = SPDK_NVME_SCT_GENERIC; 6205 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6206 } 6207 6208 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6209 } 6210 6211 void 6212 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 6213 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 6214 { 6215 assert(first_sct != NULL); 6216 assert(first_sc != NULL); 6217 assert(second_sct != NULL); 6218 assert(second_sc != NULL); 6219 assert(cdw0 != NULL); 6220 6221 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6222 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 6223 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 6224 *first_sct = bdev_io->internal.error.nvme.sct; 6225 *first_sc = bdev_io->internal.error.nvme.sc; 6226 *second_sct = SPDK_NVME_SCT_GENERIC; 6227 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6228 } else { 6229 *first_sct = SPDK_NVME_SCT_GENERIC; 6230 *first_sc = SPDK_NVME_SC_SUCCESS; 6231 *second_sct = bdev_io->internal.error.nvme.sct; 6232 *second_sc = bdev_io->internal.error.nvme.sc; 6233 } 6234 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6235 *first_sct = SPDK_NVME_SCT_GENERIC; 6236 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6237 *second_sct = SPDK_NVME_SCT_GENERIC; 6238 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6239 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6240 *first_sct = SPDK_NVME_SCT_GENERIC; 6241 *first_sc = SPDK_NVME_SC_SUCCESS; 6242 *second_sct = SPDK_NVME_SCT_GENERIC; 6243 *second_sc = SPDK_NVME_SC_SUCCESS; 6244 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 6245 *first_sct = SPDK_NVME_SCT_GENERIC; 6246 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6247 *second_sct = SPDK_NVME_SCT_GENERIC; 6248 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6249 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 6250 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 6251 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 6252 *second_sct = SPDK_NVME_SCT_GENERIC; 6253 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6254 } else { 6255 *first_sct = SPDK_NVME_SCT_GENERIC; 6256 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6257 *second_sct = SPDK_NVME_SCT_GENERIC; 6258 
*second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6259 } 6260 6261 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6262 } 6263 6264 struct spdk_thread * 6265 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 6266 { 6267 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 6268 } 6269 6270 struct spdk_io_channel * 6271 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 6272 { 6273 return bdev_io->internal.ch->channel; 6274 } 6275 6276 static int 6277 bdev_register(struct spdk_bdev *bdev) 6278 { 6279 char *bdev_name; 6280 char uuid[SPDK_UUID_STRING_LEN]; 6281 int ret; 6282 6283 assert(bdev->module != NULL); 6284 6285 if (!bdev->name) { 6286 SPDK_ERRLOG("Bdev name is NULL\n"); 6287 return -EINVAL; 6288 } 6289 6290 if (!strlen(bdev->name)) { 6291 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 6292 return -EINVAL; 6293 } 6294 6295 /* Users often register their own I/O devices using the bdev name. In 6296 * order to avoid conflicts, prepend bdev_. */ 6297 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 6298 if (!bdev_name) { 6299 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 6300 return -ENOMEM; 6301 } 6302 6303 bdev->internal.status = SPDK_BDEV_STATUS_READY; 6304 bdev->internal.measured_queue_depth = UINT64_MAX; 6305 bdev->internal.claim_module = NULL; 6306 bdev->internal.qd_poller = NULL; 6307 bdev->internal.qos = NULL; 6308 6309 TAILQ_INIT(&bdev->internal.open_descs); 6310 TAILQ_INIT(&bdev->internal.locked_ranges); 6311 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 6312 TAILQ_INIT(&bdev->aliases); 6313 6314 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 6315 if (ret != 0) { 6316 free(bdev_name); 6317 return ret; 6318 } 6319 6320 /* If the user didn't specify a uuid, generate one. */ 6321 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 6322 spdk_uuid_generate(&bdev->uuid); 6323 } 6324 6325 /* Add the UUID alias only if it's different than the name */ 6326 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6327 if (strcmp(bdev->name, uuid) != 0) { 6328 ret = spdk_bdev_alias_add(bdev, uuid); 6329 if (ret != 0) { 6330 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 6331 bdev_name_del(&bdev->internal.bdev_name); 6332 free(bdev_name); 6333 return ret; 6334 } 6335 } 6336 6337 if (spdk_bdev_get_buf_align(bdev) > 1) { 6338 if (bdev->split_on_optimal_io_boundary) { 6339 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 6340 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 6341 } else { 6342 bdev->split_on_optimal_io_boundary = true; 6343 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 6344 } 6345 } 6346 6347 /* If the user didn't specify a write unit size, set it to one. 
*/ 6348 if (bdev->write_unit_size == 0) { 6349 bdev->write_unit_size = 1; 6350 } 6351 6352 /* Set ACWU value to 1 if bdev module did not set it (does not support it natively) */ 6353 if (bdev->acwu == 0) { 6354 bdev->acwu = 1; 6355 } 6356 6357 if (bdev->phys_blocklen == 0) { 6358 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 6359 } 6360 6361 bdev->internal.reset_in_progress = NULL; 6362 bdev->internal.qd_poll_in_progress = false; 6363 bdev->internal.period = 0; 6364 bdev->internal.new_period = 0; 6365 6366 spdk_io_device_register(__bdev_to_io_dev(bdev), 6367 bdev_channel_create, bdev_channel_destroy, 6368 sizeof(struct spdk_bdev_channel), 6369 bdev_name); 6370 6371 free(bdev_name); 6372 6373 pthread_mutex_init(&bdev->internal.mutex, NULL); 6374 6375 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 6376 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 6377 6378 return 0; 6379 } 6380 6381 static void 6382 bdev_destroy_cb(void *io_device) 6383 { 6384 int rc; 6385 struct spdk_bdev *bdev; 6386 spdk_bdev_unregister_cb cb_fn; 6387 void *cb_arg; 6388 6389 bdev = __bdev_from_io_dev(io_device); 6390 cb_fn = bdev->internal.unregister_cb; 6391 cb_arg = bdev->internal.unregister_ctx; 6392 6393 pthread_mutex_destroy(&bdev->internal.mutex); 6394 free(bdev->internal.qos); 6395 6396 rc = bdev->fn_table->destruct(bdev->ctxt); 6397 if (rc < 0) { 6398 SPDK_ERRLOG("destruct failed\n"); 6399 } 6400 if (rc <= 0 && cb_fn != NULL) { 6401 cb_fn(cb_arg, rc); 6402 } 6403 } 6404 6405 void 6406 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 6407 { 6408 if (bdev->internal.unregister_cb != NULL) { 6409 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 6410 } 6411 } 6412 6413 static void 6414 _remove_notify(void *arg) 6415 { 6416 struct spdk_bdev_desc *desc = arg; 6417 6418 pthread_mutex_lock(&desc->mutex); 6419 desc->refs--; 6420 6421 if (!desc->closed) { 6422 pthread_mutex_unlock(&desc->mutex); 6423 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 6424 return; 6425 } else if (0 == desc->refs) { 6426 /* This descriptor was closed after this remove_notify message was sent. 6427 * spdk_bdev_close() could not free the descriptor since this message was 6428 * in flight, so we free it now using bdev_desc_free(). 6429 */ 6430 pthread_mutex_unlock(&desc->mutex); 6431 bdev_desc_free(desc); 6432 return; 6433 } 6434 pthread_mutex_unlock(&desc->mutex); 6435 } 6436 6437 /* Must be called while holding g_bdev_mgr.mutex and bdev->internal.mutex. 6438 * returns: 0 - bdev removed and ready to be destructed. 6439 * -EBUSY - bdev can't be destructed yet. */ 6440 static int 6441 bdev_unregister_unsafe(struct spdk_bdev *bdev) 6442 { 6443 struct spdk_bdev_desc *desc, *tmp; 6444 int rc = 0; 6445 char uuid[SPDK_UUID_STRING_LEN]; 6446 6447 /* Notify each descriptor about hotremoval */ 6448 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 6449 rc = -EBUSY; 6450 pthread_mutex_lock(&desc->mutex); 6451 /* 6452 * Defer invocation of the event_cb to a separate message that will 6453 * run later on its thread. This ensures this context unwinds and 6454 * we don't recursively unregister this bdev again if the event_cb 6455 * immediately closes its descriptor. 
6456 */ 6457 desc->refs++; 6458 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 6459 pthread_mutex_unlock(&desc->mutex); 6460 } 6461 6462 /* If there are no descriptors, proceed removing the bdev */ 6463 if (rc == 0) { 6464 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 6465 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 6466 6467 /* Delete the name and the UUID alias */ 6468 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6469 bdev_name_del_unsafe(&bdev->internal.bdev_name); 6470 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 6471 6472 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 6473 6474 if (bdev->internal.reset_in_progress != NULL) { 6475 /* If reset is in progress, let the completion callback for reset 6476 * unregister the bdev. 6477 */ 6478 rc = -EBUSY; 6479 } 6480 } 6481 6482 return rc; 6483 } 6484 6485 static void 6486 bdev_unregister_abort_channel(struct spdk_io_channel_iter *i) 6487 { 6488 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 6489 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 6490 6491 bdev_channel_abort_queued_ios(bdev_ch); 6492 spdk_for_each_channel_continue(i, 0); 6493 } 6494 6495 static void 6496 bdev_unregister(struct spdk_io_channel_iter *i, int status) 6497 { 6498 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 6499 int rc; 6500 6501 pthread_mutex_lock(&g_bdev_mgr.mutex); 6502 pthread_mutex_lock(&bdev->internal.mutex); 6503 /* 6504 * Set the status to REMOVING after completing to abort channels. Otherwise, 6505 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 6506 * spdk_for_each_channel() is executed and spdk_io_device_unregister() may fail. 6507 */ 6508 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 6509 rc = bdev_unregister_unsafe(bdev); 6510 pthread_mutex_unlock(&bdev->internal.mutex); 6511 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6512 6513 if (rc == 0) { 6514 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6515 } 6516 } 6517 6518 void 6519 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6520 { 6521 struct spdk_thread *thread; 6522 6523 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 6524 6525 thread = spdk_get_thread(); 6526 if (!thread) { 6527 /* The user called this from a non-SPDK thread. 
*/ 6528 if (cb_fn != NULL) { 6529 cb_fn(cb_arg, -ENOTSUP); 6530 } 6531 return; 6532 } 6533 6534 pthread_mutex_lock(&g_bdev_mgr.mutex); 6535 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 6536 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6537 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6538 if (cb_fn) { 6539 cb_fn(cb_arg, -EBUSY); 6540 } 6541 return; 6542 } 6543 6544 pthread_mutex_lock(&bdev->internal.mutex); 6545 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 6546 bdev->internal.unregister_cb = cb_fn; 6547 bdev->internal.unregister_ctx = cb_arg; 6548 pthread_mutex_unlock(&bdev->internal.mutex); 6549 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6550 6551 spdk_bdev_set_qd_sampling_period(bdev, 0); 6552 6553 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6554 bdev_unregister_abort_channel, 6555 bdev, 6556 bdev_unregister); 6557 } 6558 6559 int 6560 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 6561 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6562 { 6563 struct spdk_bdev_desc *desc; 6564 struct spdk_bdev *bdev; 6565 int rc; 6566 6567 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 6568 if (rc != 0) { 6569 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 6570 return rc; 6571 } 6572 6573 bdev = spdk_bdev_desc_get_bdev(desc); 6574 6575 if (bdev->module != module) { 6576 spdk_bdev_close(desc); 6577 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 6578 bdev_name); 6579 return -ENODEV; 6580 } 6581 6582 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 6583 6584 spdk_bdev_close(desc); 6585 6586 return 0; 6587 } 6588 6589 static int 6590 bdev_start_qos(struct spdk_bdev *bdev) 6591 { 6592 struct set_qos_limit_ctx *ctx; 6593 6594 /* Enable QoS */ 6595 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 6596 ctx = calloc(1, sizeof(*ctx)); 6597 if (ctx == NULL) { 6598 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 6599 return -ENOMEM; 6600 } 6601 ctx->bdev = bdev; 6602 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6603 bdev_enable_qos_msg, ctx, 6604 bdev_enable_qos_done); 6605 } 6606 6607 return 0; 6608 } 6609 6610 static int 6611 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 6612 { 6613 struct spdk_thread *thread; 6614 int rc = 0; 6615 6616 thread = spdk_get_thread(); 6617 if (!thread) { 6618 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 6619 return -ENOTSUP; 6620 } 6621 6622 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6623 spdk_get_thread()); 6624 6625 desc->bdev = bdev; 6626 desc->thread = thread; 6627 desc->write = write; 6628 6629 pthread_mutex_lock(&bdev->internal.mutex); 6630 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 6631 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6632 pthread_mutex_unlock(&bdev->internal.mutex); 6633 return -ENODEV; 6634 } 6635 6636 if (write && bdev->internal.claim_module) { 6637 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 6638 bdev->name, bdev->internal.claim_module->name); 6639 pthread_mutex_unlock(&bdev->internal.mutex); 6640 return -EPERM; 6641 } 6642 6643 rc = bdev_start_qos(bdev); 6644 if (rc != 0) { 6645 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 6646 pthread_mutex_unlock(&bdev->internal.mutex); 6647 return rc; 6648 } 6649 6650 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 6651 6652 pthread_mutex_unlock(&bdev->internal.mutex); 6653 6654 return 0; 6655 } 6656 
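/*
 * Usage sketch for the descriptor API below (illustrative only; it assumes an
 * SPDK application thread and an already-registered bdev named "Malloc0", and
 * the example_* names are placeholders rather than symbols defined in this file):
 *
 *	static void
 *	example_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
 *	{
 *		if (type == SPDK_BDEV_EVENT_REMOVE) {
 *			... arrange for the open descriptor to be closed on its opening thread ...
 *		}
 *	}
 *
 *	static int
 *	example_open_use_close(void)
 *	{
 *		struct spdk_bdev_desc *desc = NULL;
 *		int rc;
 *
 *		rc = spdk_bdev_open_ext("Malloc0", true, example_event_cb, NULL, &desc);
 *		if (rc != 0) {
 *			return rc;
 *		}
 *
 *		... obtain an I/O channel and submit I/O through desc ...
 *
 *		spdk_bdev_close(desc);
 *		return 0;
 *	}
 *
 * spdk_bdev_open_ext() requires a non-NULL event callback, and spdk_bdev_close()
 * must be called on the same thread that opened the descriptor.
 */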
6657 static int 6658 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 6659 struct spdk_bdev_desc **_desc) 6660 { 6661 struct spdk_bdev_desc *desc; 6662 unsigned int event_id; 6663 6664 desc = calloc(1, sizeof(*desc)); 6665 if (desc == NULL) { 6666 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 6667 return -ENOMEM; 6668 } 6669 6670 TAILQ_INIT(&desc->pending_media_events); 6671 TAILQ_INIT(&desc->free_media_events); 6672 6673 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 6674 desc->callback.event_fn = event_cb; 6675 desc->callback.ctx = event_ctx; 6676 pthread_mutex_init(&desc->mutex, NULL); 6677 6678 if (bdev->media_events) { 6679 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 6680 sizeof(*desc->media_events_buffer)); 6681 if (desc->media_events_buffer == NULL) { 6682 SPDK_ERRLOG("Failed to initialize media event pool\n"); 6683 bdev_desc_free(desc); 6684 return -ENOMEM; 6685 } 6686 6687 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 6688 TAILQ_INSERT_TAIL(&desc->free_media_events, 6689 &desc->media_events_buffer[event_id], tailq); 6690 } 6691 } 6692 6693 *_desc = desc; 6694 6695 return 0; 6696 } 6697 6698 int 6699 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 6700 void *event_ctx, struct spdk_bdev_desc **_desc) 6701 { 6702 struct spdk_bdev_desc *desc; 6703 struct spdk_bdev *bdev; 6704 int rc; 6705 6706 if (event_cb == NULL) { 6707 SPDK_ERRLOG("Missing event callback function\n"); 6708 return -EINVAL; 6709 } 6710 6711 pthread_mutex_lock(&g_bdev_mgr.mutex); 6712 6713 bdev = bdev_get_by_name(bdev_name); 6714 6715 if (bdev == NULL) { 6716 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 6717 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6718 return -ENODEV; 6719 } 6720 6721 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 6722 if (rc != 0) { 6723 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6724 return rc; 6725 } 6726 6727 rc = bdev_open(bdev, write, desc); 6728 if (rc != 0) { 6729 bdev_desc_free(desc); 6730 desc = NULL; 6731 } 6732 6733 *_desc = desc; 6734 6735 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6736 6737 return rc; 6738 } 6739 6740 static void 6741 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 6742 { 6743 int rc; 6744 6745 pthread_mutex_lock(&bdev->internal.mutex); 6746 pthread_mutex_lock(&desc->mutex); 6747 6748 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 6749 6750 desc->closed = true; 6751 6752 if (0 == desc->refs) { 6753 pthread_mutex_unlock(&desc->mutex); 6754 bdev_desc_free(desc); 6755 } else { 6756 pthread_mutex_unlock(&desc->mutex); 6757 } 6758 6759 /* If no more descriptors, kill QoS channel */ 6760 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6761 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 6762 bdev->name, spdk_get_thread()); 6763 6764 if (bdev_qos_destroy(bdev)) { 6765 /* There isn't anything we can do to recover here. Just let the 6766 * old QoS poller keep running. The QoS handling won't change 6767 * cores when the user allocates a new channel, but it won't break. */ 6768 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 6769 } 6770 } 6771 6772 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6773 rc = bdev_unregister_unsafe(bdev); 6774 pthread_mutex_unlock(&bdev->internal.mutex); 6775 6776 if (rc == 0) { 6777 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6778 } 6779 } else { 6780 pthread_mutex_unlock(&bdev->internal.mutex); 6781 } 6782 } 6783 6784 void 6785 spdk_bdev_close(struct spdk_bdev_desc *desc) 6786 { 6787 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6788 6789 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6790 spdk_get_thread()); 6791 6792 assert(desc->thread == spdk_get_thread()); 6793 6794 spdk_poller_unregister(&desc->io_timeout_poller); 6795 6796 pthread_mutex_lock(&g_bdev_mgr.mutex); 6797 6798 bdev_close(bdev, desc); 6799 6800 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6801 } 6802 6803 static void 6804 bdev_register_finished(void *arg) 6805 { 6806 struct spdk_bdev_desc *desc = arg; 6807 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6808 6809 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 6810 6811 bdev_close(bdev, desc); 6812 } 6813 6814 int 6815 spdk_bdev_register(struct spdk_bdev *bdev) 6816 { 6817 struct spdk_bdev_desc *desc; 6818 int rc; 6819 6820 rc = bdev_register(bdev); 6821 if (rc != 0) { 6822 return rc; 6823 } 6824 6825 /* A descriptor is opened to prevent bdev deletion during examination */ 6826 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 6827 if (rc != 0) { 6828 spdk_bdev_unregister(bdev, NULL, NULL); 6829 return rc; 6830 } 6831 6832 rc = bdev_open(bdev, false, desc); 6833 if (rc != 0) { 6834 bdev_desc_free(desc); 6835 spdk_bdev_unregister(bdev, NULL, NULL); 6836 return rc; 6837 } 6838 6839 /* Examine configuration before initializing I/O */ 6840 bdev_examine(bdev); 6841 6842 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 6843 if (rc != 0) { 6844 bdev_close(bdev, desc); 6845 spdk_bdev_unregister(bdev, NULL, NULL); 6846 } 6847 6848 return rc; 6849 } 6850 6851 int 6852 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 6853 struct spdk_bdev_module *module) 6854 { 6855 if (bdev->internal.claim_module != NULL) { 6856 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 6857 bdev->internal.claim_module->name); 6858 return -EPERM; 6859 } 6860 6861 if (desc && !desc->write) { 6862 desc->write = true; 6863 } 6864 6865 bdev->internal.claim_module = module; 6866 return 0; 6867 } 6868 6869 void 6870 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 6871 { 6872 assert(bdev->internal.claim_module != NULL); 6873 bdev->internal.claim_module = NULL; 6874 } 6875 6876 struct spdk_bdev * 6877 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 6878 { 6879 assert(desc != NULL); 6880 return desc->bdev; 6881 } 6882 6883 int 6884 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 6885 { 6886 struct spdk_bdev *bdev, *tmp; 6887 struct spdk_bdev_desc *desc; 6888 int rc = 0; 6889 6890 assert(fn != NULL); 6891 6892 pthread_mutex_lock(&g_bdev_mgr.mutex); 6893 bdev = spdk_bdev_first(); 6894 while (bdev != NULL) { 6895 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 6896 if (rc != 0) { 6897 break; 6898 } 6899 rc = bdev_open(bdev, false, desc); 6900 if (rc != 0) { 6901 bdev_desc_free(desc); 6902 break; 6903 } 6904 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6905 6906 rc = fn(ctx, bdev); 6907 6908 
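	/* Re-acquire the manager lock before advancing the iterator; the temporary
	 * descriptor opened above keeps the current bdev from being unregistered
	 * while fn() runs without the lock.
	 */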
pthread_mutex_lock(&g_bdev_mgr.mutex); 6909 tmp = spdk_bdev_next(bdev); 6910 bdev_close(bdev, desc); 6911 if (rc != 0) { 6912 break; 6913 } 6914 bdev = tmp; 6915 } 6916 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6917 6918 return rc; 6919 } 6920 6921 int 6922 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 6923 { 6924 struct spdk_bdev *bdev, *tmp; 6925 struct spdk_bdev_desc *desc; 6926 int rc = 0; 6927 6928 assert(fn != NULL); 6929 6930 pthread_mutex_lock(&g_bdev_mgr.mutex); 6931 bdev = spdk_bdev_first_leaf(); 6932 while (bdev != NULL) { 6933 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 6934 if (rc != 0) { 6935 break; 6936 } 6937 rc = bdev_open(bdev, false, desc); 6938 if (rc != 0) { 6939 bdev_desc_free(desc); 6940 break; 6941 } 6942 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6943 6944 rc = fn(ctx, bdev); 6945 6946 pthread_mutex_lock(&g_bdev_mgr.mutex); 6947 tmp = spdk_bdev_next_leaf(bdev); 6948 bdev_close(bdev, desc); 6949 if (rc != 0) { 6950 break; 6951 } 6952 bdev = tmp; 6953 } 6954 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6955 6956 return rc; 6957 } 6958 6959 void 6960 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 6961 { 6962 struct iovec *iovs; 6963 int iovcnt; 6964 6965 if (bdev_io == NULL) { 6966 return; 6967 } 6968 6969 switch (bdev_io->type) { 6970 case SPDK_BDEV_IO_TYPE_READ: 6971 case SPDK_BDEV_IO_TYPE_WRITE: 6972 case SPDK_BDEV_IO_TYPE_ZCOPY: 6973 iovs = bdev_io->u.bdev.iovs; 6974 iovcnt = bdev_io->u.bdev.iovcnt; 6975 break; 6976 default: 6977 iovs = NULL; 6978 iovcnt = 0; 6979 break; 6980 } 6981 6982 if (iovp) { 6983 *iovp = iovs; 6984 } 6985 if (iovcntp) { 6986 *iovcntp = iovcnt; 6987 } 6988 } 6989 6990 void * 6991 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 6992 { 6993 if (bdev_io == NULL) { 6994 return NULL; 6995 } 6996 6997 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 6998 return NULL; 6999 } 7000 7001 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 7002 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 7003 return bdev_io->u.bdev.md_buf; 7004 } 7005 7006 return NULL; 7007 } 7008 7009 void * 7010 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 7011 { 7012 if (bdev_io == NULL) { 7013 assert(false); 7014 return NULL; 7015 } 7016 7017 return bdev_io->internal.caller_ctx; 7018 } 7019 7020 void 7021 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 7022 { 7023 7024 if (spdk_bdev_module_list_find(bdev_module->name)) { 7025 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 7026 assert(false); 7027 } 7028 7029 /* 7030 * Modules with examine callbacks must be initialized first, so they are 7031 * ready to handle examine callbacks from later modules that will 7032 * register physical bdevs. 
7033 */ 7034 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 7035 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7036 } else { 7037 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7038 } 7039 } 7040 7041 struct spdk_bdev_module * 7042 spdk_bdev_module_list_find(const char *name) 7043 { 7044 struct spdk_bdev_module *bdev_module; 7045 7046 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 7047 if (strcmp(name, bdev_module->name) == 0) { 7048 break; 7049 } 7050 } 7051 7052 return bdev_module; 7053 } 7054 7055 static void 7056 bdev_write_zero_buffer_next(void *_bdev_io) 7057 { 7058 struct spdk_bdev_io *bdev_io = _bdev_io; 7059 uint64_t num_bytes, num_blocks; 7060 void *md_buf = NULL; 7061 int rc; 7062 7063 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 7064 bdev_io->u.bdev.split_remaining_num_blocks, 7065 ZERO_BUFFER_SIZE); 7066 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 7067 7068 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 7069 md_buf = (char *)g_bdev_mgr.zero_buffer + 7070 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 7071 } 7072 7073 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 7074 spdk_io_channel_from_ctx(bdev_io->internal.ch), 7075 g_bdev_mgr.zero_buffer, md_buf, 7076 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 7077 bdev_write_zero_buffer_done, bdev_io); 7078 if (rc == 0) { 7079 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 7080 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 7081 } else if (rc == -ENOMEM) { 7082 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 7083 } else { 7084 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7085 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 7086 } 7087 } 7088 7089 static void 7090 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 7091 { 7092 struct spdk_bdev_io *parent_io = cb_arg; 7093 7094 spdk_bdev_free_io(bdev_io); 7095 7096 if (!success) { 7097 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7098 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 7099 return; 7100 } 7101 7102 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 7103 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7104 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 7105 return; 7106 } 7107 7108 bdev_write_zero_buffer_next(parent_io); 7109 } 7110 7111 static void 7112 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 7113 { 7114 pthread_mutex_lock(&ctx->bdev->internal.mutex); 7115 ctx->bdev->internal.qos_mod_in_progress = false; 7116 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 7117 7118 if (ctx->cb_fn) { 7119 ctx->cb_fn(ctx->cb_arg, status); 7120 } 7121 free(ctx); 7122 } 7123 7124 static void 7125 bdev_disable_qos_done(void *cb_arg) 7126 { 7127 struct set_qos_limit_ctx *ctx = cb_arg; 7128 struct spdk_bdev *bdev = ctx->bdev; 7129 struct spdk_bdev_io *bdev_io; 7130 struct spdk_bdev_qos *qos; 7131 7132 pthread_mutex_lock(&bdev->internal.mutex); 7133 qos = bdev->internal.qos; 7134 bdev->internal.qos = NULL; 7135 pthread_mutex_unlock(&bdev->internal.mutex); 7136 7137 while (!TAILQ_EMPTY(&qos->queued)) { 7138 /* Send queued I/O back to their original thread for resubmission. 
*/ 7139 bdev_io = TAILQ_FIRST(&qos->queued); 7140 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 7141 7142 if (bdev_io->internal.io_submit_ch) { 7143 /* 7144 * Channel was changed when sending it to the QoS thread - change it back 7145 * before sending it back to the original thread. 7146 */ 7147 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 7148 bdev_io->internal.io_submit_ch = NULL; 7149 } 7150 7151 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7152 _bdev_io_submit, bdev_io); 7153 } 7154 7155 if (qos->thread != NULL) { 7156 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 7157 spdk_poller_unregister(&qos->poller); 7158 } 7159 7160 free(qos); 7161 7162 bdev_set_qos_limit_done(ctx, 0); 7163 } 7164 7165 static void 7166 bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 7167 { 7168 void *io_device = spdk_io_channel_iter_get_io_device(i); 7169 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 7170 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7171 struct spdk_thread *thread; 7172 7173 pthread_mutex_lock(&bdev->internal.mutex); 7174 thread = bdev->internal.qos->thread; 7175 pthread_mutex_unlock(&bdev->internal.mutex); 7176 7177 if (thread != NULL) { 7178 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 7179 } else { 7180 bdev_disable_qos_done(ctx); 7181 } 7182 } 7183 7184 static void 7185 bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 7186 { 7187 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 7188 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 7189 7190 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 7191 7192 spdk_for_each_channel_continue(i, 0); 7193 } 7194 7195 static void 7196 bdev_update_qos_rate_limit_msg(void *cb_arg) 7197 { 7198 struct set_qos_limit_ctx *ctx = cb_arg; 7199 struct spdk_bdev *bdev = ctx->bdev; 7200 7201 pthread_mutex_lock(&bdev->internal.mutex); 7202 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 7203 pthread_mutex_unlock(&bdev->internal.mutex); 7204 7205 bdev_set_qos_limit_done(ctx, 0); 7206 } 7207 7208 static void 7209 bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 7210 { 7211 void *io_device = spdk_io_channel_iter_get_io_device(i); 7212 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 7213 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 7214 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 7215 7216 pthread_mutex_lock(&bdev->internal.mutex); 7217 bdev_enable_qos(bdev, bdev_ch); 7218 pthread_mutex_unlock(&bdev->internal.mutex); 7219 spdk_for_each_channel_continue(i, 0); 7220 } 7221 7222 static void 7223 bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 7224 { 7225 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7226 7227 bdev_set_qos_limit_done(ctx, status); 7228 } 7229 7230 static void 7231 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 7232 { 7233 int i; 7234 7235 assert(bdev->internal.qos != NULL); 7236 7237 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7238 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7239 bdev->internal.qos->rate_limits[i].limit = limits[i]; 7240 7241 if (limits[i] == 0) { 7242 bdev->internal.qos->rate_limits[i].limit = 7243 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 7244 } 7245 } 7246 } 7247 } 7248 7249 void 7250 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 7251 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 7252 { 7253 struct set_qos_limit_ctx *ctx; 7254 uint32_t 
limit_set_complement; 7255 uint64_t min_limit_per_sec; 7256 int i; 7257 bool disable_rate_limit = true; 7258 7259 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7260 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7261 continue; 7262 } 7263 7264 if (limits[i] > 0) { 7265 disable_rate_limit = false; 7266 } 7267 7268 if (bdev_qos_is_iops_rate_limit(i) == true) { 7269 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 7270 } else { 7271 /* Change from megabyte to byte rate limit */ 7272 limits[i] = limits[i] * 1024 * 1024; 7273 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 7274 } 7275 7276 limit_set_complement = limits[i] % min_limit_per_sec; 7277 if (limit_set_complement) { 7278 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 7279 limits[i], min_limit_per_sec); 7280 limits[i] += min_limit_per_sec - limit_set_complement; 7281 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 7282 } 7283 } 7284 7285 ctx = calloc(1, sizeof(*ctx)); 7286 if (ctx == NULL) { 7287 cb_fn(cb_arg, -ENOMEM); 7288 return; 7289 } 7290 7291 ctx->cb_fn = cb_fn; 7292 ctx->cb_arg = cb_arg; 7293 ctx->bdev = bdev; 7294 7295 pthread_mutex_lock(&bdev->internal.mutex); 7296 if (bdev->internal.qos_mod_in_progress) { 7297 pthread_mutex_unlock(&bdev->internal.mutex); 7298 free(ctx); 7299 cb_fn(cb_arg, -EAGAIN); 7300 return; 7301 } 7302 bdev->internal.qos_mod_in_progress = true; 7303 7304 if (disable_rate_limit == true && bdev->internal.qos) { 7305 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7306 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 7307 (bdev->internal.qos->rate_limits[i].limit > 0 && 7308 bdev->internal.qos->rate_limits[i].limit != 7309 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 7310 disable_rate_limit = false; 7311 break; 7312 } 7313 } 7314 } 7315 7316 if (disable_rate_limit == false) { 7317 if (bdev->internal.qos == NULL) { 7318 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 7319 if (!bdev->internal.qos) { 7320 pthread_mutex_unlock(&bdev->internal.mutex); 7321 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 7322 bdev_set_qos_limit_done(ctx, -ENOMEM); 7323 return; 7324 } 7325 } 7326 7327 if (bdev->internal.qos->thread == NULL) { 7328 /* Enabling */ 7329 bdev_set_qos_rate_limits(bdev, limits); 7330 7331 spdk_for_each_channel(__bdev_to_io_dev(bdev), 7332 bdev_enable_qos_msg, ctx, 7333 bdev_enable_qos_done); 7334 } else { 7335 /* Updating */ 7336 bdev_set_qos_rate_limits(bdev, limits); 7337 7338 spdk_thread_send_msg(bdev->internal.qos->thread, 7339 bdev_update_qos_rate_limit_msg, ctx); 7340 } 7341 } else { 7342 if (bdev->internal.qos != NULL) { 7343 bdev_set_qos_rate_limits(bdev, limits); 7344 7345 /* Disabling */ 7346 spdk_for_each_channel(__bdev_to_io_dev(bdev), 7347 bdev_disable_qos_msg, ctx, 7348 bdev_disable_qos_msg_done); 7349 } else { 7350 pthread_mutex_unlock(&bdev->internal.mutex); 7351 bdev_set_qos_limit_done(ctx, 0); 7352 return; 7353 } 7354 } 7355 7356 pthread_mutex_unlock(&bdev->internal.mutex); 7357 } 7358 7359 struct spdk_bdev_histogram_ctx { 7360 spdk_bdev_histogram_status_cb cb_fn; 7361 void *cb_arg; 7362 struct spdk_bdev *bdev; 7363 int status; 7364 }; 7365 7366 static void 7367 bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status) 7368 { 7369 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7370 7371 pthread_mutex_lock(&ctx->bdev->internal.mutex); 7372 ctx->bdev->internal.histogram_in_progress = false; 7373 
pthread_mutex_unlock(&ctx->bdev->internal.mutex); 7374 ctx->cb_fn(ctx->cb_arg, ctx->status); 7375 free(ctx); 7376 } 7377 7378 static void 7379 bdev_histogram_disable_channel(struct spdk_io_channel_iter *i) 7380 { 7381 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7382 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7383 7384 if (ch->histogram != NULL) { 7385 spdk_histogram_data_free(ch->histogram); 7386 ch->histogram = NULL; 7387 } 7388 spdk_for_each_channel_continue(i, 0); 7389 } 7390 7391 static void 7392 bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status) 7393 { 7394 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7395 7396 if (status != 0) { 7397 ctx->status = status; 7398 ctx->bdev->internal.histogram_enabled = false; 7399 spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx, 7400 bdev_histogram_disable_channel_cb); 7401 } else { 7402 pthread_mutex_lock(&ctx->bdev->internal.mutex); 7403 ctx->bdev->internal.histogram_in_progress = false; 7404 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 7405 ctx->cb_fn(ctx->cb_arg, ctx->status); 7406 free(ctx); 7407 } 7408 } 7409 7410 static void 7411 bdev_histogram_enable_channel(struct spdk_io_channel_iter *i) 7412 { 7413 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7414 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7415 int status = 0; 7416 7417 if (ch->histogram == NULL) { 7418 ch->histogram = spdk_histogram_data_alloc(); 7419 if (ch->histogram == NULL) { 7420 status = -ENOMEM; 7421 } 7422 } 7423 7424 spdk_for_each_channel_continue(i, status); 7425 } 7426 7427 void 7428 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 7429 void *cb_arg, bool enable) 7430 { 7431 struct spdk_bdev_histogram_ctx *ctx; 7432 7433 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 7434 if (ctx == NULL) { 7435 cb_fn(cb_arg, -ENOMEM); 7436 return; 7437 } 7438 7439 ctx->bdev = bdev; 7440 ctx->status = 0; 7441 ctx->cb_fn = cb_fn; 7442 ctx->cb_arg = cb_arg; 7443 7444 pthread_mutex_lock(&bdev->internal.mutex); 7445 if (bdev->internal.histogram_in_progress) { 7446 pthread_mutex_unlock(&bdev->internal.mutex); 7447 free(ctx); 7448 cb_fn(cb_arg, -EAGAIN); 7449 return; 7450 } 7451 7452 bdev->internal.histogram_in_progress = true; 7453 pthread_mutex_unlock(&bdev->internal.mutex); 7454 7455 bdev->internal.histogram_enabled = enable; 7456 7457 if (enable) { 7458 /* Allocate histogram for each channel */ 7459 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx, 7460 bdev_histogram_enable_channel_cb); 7461 } else { 7462 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx, 7463 bdev_histogram_disable_channel_cb); 7464 } 7465 } 7466 7467 struct spdk_bdev_histogram_data_ctx { 7468 spdk_bdev_histogram_data_cb cb_fn; 7469 void *cb_arg; 7470 struct spdk_bdev *bdev; 7471 /** merged histogram data from all channels */ 7472 struct spdk_histogram_data *histogram; 7473 }; 7474 7475 static void 7476 bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status) 7477 { 7478 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7479 7480 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 7481 free(ctx); 7482 } 7483 7484 static void 7485 bdev_histogram_get_channel(struct spdk_io_channel_iter *i) 7486 { 7487 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7488 struct spdk_bdev_channel *ch = 
spdk_io_channel_get_ctx(_ch); 7489 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7490 int status = 0; 7491 7492 if (ch->histogram == NULL) { 7493 status = -EFAULT; 7494 } else { 7495 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 7496 } 7497 7498 spdk_for_each_channel_continue(i, status); 7499 } 7500 7501 void 7502 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 7503 spdk_bdev_histogram_data_cb cb_fn, 7504 void *cb_arg) 7505 { 7506 struct spdk_bdev_histogram_data_ctx *ctx; 7507 7508 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 7509 if (ctx == NULL) { 7510 cb_fn(cb_arg, -ENOMEM, NULL); 7511 return; 7512 } 7513 7514 ctx->bdev = bdev; 7515 ctx->cb_fn = cb_fn; 7516 ctx->cb_arg = cb_arg; 7517 7518 ctx->histogram = histogram; 7519 7520 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx, 7521 bdev_histogram_get_channel_cb); 7522 } 7523 7524 size_t 7525 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 7526 size_t max_events) 7527 { 7528 struct media_event_entry *entry; 7529 size_t num_events = 0; 7530 7531 for (; num_events < max_events; ++num_events) { 7532 entry = TAILQ_FIRST(&desc->pending_media_events); 7533 if (entry == NULL) { 7534 break; 7535 } 7536 7537 events[num_events] = entry->event; 7538 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 7539 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 7540 } 7541 7542 return num_events; 7543 } 7544 7545 int 7546 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 7547 size_t num_events) 7548 { 7549 struct spdk_bdev_desc *desc; 7550 struct media_event_entry *entry; 7551 size_t event_id; 7552 int rc = 0; 7553 7554 assert(bdev->media_events); 7555 7556 pthread_mutex_lock(&bdev->internal.mutex); 7557 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 7558 if (desc->write) { 7559 break; 7560 } 7561 } 7562 7563 if (desc == NULL || desc->media_events_buffer == NULL) { 7564 rc = -ENODEV; 7565 goto out; 7566 } 7567 7568 for (event_id = 0; event_id < num_events; ++event_id) { 7569 entry = TAILQ_FIRST(&desc->free_media_events); 7570 if (entry == NULL) { 7571 break; 7572 } 7573 7574 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 7575 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 7576 entry->event = events[event_id]; 7577 } 7578 7579 rc = event_id; 7580 out: 7581 pthread_mutex_unlock(&bdev->internal.mutex); 7582 return rc; 7583 } 7584 7585 void 7586 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 7587 { 7588 struct spdk_bdev_desc *desc; 7589 7590 pthread_mutex_lock(&bdev->internal.mutex); 7591 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 7592 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 7593 desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, 7594 desc->callback.ctx); 7595 } 7596 } 7597 pthread_mutex_unlock(&bdev->internal.mutex); 7598 } 7599 7600 struct locked_lba_range_ctx { 7601 struct lba_range range; 7602 struct spdk_bdev *bdev; 7603 struct lba_range *current_range; 7604 struct lba_range *owner_range; 7605 struct spdk_poller *poller; 7606 lock_range_cb cb_fn; 7607 void *cb_arg; 7608 }; 7609 7610 static void 7611 bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status) 7612 { 7613 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7614 7615 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 7616 free(ctx); 7617 } 7618 7619 static void 
struct locked_lba_range_ctx {
	struct lba_range range;
	struct spdk_bdev *bdev;
	struct lba_range *current_range;
	struct lba_range *owner_range;
	struct spdk_poller *poller;
	lock_range_cb cb_fn;
	void *cb_arg;
};

static void
bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status)
{
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cb_fn(ctx->cb_arg, -ENOMEM);
	free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i);

static void
bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status)
{
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = ctx->bdev;

	if (status == -ENOMEM) {
		/* One of the channels could not allocate a range object.
		 * So we have to go back and clean up any ranges that were
		 * allocated successfully before we return error status to
		 * the caller.  We can reuse the unlock function to do that
		 * clean up.
		 */
		spdk_for_each_channel(__bdev_to_io_dev(bdev),
				      bdev_unlock_lba_range_get_channel, ctx,
				      bdev_lock_error_cleanup_cb);
		return;
	}

	/* All channels have locked this range and no I/O overlapping the range
	 * is outstanding!  Set the owner_ch for the range object for the
	 * locking channel, so that this channel will know that it is allowed
	 * to write to this range.
	 */
	ctx->owner_range->owner_ch = ctx->range.owner_ch;
	ctx->cb_fn(ctx->cb_arg, status);

	/* Don't free the ctx here.  Its range is in the bdev's global list of
	 * locked ranges still, and will be removed and freed when this range
	 * is later unlocked.
	 */
}

static int
bdev_lock_lba_range_check_io(void *_i)
{
	struct spdk_io_channel_iter *i = _i;
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct lba_range *range = ctx->current_range;
	struct spdk_bdev_io *bdev_io;

	spdk_poller_unregister(&ctx->poller);

	/* The range is now in the locked_ranges, so no new IO can be submitted to this
	 * range.  But we need to wait until any outstanding IO overlapping with this range
	 * has completed.
	 */
	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
		if (bdev_io_range_is_locked(bdev_io, range)) {
			ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
			return SPDK_POLLER_BUSY;
		}
	}

	spdk_for_each_channel_continue(i, 0);
	return SPDK_POLLER_BUSY;
}
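/*
 * bdev_lock_lba_range_check_io() above re-registers itself every 100 microseconds
 * until no overlapping I/O remains on the channel, and only then lets the
 * spdk_for_each_channel() iteration continue.  A simplified variant of that
 * "poll until a condition clears" pattern, with purely hypothetical names, is
 * sketched here (kept out of the build with #if 0).
 */
#if 0
struct example_wait_ctx {
	struct spdk_poller *poller;
	bool (*condition_pending)(void *arg);
	void (*done)(void *arg);
	void *arg;
};

static int
example_wait_poller(void *_ctx)
{
	struct example_wait_ctx *ctx = _ctx;

	if (ctx->condition_pending(ctx->arg)) {
		/* Still busy - stay registered and check again on the next period. */
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->done(ctx->arg);
	return SPDK_POLLER_BUSY;
}

static void
example_wait_for_condition(struct example_wait_ctx *ctx)
{
	/* Re-check every 100 microseconds, as the range-lock poller does. */
	ctx->poller = SPDK_POLLER_REGISTER(example_wait_poller, ctx, 100);
}
#endif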
static void
bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->length == ctx->range.length &&
		    range->offset == ctx->range.offset &&
		    range->locked_ctx == ctx->range.locked_ctx) {
			/* This range already exists on this channel, so don't add
			 * it again.  This can happen when a new channel is created
			 * while the for_each_channel operation is in progress.
			 * Do not check for outstanding I/O in that case, since the
			 * range was locked before any I/O could be submitted to the
			 * new channel.
			 */
			spdk_for_each_channel_continue(i, 0);
			return;
		}
	}

	range = calloc(1, sizeof(*range));
	if (range == NULL) {
		spdk_for_each_channel_continue(i, -ENOMEM);
		return;
	}

	range->length = ctx->range.length;
	range->offset = ctx->range.offset;
	range->locked_ctx = ctx->range.locked_ctx;
	ctx->current_range = range;
	if (ctx->range.owner_ch == ch) {
		/* This is the range object for the channel that will hold
		 * the lock.  Store it in the ctx object so that we can easily
		 * set its owner_ch after the lock is finally acquired.
		 */
		ctx->owner_range = range;
	}
	TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
	bdev_lock_lba_range_check_io(i);
}

static void
bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
{
	assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel));

	/* We will add a copy of this range to each channel now. */
	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx,
			      bdev_lock_lba_range_cb);
}

static bool
bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
{
	struct lba_range *r;

	TAILQ_FOREACH(r, tailq, tailq) {
		if (bdev_lba_range_overlapped(range, r)) {
			return true;
		}
	}
	return false;
}

static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		    uint64_t offset, uint64_t length,
		    lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx;

	if (cb_arg == NULL) {
		SPDK_ERRLOG("cb_arg must not be NULL\n");
		return -EINVAL;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}

	ctx->range.offset = offset;
	ctx->range.length = length;
	ctx->range.owner_ch = ch;
	ctx->range.locked_ctx = cb_arg;
	ctx->bdev = bdev;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	pthread_mutex_lock(&bdev->internal.mutex);
	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
		/* There is an active lock overlapping with this range.
		 * Put it on the pending list until this range no
		 * longer overlaps with another.
		 */
		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
	} else {
		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
		bdev_lock_lba_range_ctx(bdev, ctx);
	}
	pthread_mutex_unlock(&bdev->internal.mutex);
	return 0;
}
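/*
 * Illustrative usage sketch (kept out of the build with #if 0): how an internal caller
 * in this file might take and release an LBA range lock around a read-modify-write.
 * Everything named example_* is hypothetical.  bdev_lock_lba_range() and
 * bdev_unlock_lba_range() are the static helpers in this file (the unlock helper is
 * defined further below), and the cb_arg passed at lock time doubles as the lock's
 * locked_ctx identity, so the matching unlock must pass the same pointer from the
 * same channel.
 */
#if 0
struct example_rmw_ctx {
	struct spdk_bdev_desc *desc;
	struct spdk_io_channel *ch;
	uint64_t offset_blocks;
	uint64_t num_blocks;
};

static void
example_rmw_unlock_done(void *cb_arg, int status)
{
	struct example_rmw_ctx *rmw = cb_arg;

	/* Range is unlocked; queued I/O overlapping it has been resubmitted. */
	(void)status;
	free(rmw);
}

static void
example_rmw_range_locked(void *cb_arg, int status)
{
	struct example_rmw_ctx *rmw = cb_arg;

	if (status != 0) {
		free(rmw);
		return;
	}

	/* ... perform the read-modify-write on
	 * [offset_blocks, offset_blocks + num_blocks) while the range is locked ...
	 */

	bdev_unlock_lba_range(rmw->desc, rmw->ch, rmw->offset_blocks, rmw->num_blocks,
			      example_rmw_unlock_done, rmw);
}

static int
example_rmw_start(struct example_rmw_ctx *rmw)
{
	/* cb_arg (rmw) identifies the lock owner and must match at unlock time. */
	return bdev_lock_lba_range(rmw->desc, rmw->ch, rmw->offset_blocks, rmw->num_blocks,
				   example_rmw_range_locked, rmw);
}
#endif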
static void
bdev_lock_lba_range_ctx_msg(void *_ctx)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	bdev_lock_lba_range_ctx(ctx->bdev, ctx);
}

static void
bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status)
{
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct locked_lba_range_ctx *pending_ctx;
	struct spdk_bdev_channel *ch = ctx->range.owner_ch;
	struct spdk_bdev *bdev = ch->bdev;
	struct lba_range *range, *tmp;

	pthread_mutex_lock(&bdev->internal.mutex);
	/* Check if there are any pending locked ranges that overlap with this range
	 * that was just unlocked.  If there are, check that it doesn't overlap with any
	 * other locked ranges before calling bdev_lock_lba_range_ctx which will start
	 * the lock process.
	 */
	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
		if (bdev_lba_range_overlapped(range, &ctx->range) &&
		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
			spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel),
					     bdev_lock_lba_range_ctx_msg, pending_ctx);
		}
	}
	pthread_mutex_unlock(&bdev->internal.mutex);

	ctx->cb_fn(ctx->cb_arg, status);
	free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	TAILQ_HEAD(, spdk_bdev_io) io_locked;
	struct spdk_bdev_io *bdev_io;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (ctx->range.offset == range->offset &&
		    ctx->range.length == range->length &&
		    ctx->range.locked_ctx == range->locked_ctx) {
			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
			free(range);
			break;
		}
	}

	/* Note: we should almost always be able to assert that the range specified
	 * was found.  But there are some very rare corner cases where a new channel
	 * gets created simultaneously with a range unlock, where this function
	 * would execute on that new channel and wouldn't have the range.
	 * We also use this to clean up range allocations when a later allocation
	 * fails in the locking path.
	 * So we can't actually assert() here.
	 */

	/* Swap the locked IO into a temporary list, and then try to submit them again.
	 * We could hyper-optimize this to only resubmit locked I/O that overlap
	 * with the range that was just unlocked, but this isn't a performance path so
	 * we go for simplicity here.
	 */
	TAILQ_INIT(&io_locked);
	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
	while (!TAILQ_EMPTY(&io_locked)) {
		bdev_io = TAILQ_FIRST(&io_locked);
		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
		bdev_io_submit(bdev_io);
	}

	spdk_for_each_channel_continue(i, 0);
}

static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx;
	struct lba_range *range;
	bool range_found = false;

	/* Let's make sure the specified channel actually has a lock on
	 * the specified range.  Note that the range must match exactly.
	 */
	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
			range_found = true;
			break;
		}
	}

	if (!range_found) {
		return -EINVAL;
	}

	pthread_mutex_lock(&bdev->internal.mutex);
	/* We confirmed that this channel has locked the specified range.  To
	 * start the unlock process, we find the range in the bdev's locked_ranges
	 * and remove it.  This ensures new channels don't inherit the locked range.
	 * Then we will send a message to each channel (including the one specified
	 * here) to remove the range from its per-channel list.
	 */
	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->locked_ctx == cb_arg) {
			break;
		}
	}
	if (range == NULL) {
		assert(false);
		pthread_mutex_unlock(&bdev->internal.mutex);
		return -EINVAL;
	}
	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
	pthread_mutex_unlock(&bdev->internal.mutex);

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx,
			      bdev_unlock_lba_range_cb);
	return 0;
}

int
spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
			     int array_size)
{
	if (!bdev) {
		return -EINVAL;
	}

	if (bdev->fn_table->get_memory_domains) {
		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
	}

	return 0;
}

struct spdk_bdev_for_each_io_ctx {
	void *ctx;
	spdk_bdev_io_fn fn;
	spdk_bdev_for_each_io_cb cb;
};

static void
bdev_channel_for_each_io(struct spdk_io_channel_iter *i)
{
	struct spdk_bdev_for_each_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch);
	struct spdk_bdev_io *bdev_io;
	int rc = 0;

	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
		rc = ctx->fn(ctx->ctx, bdev_io);
		if (rc != 0) {
			break;
		}
	}

	spdk_for_each_channel_continue(i, rc);
}

static void
bdev_for_each_io_done(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_for_each_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cb(ctx->ctx, status);

	free(ctx);
}

void
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
			   spdk_bdev_for_each_io_cb cb)
{
	struct spdk_bdev_for_each_io_ctx *ctx;

	assert(fn != NULL && cb != NULL);

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		SPDK_ERRLOG("Failed to allocate context.\n");
		cb(_ctx, -ENOMEM);
		return;
	}

	ctx->ctx = _ctx;
	ctx->fn = fn;
	ctx->cb = cb;

	spdk_for_each_channel(__bdev_to_io_dev(bdev),
			      bdev_channel_for_each_io,
			      ctx,
			      bdev_for_each_io_done);
}
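/*
 * Illustrative usage sketch (kept out of the build with #if 0): walking every I/O
 * currently submitted on every channel of a bdev with spdk_bdev_for_each_bdev_io(),
 * here simply to count them.  The example_* names and the counter struct are
 * hypothetical.  A non-zero return from the per-I/O callback stops the walk and is
 * reported as the status of the final completion callback.
 */
#if 0
struct example_io_count_ctx {
	uint64_t count;
};

static int
example_count_one_io(void *ctx, struct spdk_bdev_io *bdev_io)
{
	struct example_io_count_ctx *count_ctx = ctx;

	count_ctx->count++;
	/* Returning 0 keeps the iteration going. */
	return 0;
}

static void
example_count_done(void *ctx, int status)
{
	struct example_io_count_ctx *count_ctx = ctx;

	if (status == 0) {
		SPDK_NOTICELOG("%" PRIu64 " I/Os currently submitted\n", count_ctx->count);
	}
	free(count_ctx);
}

static void
example_count_outstanding_io(struct spdk_bdev *bdev)
{
	struct example_io_count_ctx *count_ctx;

	count_ctx = calloc(1, sizeof(*count_ctx));
	if (count_ctx == NULL) {
		return;
	}

	spdk_bdev_for_each_bdev_io(bdev, count_ctx, example_count_one_io, example_count_done);
}
#endif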
SPDK_LOG_REGISTER_COMPONENT(bdev)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_IO_START", TRACE_BDEV_IO_START,
			OWNER_BDEV, OBJECT_BDEV_IO, 1,
			{
				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
		{
			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
			OWNER_BDEV, OBJECT_BDEV_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
			OWNER_BDEV, OBJECT_NONE, 1,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
		{
			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
			OWNER_BDEV, OBJECT_NONE, 0,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
	};

	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
}