/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
63 */ 64 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 65 66 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 67 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 68 #ifdef DEBUG 69 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 70 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 71 #else 72 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 73 #endif 74 75 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 76 const char *detail, struct spdk_bdev *bdev); 77 78 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 79 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 80 }; 81 82 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 83 84 RB_HEAD(bdev_name_tree, spdk_bdev_name); 85 86 static int 87 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 88 { 89 return strcmp(name1->name, name2->name); 90 } 91 92 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 93 94 struct spdk_bdev_mgr { 95 struct spdk_mempool *bdev_io_pool; 96 97 void *zero_buffer; 98 99 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 100 101 struct spdk_bdev_list bdevs; 102 struct bdev_name_tree bdev_names; 103 104 bool init_complete; 105 bool module_init_complete; 106 107 struct spdk_spinlock spinlock; 108 109 TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens; 110 111 #ifdef SPDK_CONFIG_VTUNE 112 __itt_domain *domain; 113 #endif 114 }; 115 116 static struct spdk_bdev_mgr g_bdev_mgr = { 117 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 118 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 119 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 120 .init_complete = false, 121 .module_init_complete = false, 122 .async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens), 123 }; 124 125 static void 126 __attribute__((constructor)) 127 _bdev_init(void) 128 { 129 spdk_spin_init(&g_bdev_mgr.spinlock); 130 } 131 132 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status); 133 134 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 135 136 struct lba_range { 137 struct spdk_bdev *bdev; 138 uint64_t offset; 139 uint64_t length; 140 void *locked_ctx; 141 struct spdk_thread *owner_thread; 142 struct spdk_bdev_channel *owner_ch; 143 TAILQ_ENTRY(lba_range) tailq; 144 TAILQ_ENTRY(lba_range) tailq_module; 145 }; 146 147 static struct spdk_bdev_opts g_bdev_opts = { 148 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 149 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 150 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 151 }; 152 153 static spdk_bdev_init_cb g_init_cb_fn = NULL; 154 static void *g_init_cb_arg = NULL; 155 156 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 157 static void *g_fini_cb_arg = NULL; 158 static struct spdk_thread *g_fini_thread = NULL; 159 160 struct spdk_bdev_qos_limit { 161 /** IOs or bytes allowed per second (i.e., 1s). */ 162 uint64_t limit; 163 164 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 165 * For remaining bytes, allowed to run negative if an I/O is submitted when 166 * some bytes are remaining, but the I/O is bigger than that amount. The 167 * excess will be deducted from the next timeslice. 168 */ 169 int64_t remaining_this_timeslice; 170 171 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 172 uint32_t min_per_timeslice; 173 174 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). 
*/ 175 uint32_t max_per_timeslice; 176 177 /** Function to check whether to queue the IO. */ 178 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 179 180 /** Function to update for the submitted IO. */ 181 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 182 }; 183 184 struct spdk_bdev_qos { 185 /** Types of structure of rate limits. */ 186 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 187 188 /** The channel that all I/O are funneled through. */ 189 struct spdk_bdev_channel *ch; 190 191 /** The thread on which the poller is running. */ 192 struct spdk_thread *thread; 193 194 /** Queue of I/O waiting to be issued. */ 195 bdev_io_tailq_t queued; 196 197 /** Size of a timeslice in tsc ticks. */ 198 uint64_t timeslice_size; 199 200 /** Timestamp of start of last timeslice. */ 201 uint64_t last_timeslice; 202 203 /** Poller that processes queued I/O commands each time slice. */ 204 struct spdk_poller *poller; 205 }; 206 207 struct spdk_bdev_mgmt_channel { 208 /* 209 * Each thread keeps a cache of bdev_io - this allows 210 * bdev threads which are *not* DPDK threads to still 211 * benefit from a per-thread bdev_io cache. Without 212 * this, non-DPDK threads fetching from the mempool 213 * incur a cmpxchg on get and put. 214 */ 215 bdev_io_stailq_t per_thread_cache; 216 uint32_t per_thread_cache_count; 217 uint32_t bdev_io_cache_size; 218 219 struct spdk_iobuf_channel iobuf; 220 221 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 222 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 223 }; 224 225 /* 226 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 227 * will queue here their IO that awaits retry. It makes it possible to retry sending 228 * IO to one bdev after IO from other bdev completes. 229 */ 230 struct spdk_bdev_shared_resource { 231 /* The bdev management channel */ 232 struct spdk_bdev_mgmt_channel *mgmt_ch; 233 234 /* 235 * Count of I/O submitted to bdev module and waiting for completion. 236 * Incremented before submit_request() is called on an spdk_bdev_io. 237 */ 238 uint64_t io_outstanding; 239 240 /* 241 * Queue of IO awaiting retry because of a previous NOMEM status returned 242 * on this channel. 243 */ 244 bdev_io_tailq_t nomem_io; 245 246 /* 247 * Threshold which io_outstanding must drop to before retrying nomem_io. 248 */ 249 uint64_t nomem_threshold; 250 251 /* I/O channel allocated by a bdev module */ 252 struct spdk_io_channel *shared_ch; 253 254 struct spdk_poller *nomem_poller; 255 256 /* Refcount of bdev channels using this resource */ 257 uint32_t ref; 258 259 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 260 }; 261 262 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 263 #define BDEV_CH_QOS_ENABLED (1 << 1) 264 265 struct spdk_bdev_channel { 266 struct spdk_bdev *bdev; 267 268 /* The channel for the underlying device */ 269 struct spdk_io_channel *channel; 270 271 /* Accel channel */ 272 struct spdk_io_channel *accel_channel; 273 274 /* Per io_device per thread data */ 275 struct spdk_bdev_shared_resource *shared_resource; 276 277 struct spdk_bdev_io_stat *stat; 278 279 /* 280 * Count of I/O submitted to the underlying dev module through this channel 281 * and waiting for completion. 282 */ 283 uint64_t io_outstanding; 284 285 /* 286 * List of all submitted I/Os including I/O that are generated via splitting. 
287 */ 288 bdev_io_tailq_t io_submitted; 289 290 /* 291 * List of spdk_bdev_io that are currently queued because they write to a locked 292 * LBA range. 293 */ 294 bdev_io_tailq_t io_locked; 295 296 /* List of I/Os with accel sequence being currently executed */ 297 bdev_io_tailq_t io_accel_exec; 298 299 /* List of I/Os doing memory domain pull/push */ 300 bdev_io_tailq_t io_memory_domain; 301 302 uint32_t flags; 303 304 struct spdk_histogram_data *histogram; 305 306 #ifdef SPDK_CONFIG_VTUNE 307 uint64_t start_tsc; 308 uint64_t interval_tsc; 309 __itt_string_handle *handle; 310 struct spdk_bdev_io_stat *prev_stat; 311 #endif 312 313 bdev_io_tailq_t queued_resets; 314 315 lba_range_tailq_t locked_ranges; 316 }; 317 318 struct media_event_entry { 319 struct spdk_bdev_media_event event; 320 TAILQ_ENTRY(media_event_entry) tailq; 321 }; 322 323 #define MEDIA_EVENT_POOL_SIZE 64 324 325 struct spdk_bdev_desc { 326 struct spdk_bdev *bdev; 327 struct spdk_thread *thread; 328 struct { 329 spdk_bdev_event_cb_t event_fn; 330 void *ctx; 331 } callback; 332 bool closed; 333 bool write; 334 bool memory_domains_supported; 335 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 336 struct spdk_spinlock spinlock; 337 uint32_t refs; 338 TAILQ_HEAD(, media_event_entry) pending_media_events; 339 TAILQ_HEAD(, media_event_entry) free_media_events; 340 struct media_event_entry *media_events_buffer; 341 TAILQ_ENTRY(spdk_bdev_desc) link; 342 343 uint64_t timeout_in_sec; 344 spdk_bdev_io_timeout_cb cb_fn; 345 void *cb_arg; 346 struct spdk_poller *io_timeout_poller; 347 struct spdk_bdev_module_claim *claim; 348 }; 349 350 struct spdk_bdev_iostat_ctx { 351 struct spdk_bdev_io_stat *stat; 352 spdk_bdev_get_device_stat_cb cb; 353 void *cb_arg; 354 }; 355 356 struct set_qos_limit_ctx { 357 void (*cb_fn)(void *cb_arg, int status); 358 void *cb_arg; 359 struct spdk_bdev *bdev; 360 }; 361 362 struct spdk_bdev_channel_iter { 363 spdk_bdev_for_each_channel_msg fn; 364 spdk_bdev_for_each_channel_done cpl; 365 struct spdk_io_channel_iter *i; 366 void *ctx; 367 }; 368 369 struct spdk_bdev_io_error_stat { 370 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 371 }; 372 373 enum bdev_io_retry_state { 374 BDEV_IO_RETRY_STATE_INVALID, 375 BDEV_IO_RETRY_STATE_PULL, 376 BDEV_IO_RETRY_STATE_PULL_MD, 377 BDEV_IO_RETRY_STATE_SUBMIT, 378 BDEV_IO_RETRY_STATE_PUSH, 379 BDEV_IO_RETRY_STATE_PUSH_MD, 380 }; 381 382 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 383 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 384 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 385 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 386 387 static inline void bdev_io_complete(void *ctx); 388 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 389 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io); 390 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io); 391 392 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 393 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io); 394 395 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 396 struct spdk_io_channel *ch, void *_ctx); 397 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 398 399 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 400 struct 
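/*
 * Note on the __bdev_to_io_dev()/__bdev_from_io_dev() macros above: the bdev is
 * registered as an io_device using its own address offset by one byte.  This is
 * presumably so the io_device handle never collides with the bdev pointer itself
 * (which may be used as an io_device or lookup key elsewhere) while remaining
 * trivially convertible back to the bdev.
 */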
iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 401 uint64_t num_blocks, 402 struct spdk_memory_domain *domain, void *domain_ctx, 403 struct spdk_accel_sequence *seq, 404 spdk_bdev_io_completion_cb cb, void *cb_arg); 405 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 406 struct iovec *iov, int iovcnt, void *md_buf, 407 uint64_t offset_blocks, uint64_t num_blocks, 408 struct spdk_memory_domain *domain, void *domain_ctx, 409 struct spdk_accel_sequence *seq, 410 spdk_bdev_io_completion_cb cb, void *cb_arg); 411 412 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 413 uint64_t offset, uint64_t length, 414 lock_range_cb cb_fn, void *cb_arg); 415 416 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 417 uint64_t offset, uint64_t length, 418 lock_range_cb cb_fn, void *cb_arg); 419 420 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 421 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort); 422 423 static bool claim_type_is_v2(enum spdk_bdev_claim_type type); 424 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc); 425 static void claim_reset(struct spdk_bdev *bdev); 426 427 static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch); 428 429 #define bdev_get_ext_io_opt(opts, field, defval) \ 430 (((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \ 431 sizeof((opts)->field) <= (opts)->size) ? (opts)->field : (defval)) 432 433 void 434 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 435 { 436 if (!opts) { 437 SPDK_ERRLOG("opts should not be NULL\n"); 438 return; 439 } 440 441 if (!opts_size) { 442 SPDK_ERRLOG("opts_size should not be zero value\n"); 443 return; 444 } 445 446 opts->opts_size = opts_size; 447 448 #define SET_FIELD(field) \ 449 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 450 opts->field = g_bdev_opts.field; \ 451 } \ 452 453 SET_FIELD(bdev_io_pool_size); 454 SET_FIELD(bdev_io_cache_size); 455 SET_FIELD(bdev_auto_examine); 456 457 /* Do not remove this statement, you should always update this statement when you adding a new field, 458 * and do not forget to add the SET_FIELD statement for your added field. */ 459 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 460 461 #undef SET_FIELD 462 } 463 464 int 465 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 466 { 467 uint32_t min_pool_size; 468 469 if (!opts) { 470 SPDK_ERRLOG("opts cannot be NULL\n"); 471 return -1; 472 } 473 474 if (!opts->opts_size) { 475 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 476 return -1; 477 } 478 479 /* 480 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 481 * initialization. A second mgmt_ch will be created on the same thread when the application starts 482 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
483 */ 484 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 485 if (opts->bdev_io_pool_size < min_pool_size) { 486 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 487 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 488 spdk_thread_get_count()); 489 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 490 return -1; 491 } 492 493 #define SET_FIELD(field) \ 494 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 495 g_bdev_opts.field = opts->field; \ 496 } \ 497 498 SET_FIELD(bdev_io_pool_size); 499 SET_FIELD(bdev_io_cache_size); 500 SET_FIELD(bdev_auto_examine); 501 502 g_bdev_opts.opts_size = opts->opts_size; 503 504 #undef SET_FIELD 505 506 return 0; 507 } 508 509 static struct spdk_bdev * 510 bdev_get_by_name(const char *bdev_name) 511 { 512 struct spdk_bdev_name find; 513 struct spdk_bdev_name *res; 514 515 find.name = (char *)bdev_name; 516 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 517 if (res != NULL) { 518 return res->bdev; 519 } 520 521 return NULL; 522 } 523 524 struct spdk_bdev * 525 spdk_bdev_get_by_name(const char *bdev_name) 526 { 527 struct spdk_bdev *bdev; 528 529 spdk_spin_lock(&g_bdev_mgr.spinlock); 530 bdev = bdev_get_by_name(bdev_name); 531 spdk_spin_unlock(&g_bdev_mgr.spinlock); 532 533 return bdev; 534 } 535 536 struct bdev_io_status_string { 537 enum spdk_bdev_io_status status; 538 const char *str; 539 }; 540 541 static const struct bdev_io_status_string bdev_io_status_strings[] = { 542 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 543 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 544 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 545 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 546 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 547 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 548 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 549 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 550 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 551 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 552 }; 553 554 static const char * 555 bdev_io_status_get_string(enum spdk_bdev_io_status status) 556 { 557 uint32_t i; 558 559 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 560 if (bdev_io_status_strings[i].status == status) { 561 return bdev_io_status_strings[i].str; 562 } 563 } 564 565 return "reserved"; 566 } 567 568 struct spdk_bdev_wait_for_examine_ctx { 569 struct spdk_poller *poller; 570 spdk_bdev_wait_for_examine_cb cb_fn; 571 void *cb_arg; 572 }; 573 574 static bool bdev_module_all_actions_completed(void); 575 576 static int 577 bdev_wait_for_examine_cb(void *arg) 578 { 579 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 580 581 if (!bdev_module_all_actions_completed()) { 582 return SPDK_POLLER_IDLE; 583 } 584 585 spdk_poller_unregister(&ctx->poller); 586 ctx->cb_fn(ctx->cb_arg); 587 free(ctx); 588 589 return SPDK_POLLER_BUSY; 590 } 591 592 int 593 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 594 { 595 struct spdk_bdev_wait_for_examine_ctx *ctx; 596 597 ctx = calloc(1, sizeof(*ctx)); 598 if (ctx == NULL) { 599 return -ENOMEM; 600 } 601 ctx->cb_fn = cb_fn; 602 ctx->cb_arg = cb_arg; 603 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 604 605 return 0; 606 } 607 608 struct spdk_bdev_examine_item { 609 char *name; 610 TAILQ_ENTRY(spdk_bdev_examine_item) link; 611 }; 612 613 TAILQ_HEAD(spdk_bdev_examine_allowlist, 
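/*
 * Illustrative sketch (not part of this file): an application would typically tune
 * these options before subsystem initialization and, with auto-examine disabled,
 * trigger examination explicitly and wait for it to finish.  The bdev name
 * "malloc0" and the callback examine_done_cb() are hypothetical.
 *
 *	static void
 *	examine_done_cb(void *cb_arg)
 *	{
 *		SPDK_NOTICELOG("bdev examination finished\n");
 *	}
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_auto_examine = false;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		SPDK_ERRLOG("failed to set bdev options\n");
 *	}
 *
 *	(later, from the app thread, once the bdev has been created)
 *	spdk_bdev_examine("malloc0");
 *	spdk_bdev_wait_for_examine(examine_done_cb, NULL);
 */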
spdk_bdev_examine_item); 614 615 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 616 g_bdev_examine_allowlist); 617 618 static inline bool 619 bdev_examine_allowlist_check(const char *name) 620 { 621 struct spdk_bdev_examine_item *item; 622 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 623 if (strcmp(name, item->name) == 0) { 624 return true; 625 } 626 } 627 return false; 628 } 629 630 static inline void 631 bdev_examine_allowlist_free(void) 632 { 633 struct spdk_bdev_examine_item *item; 634 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 635 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 636 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 637 free(item->name); 638 free(item); 639 } 640 } 641 642 static inline bool 643 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 644 { 645 struct spdk_bdev_alias *tmp; 646 if (bdev_examine_allowlist_check(bdev->name)) { 647 return true; 648 } 649 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 650 if (bdev_examine_allowlist_check(tmp->alias.name)) { 651 return true; 652 } 653 } 654 return false; 655 } 656 657 static inline bool 658 bdev_ok_to_examine(struct spdk_bdev *bdev) 659 { 660 if (g_bdev_opts.bdev_auto_examine) { 661 return true; 662 } else { 663 return bdev_in_examine_allowlist(bdev); 664 } 665 } 666 667 static void 668 bdev_examine(struct spdk_bdev *bdev) 669 { 670 struct spdk_bdev_module *module; 671 struct spdk_bdev_module_claim *claim, *tmpclaim; 672 uint32_t action; 673 674 if (!bdev_ok_to_examine(bdev)) { 675 return; 676 } 677 678 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 679 if (module->examine_config) { 680 spdk_spin_lock(&module->internal.spinlock); 681 action = module->internal.action_in_progress; 682 module->internal.action_in_progress++; 683 spdk_spin_unlock(&module->internal.spinlock); 684 module->examine_config(bdev); 685 if (action != module->internal.action_in_progress) { 686 SPDK_ERRLOG("examine_config for module %s did not call " 687 "spdk_bdev_module_examine_done()\n", module->name); 688 } 689 } 690 } 691 692 spdk_spin_lock(&bdev->internal.spinlock); 693 694 switch (bdev->internal.claim_type) { 695 case SPDK_BDEV_CLAIM_NONE: 696 /* Examine by all bdev modules */ 697 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 698 if (module->examine_disk) { 699 spdk_spin_lock(&module->internal.spinlock); 700 module->internal.action_in_progress++; 701 spdk_spin_unlock(&module->internal.spinlock); 702 spdk_spin_unlock(&bdev->internal.spinlock); 703 module->examine_disk(bdev); 704 spdk_spin_lock(&bdev->internal.spinlock); 705 } 706 } 707 break; 708 case SPDK_BDEV_CLAIM_EXCL_WRITE: 709 /* Examine by the one bdev module with a v1 claim */ 710 module = bdev->internal.claim.v1.module; 711 if (module->examine_disk) { 712 spdk_spin_lock(&module->internal.spinlock); 713 module->internal.action_in_progress++; 714 spdk_spin_unlock(&module->internal.spinlock); 715 spdk_spin_unlock(&bdev->internal.spinlock); 716 module->examine_disk(bdev); 717 return; 718 } 719 break; 720 default: 721 /* Examine by all bdev modules with a v2 claim */ 722 assert(claim_type_is_v2(bdev->internal.claim_type)); 723 /* 724 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 725 * list, perhaps accessing freed memory. Without protection, this could happen 726 * while the lock is dropped during the examine callback. 
727 */ 728 bdev->internal.examine_in_progress++; 729 730 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 731 module = claim->module; 732 733 if (module == NULL) { 734 /* This is a vestigial claim, held by examine_count */ 735 continue; 736 } 737 738 if (module->examine_disk == NULL) { 739 continue; 740 } 741 742 spdk_spin_lock(&module->internal.spinlock); 743 module->internal.action_in_progress++; 744 spdk_spin_unlock(&module->internal.spinlock); 745 746 /* Call examine_disk without holding internal.spinlock. */ 747 spdk_spin_unlock(&bdev->internal.spinlock); 748 module->examine_disk(bdev); 749 spdk_spin_lock(&bdev->internal.spinlock); 750 } 751 752 assert(bdev->internal.examine_in_progress > 0); 753 bdev->internal.examine_in_progress--; 754 if (bdev->internal.examine_in_progress == 0) { 755 /* Remove any claims that were released during examine_disk */ 756 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 757 if (claim->desc != NULL) { 758 continue; 759 } 760 761 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 762 free(claim); 763 } 764 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 765 claim_reset(bdev); 766 } 767 } 768 } 769 770 spdk_spin_unlock(&bdev->internal.spinlock); 771 } 772 773 int 774 spdk_bdev_examine(const char *name) 775 { 776 struct spdk_bdev *bdev; 777 struct spdk_bdev_examine_item *item; 778 struct spdk_thread *thread = spdk_get_thread(); 779 780 if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) { 781 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 782 thread ? spdk_thread_get_name(thread) : "null"); 783 return -EINVAL; 784 } 785 786 if (g_bdev_opts.bdev_auto_examine) { 787 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 788 return -EINVAL; 789 } 790 791 if (bdev_examine_allowlist_check(name)) { 792 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 793 return -EEXIST; 794 } 795 796 item = calloc(1, sizeof(*item)); 797 if (!item) { 798 return -ENOMEM; 799 } 800 item->name = strdup(name); 801 if (!item->name) { 802 free(item); 803 return -ENOMEM; 804 } 805 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 806 807 bdev = spdk_bdev_get_by_name(name); 808 if (bdev) { 809 bdev_examine(bdev); 810 } 811 return 0; 812 } 813 814 static inline void 815 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 816 { 817 struct spdk_bdev_examine_item *item; 818 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 819 spdk_json_write_object_begin(w); 820 spdk_json_write_named_string(w, "method", "bdev_examine"); 821 spdk_json_write_named_object_begin(w, "params"); 822 spdk_json_write_named_string(w, "name", item->name); 823 spdk_json_write_object_end(w); 824 spdk_json_write_object_end(w); 825 } 826 } 827 828 struct spdk_bdev * 829 spdk_bdev_first(void) 830 { 831 struct spdk_bdev *bdev; 832 833 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 834 if (bdev) { 835 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 836 } 837 838 return bdev; 839 } 840 841 struct spdk_bdev * 842 spdk_bdev_next(struct spdk_bdev *prev) 843 { 844 struct spdk_bdev *bdev; 845 846 bdev = TAILQ_NEXT(prev, internal.link); 847 if (bdev) { 848 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 849 } 850 851 return bdev; 852 } 853 854 static struct spdk_bdev * 855 _bdev_next_leaf(struct spdk_bdev *bdev) 856 { 857 while (bdev != NULL) { 858 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 859 return bdev; 860 } else { 861 bdev = 
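/*
 * Illustrative sketch (not part of this file): iterating registered bdevs.
 * spdk_bdev_first()/spdk_bdev_next() visit every bdev, while the _leaf variants
 * skip bdevs that already carry a claim.
 *
 *	struct spdk_bdev *bdev;
 *
 *	for (bdev = spdk_bdev_first_leaf(); bdev != NULL; bdev = spdk_bdev_next_leaf(bdev)) {
 *		printf("unclaimed bdev: %s\n", spdk_bdev_get_name(bdev));
 *	}
 */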
TAILQ_NEXT(bdev, internal.link); 862 } 863 } 864 865 return bdev; 866 } 867 868 struct spdk_bdev * 869 spdk_bdev_first_leaf(void) 870 { 871 struct spdk_bdev *bdev; 872 873 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 874 875 if (bdev) { 876 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 877 } 878 879 return bdev; 880 } 881 882 struct spdk_bdev * 883 spdk_bdev_next_leaf(struct spdk_bdev *prev) 884 { 885 struct spdk_bdev *bdev; 886 887 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 888 889 if (bdev) { 890 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 891 } 892 893 return bdev; 894 } 895 896 static inline bool 897 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 898 { 899 return bdev_io->internal.memory_domain; 900 } 901 902 static inline bool 903 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 904 { 905 return bdev_io->internal.has_accel_sequence; 906 } 907 908 static inline void 909 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource, 910 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 911 { 912 /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io. 913 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth 914 * channels we will instead wait for half to complete. 915 */ 916 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 917 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 918 919 assert(state != BDEV_IO_RETRY_STATE_INVALID); 920 bdev_io->internal.retry_state = state; 921 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 922 } 923 924 static inline void 925 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource, 926 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 927 { 928 /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while 929 * the queue isn't empty, so we don't need to update the nomem_threshold here */ 930 assert(!TAILQ_EMPTY(&shared_resource->nomem_io)); 931 932 assert(state != BDEV_IO_RETRY_STATE_INVALID); 933 bdev_io->internal.retry_state = state; 934 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 935 } 936 937 void 938 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 939 { 940 struct iovec *iovs; 941 942 if (bdev_io->u.bdev.iovs == NULL) { 943 bdev_io->u.bdev.iovs = &bdev_io->iov; 944 bdev_io->u.bdev.iovcnt = 1; 945 } 946 947 iovs = bdev_io->u.bdev.iovs; 948 949 assert(iovs != NULL); 950 assert(bdev_io->u.bdev.iovcnt >= 1); 951 952 iovs[0].iov_base = buf; 953 iovs[0].iov_len = len; 954 } 955 956 void 957 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 958 { 959 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 960 bdev_io->u.bdev.md_buf = md_buf; 961 } 962 963 static bool 964 _is_buf_allocated(const struct iovec *iovs) 965 { 966 if (iovs == NULL) { 967 return false; 968 } 969 970 return iovs[0].iov_base != NULL; 971 } 972 973 static bool 974 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 975 { 976 int i; 977 uintptr_t iov_base; 978 979 if (spdk_likely(alignment == 1)) { 980 return true; 981 } 982 983 for (i = 0; i < iovcnt; i++) { 984 iov_base = (uintptr_t)iovs[i].iov_base; 985 if ((iov_base & (alignment - 1)) != 0) { 986 return false; 987 } 988 } 989 990 return true; 991 } 992 993 static inline bool 994 
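/*
 * Returns true when the I/O carries an accel sequence that cannot be handed to the
 * bdev module as-is: either the descriptor does not advertise accel sequence support
 * for this I/O type, or the I/O is going to be split.  In that case the bdev layer
 * executes the sequence itself, before the transfer for writes and after it for reads.
 */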
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 995 { 996 if (!bdev_io->internal.accel_sequence) { 997 return false; 998 } 999 1000 /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if 1001 * bdev module didn't support accel sequences */ 1002 return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split; 1003 } 1004 1005 static inline void 1006 bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch, 1007 struct spdk_bdev_shared_resource *shared_resource) 1008 { 1009 bdev_ch->io_outstanding++; 1010 shared_resource->io_outstanding++; 1011 } 1012 1013 static inline void 1014 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1015 struct spdk_bdev_shared_resource *shared_resource) 1016 { 1017 assert(bdev_ch->io_outstanding > 0); 1018 assert(shared_resource->io_outstanding > 0); 1019 bdev_ch->io_outstanding--; 1020 shared_resource->io_outstanding--; 1021 } 1022 1023 static void 1024 bdev_io_submit_sequence_cb(void *ctx, int status) 1025 { 1026 struct spdk_bdev_io *bdev_io = ctx; 1027 1028 bdev_io->u.bdev.accel_sequence = NULL; 1029 bdev_io->internal.accel_sequence = NULL; 1030 1031 if (spdk_unlikely(status != 0)) { 1032 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 1033 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1034 bdev_io_complete_unsubmitted(bdev_io); 1035 return; 1036 } 1037 1038 bdev_io_submit(bdev_io); 1039 } 1040 1041 static void 1042 bdev_io_exec_sequence_cb(void *ctx, int status) 1043 { 1044 struct spdk_bdev_io *bdev_io = ctx; 1045 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1046 1047 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1048 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1049 1050 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1051 bdev_ch_retry_io(ch); 1052 } 1053 1054 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1055 } 1056 1057 static void 1058 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status)) 1059 { 1060 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1061 1062 assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1063 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1064 1065 /* Since the operations are appended during submission, they're in the opposite order than 1066 * how we want to execute them for reads (i.e. we need to execute the most recently added 1067 * operation first), so reverse the sequence before executing it. 
1068 */ 1069 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1070 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1071 } 1072 1073 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1074 bdev_io_increment_outstanding(ch, ch->shared_resource); 1075 bdev_io->internal.data_transfer_cpl = cb_fn; 1076 1077 spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, 1078 bdev_io_exec_sequence_cb, bdev_io); 1079 } 1080 1081 static void 1082 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1083 { 1084 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1085 void *buf; 1086 1087 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1088 buf = bdev_io->internal.buf; 1089 bdev_io->internal.buf = NULL; 1090 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1091 bdev_io->internal.get_aux_buf_cb = NULL; 1092 } else { 1093 assert(bdev_io->internal.get_buf_cb != NULL); 1094 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1095 bdev_io->internal.get_buf_cb = NULL; 1096 } 1097 } 1098 1099 static void 1100 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1101 { 1102 struct spdk_bdev_io *bdev_io = ctx; 1103 1104 if (rc) { 1105 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1106 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1107 } 1108 bdev_io_get_buf_complete(bdev_io, !rc); 1109 } 1110 1111 static void 1112 bdev_io_pull_md_buf_done(void *ctx, int status) 1113 { 1114 struct spdk_bdev_io *bdev_io = ctx; 1115 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1116 1117 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1118 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1119 1120 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1121 bdev_ch_retry_io(ch); 1122 } 1123 1124 assert(bdev_io->internal.data_transfer_cpl); 1125 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1126 } 1127 1128 static void 1129 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io) 1130 { 1131 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1132 int rc = 0; 1133 1134 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1135 if (bdev_io_use_memory_domain(bdev_io)) { 1136 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1137 bdev_io_increment_outstanding(ch, ch->shared_resource); 1138 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1139 bdev_io->internal.memory_domain_ctx, 1140 &bdev_io->internal.orig_md_iov, 1, 1141 &bdev_io->internal.bounce_md_iov, 1, 1142 bdev_io_pull_md_buf_done, bdev_io); 1143 if (rc == 0) { 1144 /* Continue to submit IO in completion callback */ 1145 return; 1146 } 1147 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1148 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1149 if (rc != -ENOMEM) { 1150 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1151 spdk_memory_domain_get_dma_device_id( 1152 bdev_io->internal.memory_domain), rc); 1153 } 1154 } else { 1155 memcpy(bdev_io->internal.bounce_md_iov.iov_base, 1156 bdev_io->internal.orig_md_iov.iov_base, 1157 bdev_io->internal.orig_md_iov.iov_len); 1158 } 1159 } 1160 1161 if (spdk_unlikely(rc == -ENOMEM)) { 1162 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD); 1163 } else { 1164 assert(bdev_io->internal.data_transfer_cpl); 1165 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1166 } 1167 } 1168 1169 static void 1170 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1171 { 1172 /* save 
original md_buf */ 1173 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1174 bdev_io->internal.orig_md_iov.iov_len = len; 1175 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 1176 bdev_io->internal.bounce_md_iov.iov_len = len; 1177 /* set bounce md_buf */ 1178 bdev_io->u.bdev.md_buf = md_buf; 1179 1180 bdev_io_pull_md_buf(bdev_io); 1181 } 1182 1183 static void 1184 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1185 { 1186 struct spdk_bdev *bdev = bdev_io->bdev; 1187 uint64_t md_len; 1188 void *buf; 1189 1190 if (spdk_bdev_is_md_separate(bdev)) { 1191 assert(!bdev_io_use_accel_sequence(bdev_io)); 1192 1193 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1194 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1195 1196 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1197 1198 if (bdev_io->u.bdev.md_buf != NULL) { 1199 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1200 return; 1201 } else { 1202 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1203 } 1204 } 1205 1206 bdev_io_get_buf_complete(bdev_io, true); 1207 } 1208 1209 static inline void 1210 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc) 1211 { 1212 if (rc) { 1213 SPDK_ERRLOG("Failed to get data buffer\n"); 1214 assert(bdev_io->internal.data_transfer_cpl); 1215 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1216 return; 1217 } 1218 1219 _bdev_io_set_md_buf(bdev_io); 1220 } 1221 1222 static void 1223 bdev_io_pull_data_done_and_track(void *ctx, int status) 1224 { 1225 struct spdk_bdev_io *bdev_io = ctx; 1226 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1227 1228 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1229 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1230 1231 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1232 bdev_ch_retry_io(ch); 1233 } 1234 1235 bdev_io_pull_data_done(bdev_io, status); 1236 } 1237 1238 static void 1239 bdev_io_pull_data(struct spdk_bdev_io *bdev_io) 1240 { 1241 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1242 int rc = 0; 1243 1244 /* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a 1245 * sequence, append a copy operation making accel change the src/dst buffers of the previous 1246 * operation */ 1247 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) || 1248 (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) { 1249 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1250 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1251 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1252 NULL, NULL, 1253 bdev_io->internal.orig_iovs, 1254 bdev_io->internal.orig_iovcnt, 1255 bdev_io->internal.memory_domain, 1256 bdev_io->internal.memory_domain_ctx, 1257 0, NULL, NULL); 1258 } else { 1259 /* We need to reverse the src/dst for reads */ 1260 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1261 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1262 bdev_io->internal.orig_iovs, 1263 bdev_io->internal.orig_iovcnt, 1264 bdev_io->internal.memory_domain, 1265 bdev_io->internal.memory_domain_ctx, 1266 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1267 NULL, NULL, 0, NULL, NULL); 1268 } 1269 1270 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 1271 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1272 bdev_io->internal.accel_sequence); 1273 } 1274 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1275 /* if this is write path, 
copy data from original buffer to bounce buffer */ 1276 if (bdev_io_use_memory_domain(bdev_io)) { 1277 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1278 bdev_io_increment_outstanding(ch, ch->shared_resource); 1279 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1280 bdev_io->internal.memory_domain_ctx, 1281 bdev_io->internal.orig_iovs, 1282 (uint32_t) bdev_io->internal.orig_iovcnt, 1283 bdev_io->u.bdev.iovs, 1, 1284 bdev_io_pull_data_done_and_track, 1285 bdev_io); 1286 if (rc == 0) { 1287 /* Continue to submit IO in completion callback */ 1288 return; 1289 } 1290 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1291 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1292 if (rc != -ENOMEM) { 1293 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1294 spdk_memory_domain_get_dma_device_id( 1295 bdev_io->internal.memory_domain)); 1296 } 1297 } else { 1298 assert(bdev_io->u.bdev.iovcnt == 1); 1299 spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base, 1300 bdev_io->u.bdev.iovs[0].iov_len, 1301 bdev_io->internal.orig_iovs, 1302 bdev_io->internal.orig_iovcnt); 1303 } 1304 } 1305 1306 if (spdk_unlikely(rc == -ENOMEM)) { 1307 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1308 } else { 1309 bdev_io_pull_data_done(bdev_io, rc); 1310 } 1311 } 1312 1313 static void 1314 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1315 bdev_copy_bounce_buffer_cpl cpl_cb) 1316 { 1317 struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource; 1318 1319 bdev_io->internal.data_transfer_cpl = cpl_cb; 1320 /* save original iovec */ 1321 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 1322 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1323 /* set bounce iov */ 1324 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 1325 bdev_io->u.bdev.iovcnt = 1; 1326 /* set bounce buffer for this operation */ 1327 bdev_io->u.bdev.iovs[0].iov_base = buf; 1328 bdev_io->u.bdev.iovs[0].iov_len = len; 1329 1330 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1331 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1332 } else { 1333 bdev_io_pull_data(bdev_io); 1334 } 1335 } 1336 1337 static void 1338 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1339 { 1340 struct spdk_bdev *bdev = bdev_io->bdev; 1341 bool buf_allocated; 1342 uint64_t alignment; 1343 void *aligned_buf; 1344 1345 bdev_io->internal.buf = buf; 1346 1347 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1348 bdev_io_get_buf_complete(bdev_io, true); 1349 return; 1350 } 1351 1352 alignment = spdk_bdev_get_buf_align(bdev); 1353 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1354 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1355 1356 if (buf_allocated) { 1357 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1358 /* Continue in completion callback */ 1359 return; 1360 } else { 1361 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1362 } 1363 1364 _bdev_io_set_md_buf(bdev_io); 1365 } 1366 1367 static inline uint64_t 1368 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1369 { 1370 struct spdk_bdev *bdev = bdev_io->bdev; 1371 uint64_t md_len, alignment; 1372 1373 md_len = spdk_bdev_is_md_separate(bdev) ? 
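/* Worked example for the calculation in this function (numbers are illustrative):
 * an 8-block read of 4096-byte blocks on a bdev with 64-byte buffer alignment and
 * 8 bytes of separate metadata per block asks the iobuf pool for
 * 8 * 4096 + 63 + 8 * 8 = 32895 bytes.
 */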
bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1374 1375 /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */ 1376 alignment = spdk_bdev_get_buf_align(bdev) - 1; 1377 1378 return len + alignment + md_len; 1379 } 1380 1381 static void 1382 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1383 { 1384 struct spdk_bdev_mgmt_channel *ch; 1385 1386 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1387 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1388 } 1389 1390 static void 1391 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1392 { 1393 assert(bdev_io->internal.buf != NULL); 1394 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1395 bdev_io->internal.buf = NULL; 1396 } 1397 1398 void 1399 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1400 { 1401 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1402 1403 assert(buf != NULL); 1404 _bdev_io_put_buf(bdev_io, buf, len); 1405 } 1406 1407 static inline void 1408 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, 1409 struct spdk_bdev_io *bdev_io) 1410 { 1411 /* After a request is submitted to a bdev module, the ownership of an accel sequence 1412 * associated with that bdev_io is transferred to the bdev module. So, clear the internal 1413 * sequence pointer to make sure we won't touch it anymore. */ 1414 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || 1415 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { 1416 assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1417 bdev_io->internal.accel_sequence = NULL; 1418 } 1419 1420 bdev->fn_table->submit_request(ioch, bdev_io); 1421 } 1422 1423 static inline void 1424 bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io) 1425 { 1426 struct spdk_bdev *bdev = bdev_io->bdev; 1427 1428 bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource); 1429 bdev_io->internal.error.nvme.cdw0 = 0; 1430 bdev_io->num_retries++; 1431 bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1432 } 1433 1434 static void 1435 bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource) 1436 { 1437 struct spdk_bdev_io *bdev_io; 1438 1439 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1440 /* 1441 * Allow some more I/O to complete before retrying the nomem_io queue. 1442 * Some drivers (such as nvme) cannot immediately take a new I/O in 1443 * the context of a completion, because the resources for the I/O are 1444 * not released until control returns to the bdev poller. Also, we 1445 * may require several small I/O to complete before a larger I/O 1446 * (that requires splitting) can be submitted. 
 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);

		switch (bdev_io->internal.retry_state) {
		case BDEV_IO_RETRY_STATE_SUBMIT:
			bdev_ch_resubmit_io(shared_resource, bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL:
			bdev_io_pull_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL_MD:
			bdev_io_pull_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH:
			bdev_io_push_bounce_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH_MD:
			bdev_io_push_bounce_md_buf(bdev_io);
			break;
		default:
			assert(0 && "invalid retry state");
			break;
		}

		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore.  Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
}

static int
bdev_no_mem_poller(void *ctx)
{
	struct spdk_bdev_shared_resource *shared_resource = ctx;

	spdk_poller_unregister(&shared_resource->nomem_poller);

	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_shared_ch_retry_io(shared_resource);
	}
	if (!TAILQ_EMPTY(&shared_resource->nomem_io) && shared_resource->io_outstanding == 0) {
		/* No IOs were submitted, try again */
		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
				SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
	}

	return SPDK_POLLER_BUSY;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);

		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
			/* Special case: there are queued NOMEM I/Os, but no outstanding I/Os whose
			 * completions could trigger a retry of the queued I/Os.  Any new submission
			 * would trigger a retry, but this poller covers the case where nothing new
			 * gets submitted, e.g. at queue depth 1.
			 */
			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
					SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
		}
		/* If the bdev module completed an I/O that has an accel sequence with NOMEM status,
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted.
*/ 1534 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1535 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1536 assert(bdev_io->internal.accel_sequence == NULL); 1537 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1538 } 1539 1540 return true; 1541 } 1542 1543 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1544 bdev_ch_retry_io(bdev_ch); 1545 } 1546 1547 return false; 1548 } 1549 1550 static void 1551 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1552 { 1553 struct spdk_bdev_io *bdev_io = ctx; 1554 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1555 1556 if (rc) { 1557 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1558 } 1559 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1560 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 1561 */ 1562 bdev_io_put_buf(bdev_io); 1563 1564 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1565 bdev_ch_retry_io(ch); 1566 } 1567 1568 /* Continue with IO completion flow */ 1569 bdev_io_complete(bdev_io); 1570 } 1571 1572 static void 1573 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1574 { 1575 struct spdk_bdev_io *bdev_io = ctx; 1576 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1577 1578 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1579 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1580 1581 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1582 bdev_ch_retry_io(ch); 1583 } 1584 1585 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1586 } 1587 1588 static inline void 1589 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1590 { 1591 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1592 int rc = 0; 1593 1594 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1595 /* do the same for metadata buffer */ 1596 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1597 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1598 1599 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1600 if (bdev_io_use_memory_domain(bdev_io)) { 1601 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1602 bdev_io_increment_outstanding(ch, ch->shared_resource); 1603 /* If memory domain is used then we need to call async push function */ 1604 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1605 bdev_io->internal.memory_domain_ctx, 1606 &bdev_io->internal.orig_md_iov, 1607 (uint32_t)bdev_io->internal.orig_iovcnt, 1608 &bdev_io->internal.bounce_md_iov, 1, 1609 bdev_io_push_bounce_md_buf_done, 1610 bdev_io); 1611 if (rc == 0) { 1612 /* Continue IO completion in async callback */ 1613 return; 1614 } 1615 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1616 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1617 if (rc != -ENOMEM) { 1618 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1619 spdk_memory_domain_get_dma_device_id( 1620 bdev_io->internal.memory_domain)); 1621 } 1622 } else { 1623 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1624 bdev_io->internal.orig_md_iov.iov_len); 1625 } 1626 } 1627 } 1628 1629 if (spdk_unlikely(rc == -ENOMEM)) { 1630 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1631 } else { 1632 assert(bdev_io->internal.data_transfer_cpl); 1633 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1634 } 1635 } 1636 1637 static inline void 1638 bdev_io_push_bounce_data_done(struct 
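/*
 * Completion path for an I/O that used a bounce buffer: bdev_io_push_bounce_data()
 * copies (or pushes via the memory domain) data back into the caller's buffers,
 * then bdev_io_push_bounce_md_buf() does the same for separate metadata, and
 * finally the data_transfer_cpl callback releases the bounce buffer and finishes
 * the I/O.
 */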
spdk_bdev_io *bdev_io, int rc) 1639 { 1640 assert(bdev_io->internal.data_transfer_cpl); 1641 if (rc) { 1642 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1643 return; 1644 } 1645 1646 /* set original buffer for this io */ 1647 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1648 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1649 /* disable bouncing buffer for this io */ 1650 bdev_io->internal.orig_iovcnt = 0; 1651 bdev_io->internal.orig_iovs = NULL; 1652 1653 bdev_io_push_bounce_md_buf(bdev_io); 1654 } 1655 1656 static void 1657 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1658 { 1659 struct spdk_bdev_io *bdev_io = ctx; 1660 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1661 1662 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1663 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1664 1665 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1666 bdev_ch_retry_io(ch); 1667 } 1668 1669 bdev_io_push_bounce_data_done(bdev_io, status); 1670 } 1671 1672 static inline void 1673 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1674 { 1675 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1676 int rc = 0; 1677 1678 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1679 assert(!bdev_io_use_accel_sequence(bdev_io)); 1680 1681 /* if this is read path, copy data from bounce buffer to original buffer */ 1682 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1683 if (bdev_io_use_memory_domain(bdev_io)) { 1684 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1685 bdev_io_increment_outstanding(ch, ch->shared_resource); 1686 /* If memory domain is used then we need to call async push function */ 1687 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1688 bdev_io->internal.memory_domain_ctx, 1689 bdev_io->internal.orig_iovs, 1690 (uint32_t)bdev_io->internal.orig_iovcnt, 1691 &bdev_io->internal.bounce_iov, 1, 1692 bdev_io_push_bounce_data_done_and_track, 1693 bdev_io); 1694 if (rc == 0) { 1695 /* Continue IO completion in async callback */ 1696 return; 1697 } 1698 1699 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1700 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1701 if (rc != -ENOMEM) { 1702 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1703 spdk_memory_domain_get_dma_device_id( 1704 bdev_io->internal.memory_domain)); 1705 } 1706 } else { 1707 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1708 bdev_io->internal.orig_iovcnt, 1709 bdev_io->internal.bounce_iov.iov_base, 1710 bdev_io->internal.bounce_iov.iov_len); 1711 } 1712 } 1713 1714 if (spdk_unlikely(rc == -ENOMEM)) { 1715 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1716 } else { 1717 bdev_io_push_bounce_data_done(bdev_io, rc); 1718 } 1719 } 1720 1721 static inline void 1722 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1723 { 1724 bdev_io->internal.data_transfer_cpl = cpl_cb; 1725 bdev_io_push_bounce_data(bdev_io); 1726 } 1727 1728 static void 1729 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1730 { 1731 struct spdk_bdev_io *bdev_io; 1732 1733 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1734 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1735 } 1736 1737 static void 1738 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1739 { 1740 struct spdk_bdev_mgmt_channel *mgmt_ch; 1741 uint64_t max_len; 1742 void *buf; 1743 1744 
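/*
 * Illustrative sketch (not part of this file): a bdev module that needs a data
 * buffer for reads usually defers allocation to the bdev layer from its
 * submit_request() callback.  The names my_submit_request() and my_read_get_buf_cb()
 * are hypothetical.
 *
 *	static void
 *	my_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
 *	{
 *		if (!success) {
 *			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 *			return;
 *		}
 *		(issue the read into bdev_io->u.bdev.iovs here)
 *	}
 *
 *	In my_submit_request(), for SPDK_BDEV_IO_TYPE_READ:
 *		spdk_bdev_io_get_buf(bdev_io, my_read_get_buf_cb,
 *				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 */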
assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1745 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1746 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1747 1748 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1749 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1750 bdev_io_get_buf_complete(bdev_io, false); 1751 return; 1752 } 1753 1754 bdev_io->internal.buf_len = len; 1755 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1756 bdev_io_get_iobuf_cb); 1757 if (buf != NULL) { 1758 _bdev_io_set_buf(bdev_io, buf, len); 1759 } 1760 } 1761 1762 void 1763 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1764 { 1765 struct spdk_bdev *bdev = bdev_io->bdev; 1766 uint64_t alignment; 1767 1768 assert(cb != NULL); 1769 bdev_io->internal.get_buf_cb = cb; 1770 1771 alignment = spdk_bdev_get_buf_align(bdev); 1772 1773 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1774 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1775 /* Buffer already present and aligned */ 1776 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1777 return; 1778 } 1779 1780 bdev_io_get_buf(bdev_io, len); 1781 } 1782 1783 static void 1784 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1785 bool success) 1786 { 1787 if (!success) { 1788 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1789 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1790 bdev_io_complete_unsubmitted(bdev_io); 1791 return; 1792 } 1793 1794 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1795 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1796 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1797 return; 1798 } 1799 /* For reads we'll execute the sequence after the data is read, so, for now, only 1800 * clear out accel_sequence pointer and submit the IO */ 1801 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1802 bdev_io->u.bdev.accel_sequence = NULL; 1803 } 1804 1805 bdev_io_submit(bdev_io); 1806 } 1807 1808 static void 1809 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1810 uint64_t len) 1811 { 1812 assert(cb != NULL); 1813 bdev_io->internal.get_buf_cb = cb; 1814 1815 bdev_io_get_buf(bdev_io, len); 1816 } 1817 1818 void 1819 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1820 { 1821 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1822 1823 assert(cb != NULL); 1824 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1825 bdev_io->internal.get_aux_buf_cb = cb; 1826 bdev_io_get_buf(bdev_io, len); 1827 } 1828 1829 static int 1830 bdev_module_get_max_ctx_size(void) 1831 { 1832 struct spdk_bdev_module *bdev_module; 1833 int max_bdev_module_size = 0; 1834 1835 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1836 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1837 max_bdev_module_size = bdev_module->get_ctx_size(); 1838 } 1839 } 1840 1841 return max_bdev_module_size; 1842 } 1843 1844 static void 1845 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1846 { 1847 if (!bdev->internal.histogram_enabled) { 1848 return; 1849 } 1850 1851 spdk_json_write_object_begin(w); 1852 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 1853 1854 spdk_json_write_named_object_begin(w, "params"); 1855 
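/*
 * The object emitted by this function has the shape:
 *
 *	{
 *		"method": "bdev_enable_histogram",
 *		"params": { "name": "<bdev name>", "enable": true }
 *	}
 */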
spdk_json_write_named_string(w, "name", bdev->name); 1856 1857 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1858 spdk_json_write_object_end(w); 1859 1860 spdk_json_write_object_end(w); 1861 } 1862 1863 static void 1864 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1865 { 1866 int i; 1867 struct spdk_bdev_qos *qos = bdev->internal.qos; 1868 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1869 1870 if (!qos) { 1871 return; 1872 } 1873 1874 spdk_bdev_get_qos_rate_limits(bdev, limits); 1875 1876 spdk_json_write_object_begin(w); 1877 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1878 1879 spdk_json_write_named_object_begin(w, "params"); 1880 spdk_json_write_named_string(w, "name", bdev->name); 1881 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1882 if (limits[i] > 0) { 1883 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1884 } 1885 } 1886 spdk_json_write_object_end(w); 1887 1888 spdk_json_write_object_end(w); 1889 } 1890 1891 void 1892 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1893 { 1894 struct spdk_bdev_module *bdev_module; 1895 struct spdk_bdev *bdev; 1896 1897 assert(w != NULL); 1898 1899 spdk_json_write_array_begin(w); 1900 1901 spdk_json_write_object_begin(w); 1902 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1903 spdk_json_write_named_object_begin(w, "params"); 1904 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1905 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1906 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1907 spdk_json_write_object_end(w); 1908 spdk_json_write_object_end(w); 1909 1910 bdev_examine_allowlist_config_json(w); 1911 1912 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1913 if (bdev_module->config_json) { 1914 bdev_module->config_json(w); 1915 } 1916 } 1917 1918 spdk_spin_lock(&g_bdev_mgr.spinlock); 1919 1920 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1921 if (bdev->fn_table->write_config_json) { 1922 bdev->fn_table->write_config_json(bdev, w); 1923 } 1924 1925 bdev_qos_config_json(bdev, w); 1926 bdev_enable_histogram_config_json(bdev, w); 1927 } 1928 1929 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1930 1931 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1932 spdk_json_write_object_begin(w); 1933 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1934 spdk_json_write_object_end(w); 1935 1936 spdk_json_write_array_end(w); 1937 } 1938 1939 static void 1940 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1941 { 1942 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1943 struct spdk_bdev_io *bdev_io; 1944 1945 spdk_iobuf_channel_fini(&ch->iobuf); 1946 1947 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1948 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1949 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1950 ch->per_thread_cache_count--; 1951 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1952 } 1953 1954 assert(ch->per_thread_cache_count == 0); 1955 } 1956 1957 static int 1958 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1959 { 1960 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1961 struct spdk_bdev_io *bdev_io; 1962 uint32_t i; 1963 int rc; 1964 1965 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE); 1966 if (rc != 0) { 1967 
SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1968 return -1; 1969 } 1970 1971 STAILQ_INIT(&ch->per_thread_cache); 1972 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1973 1974 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1975 ch->per_thread_cache_count = 0; 1976 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1977 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1978 if (bdev_io == NULL) { 1979 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1980 assert(false); 1981 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1982 return -1; 1983 } 1984 ch->per_thread_cache_count++; 1985 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1986 } 1987 1988 TAILQ_INIT(&ch->shared_resources); 1989 TAILQ_INIT(&ch->io_wait_queue); 1990 1991 return 0; 1992 } 1993 1994 static void 1995 bdev_init_complete(int rc) 1996 { 1997 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1998 void *cb_arg = g_init_cb_arg; 1999 struct spdk_bdev_module *m; 2000 2001 g_bdev_mgr.init_complete = true; 2002 g_init_cb_fn = NULL; 2003 g_init_cb_arg = NULL; 2004 2005 /* 2006 * For modules that need to know when subsystem init is complete, 2007 * inform them now. 2008 */ 2009 if (rc == 0) { 2010 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2011 if (m->init_complete) { 2012 m->init_complete(); 2013 } 2014 } 2015 } 2016 2017 cb_fn(cb_arg, rc); 2018 } 2019 2020 static bool 2021 bdev_module_all_actions_completed(void) 2022 { 2023 struct spdk_bdev_module *m; 2024 2025 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2026 if (m->internal.action_in_progress > 0) { 2027 return false; 2028 } 2029 } 2030 return true; 2031 } 2032 2033 static void 2034 bdev_module_action_complete(void) 2035 { 2036 /* 2037 * Don't finish bdev subsystem initialization if 2038 * module pre-initialization is still in progress, or 2039 * the subsystem been already initialized. 2040 */ 2041 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2042 return; 2043 } 2044 2045 /* 2046 * Check all bdev modules for inits/examinations in progress. If any 2047 * exist, return immediately since we cannot finish bdev subsystem 2048 * initialization until all are completed. 2049 */ 2050 if (!bdev_module_all_actions_completed()) { 2051 return; 2052 } 2053 2054 /* 2055 * Modules already finished initialization - now that all 2056 * the bdev modules have finished their asynchronous I/O 2057 * processing, the entire bdev layer can be marked as complete. 
2058 */ 2059 bdev_init_complete(0); 2060 } 2061 2062 static void 2063 bdev_module_action_done(struct spdk_bdev_module *module) 2064 { 2065 spdk_spin_lock(&module->internal.spinlock); 2066 assert(module->internal.action_in_progress > 0); 2067 module->internal.action_in_progress--; 2068 spdk_spin_unlock(&module->internal.spinlock); 2069 bdev_module_action_complete(); 2070 } 2071 2072 void 2073 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2074 { 2075 assert(module->async_init); 2076 bdev_module_action_done(module); 2077 } 2078 2079 void 2080 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2081 { 2082 bdev_module_action_done(module); 2083 } 2084 2085 /** The last initialized bdev module */ 2086 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2087 2088 static void 2089 bdev_init_failed(void *cb_arg) 2090 { 2091 struct spdk_bdev_module *module = cb_arg; 2092 2093 spdk_spin_lock(&module->internal.spinlock); 2094 assert(module->internal.action_in_progress > 0); 2095 module->internal.action_in_progress--; 2096 spdk_spin_unlock(&module->internal.spinlock); 2097 bdev_init_complete(-1); 2098 } 2099 2100 static int 2101 bdev_modules_init(void) 2102 { 2103 struct spdk_bdev_module *module; 2104 int rc = 0; 2105 2106 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2107 g_resume_bdev_module = module; 2108 if (module->async_init) { 2109 spdk_spin_lock(&module->internal.spinlock); 2110 module->internal.action_in_progress = 1; 2111 spdk_spin_unlock(&module->internal.spinlock); 2112 } 2113 rc = module->module_init(); 2114 if (rc != 0) { 2115 /* Bump action_in_progress to prevent other modules from completion of modules_init 2116 * Send message to defer application shutdown until resources are cleaned up */ 2117 spdk_spin_lock(&module->internal.spinlock); 2118 module->internal.action_in_progress = 1; 2119 spdk_spin_unlock(&module->internal.spinlock); 2120 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2121 return rc; 2122 } 2123 } 2124 2125 g_resume_bdev_module = NULL; 2126 return 0; 2127 } 2128 2129 void 2130 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2131 { 2132 int rc = 0; 2133 char mempool_name[32]; 2134 2135 assert(cb_fn != NULL); 2136 2137 g_init_cb_fn = cb_fn; 2138 g_init_cb_arg = cb_arg; 2139 2140 spdk_notify_type_register("bdev_register"); 2141 spdk_notify_type_register("bdev_unregister"); 2142 2143 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2144 2145 rc = spdk_iobuf_register_module("bdev"); 2146 if (rc != 0) { 2147 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2148 bdev_init_complete(-1); 2149 return; 2150 } 2151 2152 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2153 g_bdev_opts.bdev_io_pool_size, 2154 sizeof(struct spdk_bdev_io) + 2155 bdev_module_get_max_ctx_size(), 2156 0, 2157 SPDK_ENV_SOCKET_ID_ANY); 2158 2159 if (g_bdev_mgr.bdev_io_pool == NULL) { 2160 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2161 bdev_init_complete(-1); 2162 return; 2163 } 2164 2165 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2166 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2167 if (!g_bdev_mgr.zero_buffer) { 2168 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2169 bdev_init_complete(-1); 2170 return; 2171 } 2172 2173 #ifdef SPDK_CONFIG_VTUNE 2174 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2175 #endif 2176 2177 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2178 
bdev_mgmt_channel_destroy, 2179 sizeof(struct spdk_bdev_mgmt_channel), 2180 "bdev_mgr"); 2181 2182 rc = bdev_modules_init(); 2183 g_bdev_mgr.module_init_complete = true; 2184 if (rc != 0) { 2185 SPDK_ERRLOG("bdev modules init failed\n"); 2186 return; 2187 } 2188 2189 bdev_module_action_complete(); 2190 } 2191 2192 static void 2193 bdev_mgr_unregister_cb(void *io_device) 2194 { 2195 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2196 2197 if (g_bdev_mgr.bdev_io_pool) { 2198 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2199 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2200 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2201 g_bdev_opts.bdev_io_pool_size); 2202 } 2203 2204 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2205 } 2206 2207 spdk_free(g_bdev_mgr.zero_buffer); 2208 2209 bdev_examine_allowlist_free(); 2210 2211 cb_fn(g_fini_cb_arg); 2212 g_fini_cb_fn = NULL; 2213 g_fini_cb_arg = NULL; 2214 g_bdev_mgr.init_complete = false; 2215 g_bdev_mgr.module_init_complete = false; 2216 } 2217 2218 static void 2219 bdev_module_fini_iter(void *arg) 2220 { 2221 struct spdk_bdev_module *bdev_module; 2222 2223 /* FIXME: Handling initialization failures is broken now, 2224 * so we won't even try cleaning up after successfully 2225 * initialized modules. if module_init_complete is false, 2226 * just call spdk_bdev_mgr_unregister_cb 2227 */ 2228 if (!g_bdev_mgr.module_init_complete) { 2229 bdev_mgr_unregister_cb(NULL); 2230 return; 2231 } 2232 2233 /* Start iterating from the last touched module */ 2234 if (!g_resume_bdev_module) { 2235 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2236 } else { 2237 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2238 internal.tailq); 2239 } 2240 2241 while (bdev_module) { 2242 if (bdev_module->async_fini) { 2243 /* Save our place so we can resume later. We must 2244 * save the variable here, before calling module_fini() 2245 * below, because in some cases the module may immediately 2246 * call spdk_bdev_module_fini_done() and re-enter 2247 * this function to continue iterating. */ 2248 g_resume_bdev_module = bdev_module; 2249 } 2250 2251 if (bdev_module->module_fini) { 2252 bdev_module->module_fini(); 2253 } 2254 2255 if (bdev_module->async_fini) { 2256 return; 2257 } 2258 2259 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2260 internal.tailq); 2261 } 2262 2263 g_resume_bdev_module = NULL; 2264 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2265 } 2266 2267 void 2268 spdk_bdev_module_fini_done(void) 2269 { 2270 if (spdk_get_thread() != g_fini_thread) { 2271 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2272 } else { 2273 bdev_module_fini_iter(NULL); 2274 } 2275 } 2276 2277 static void 2278 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2279 { 2280 struct spdk_bdev *bdev = cb_arg; 2281 2282 if (bdeverrno && bdev) { 2283 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2284 bdev->name); 2285 2286 /* 2287 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2288 * bdev; try to continue by manually removing this bdev from the list and continue 2289 * with the next bdev in the list. 
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
		/*
		 * Bdev module finish needs to be deferred, as we might be in the middle of some
		 * context (like bdev part free) that will use this bdev (or private bdev driver
		 * ctx data) after returning.
		 */
		spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
		return;
	}

	/*
	 * Unregister the last unclaimed bdev in the list, to ensure that bdev subsystem
	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
	 * base bdevs.
	 *
	 * Also, walk the list in the reverse order.
	 */
	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
		spdk_spin_lock(&bdev->internal.spinlock);
		if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
			LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev);
			spdk_spin_unlock(&bdev->internal.spinlock);
			continue;
		}
		spdk_spin_unlock(&bdev->internal.spinlock);

		SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
		return;
	}

	/*
	 * If any bdev fails to unclaim its underlying bdev properly, we may face the
	 * case of a bdev list consisting of claimed bdevs only (if claims are managed
	 * correctly, this would mean there's a loop in the claims graph, which is
	 * clearly impossible). In that case, warn and unregister the last bdev on the list.
	 */
	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
		SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
		return;
	}
}

static void
bdev_module_fini_start_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
	} else {
		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini_start) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling fini_start()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_fini_start_done() and re-enter
			 * this function to continue iterating.
*/ 2360 g_resume_bdev_module = bdev_module; 2361 } 2362 2363 if (bdev_module->fini_start) { 2364 bdev_module->fini_start(); 2365 } 2366 2367 if (bdev_module->async_fini_start) { 2368 return; 2369 } 2370 2371 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2372 } 2373 2374 g_resume_bdev_module = NULL; 2375 2376 bdev_finish_unregister_bdevs_iter(NULL, 0); 2377 } 2378 2379 void 2380 spdk_bdev_module_fini_start_done(void) 2381 { 2382 if (spdk_get_thread() != g_fini_thread) { 2383 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2384 } else { 2385 bdev_module_fini_start_iter(NULL); 2386 } 2387 } 2388 2389 static void 2390 bdev_finish_wait_for_examine_done(void *cb_arg) 2391 { 2392 bdev_module_fini_start_iter(NULL); 2393 } 2394 2395 static void bdev_open_async_fini(void); 2396 2397 void 2398 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2399 { 2400 int rc; 2401 2402 assert(cb_fn != NULL); 2403 2404 g_fini_thread = spdk_get_thread(); 2405 2406 g_fini_cb_fn = cb_fn; 2407 g_fini_cb_arg = cb_arg; 2408 2409 bdev_open_async_fini(); 2410 2411 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2412 if (rc != 0) { 2413 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2414 bdev_finish_wait_for_examine_done(NULL); 2415 } 2416 } 2417 2418 struct spdk_bdev_io * 2419 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2420 { 2421 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2422 struct spdk_bdev_io *bdev_io; 2423 2424 if (ch->per_thread_cache_count > 0) { 2425 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2426 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2427 ch->per_thread_cache_count--; 2428 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2429 /* 2430 * Don't try to look for bdev_ios in the global pool if there are 2431 * waiters on bdev_ios - we don't want this caller to jump the line. 2432 */ 2433 bdev_io = NULL; 2434 } else { 2435 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2436 } 2437 2438 return bdev_io; 2439 } 2440 2441 void 2442 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2443 { 2444 struct spdk_bdev_mgmt_channel *ch; 2445 2446 assert(bdev_io != NULL); 2447 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2448 2449 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2450 2451 if (bdev_io->internal.buf != NULL) { 2452 bdev_io_put_buf(bdev_io); 2453 } 2454 2455 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2456 ch->per_thread_cache_count++; 2457 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2458 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2459 struct spdk_bdev_io_wait_entry *entry; 2460 2461 entry = TAILQ_FIRST(&ch->io_wait_queue); 2462 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2463 entry->cb_fn(entry->cb_arg); 2464 } 2465 } else { 2466 /* We should never have a full cache with entries on the io wait queue. 
*/ 2467 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2468 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2469 } 2470 } 2471 2472 static bool 2473 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2474 { 2475 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2476 2477 switch (limit) { 2478 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2479 return true; 2480 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2481 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2482 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2483 return false; 2484 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2485 default: 2486 return false; 2487 } 2488 } 2489 2490 static bool 2491 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2492 { 2493 switch (bdev_io->type) { 2494 case SPDK_BDEV_IO_TYPE_NVME_IO: 2495 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2496 case SPDK_BDEV_IO_TYPE_READ: 2497 case SPDK_BDEV_IO_TYPE_WRITE: 2498 return true; 2499 case SPDK_BDEV_IO_TYPE_ZCOPY: 2500 if (bdev_io->u.bdev.zcopy.start) { 2501 return true; 2502 } else { 2503 return false; 2504 } 2505 default: 2506 return false; 2507 } 2508 } 2509 2510 static bool 2511 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2512 { 2513 switch (bdev_io->type) { 2514 case SPDK_BDEV_IO_TYPE_NVME_IO: 2515 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2516 /* Bit 1 (0x2) set for read operation */ 2517 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2518 return true; 2519 } else { 2520 return false; 2521 } 2522 case SPDK_BDEV_IO_TYPE_READ: 2523 return true; 2524 case SPDK_BDEV_IO_TYPE_ZCOPY: 2525 /* Populate to read from disk */ 2526 if (bdev_io->u.bdev.zcopy.populate) { 2527 return true; 2528 } else { 2529 return false; 2530 } 2531 default: 2532 return false; 2533 } 2534 } 2535 2536 static uint64_t 2537 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2538 { 2539 struct spdk_bdev *bdev = bdev_io->bdev; 2540 2541 switch (bdev_io->type) { 2542 case SPDK_BDEV_IO_TYPE_NVME_IO: 2543 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2544 return bdev_io->u.nvme_passthru.nbytes; 2545 case SPDK_BDEV_IO_TYPE_READ: 2546 case SPDK_BDEV_IO_TYPE_WRITE: 2547 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2548 case SPDK_BDEV_IO_TYPE_ZCOPY: 2549 /* Track the data in the start phase only */ 2550 if (bdev_io->u.bdev.zcopy.start) { 2551 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2552 } else { 2553 return 0; 2554 } 2555 default: 2556 return 0; 2557 } 2558 } 2559 2560 static bool 2561 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2562 { 2563 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2564 return true; 2565 } else { 2566 return false; 2567 } 2568 } 2569 2570 static bool 2571 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2572 { 2573 if (bdev_is_read_io(io) == false) { 2574 return false; 2575 } 2576 2577 return bdev_qos_rw_queue_io(limit, io); 2578 } 2579 2580 static bool 2581 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2582 { 2583 if (bdev_is_read_io(io) == true) { 2584 return false; 2585 } 2586 2587 return bdev_qos_rw_queue_io(limit, io); 2588 } 2589 2590 static void 2591 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2592 { 2593 limit->remaining_this_timeslice--; 2594 } 2595 2596 static void 2597 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2598 { 2599 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2600 } 2601 2602 static void 2603 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2604 { 2605 if (bdev_is_read_io(io) == false) { 2606 return; 2607 } 2608 2609 return bdev_qos_rw_bps_update_quota(limit, io); 2610 } 2611 2612 static void 2613 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2614 { 2615 if (bdev_is_read_io(io) == true) { 2616 return; 2617 } 2618 2619 return bdev_qos_rw_bps_update_quota(limit, io); 2620 } 2621 2622 static void 2623 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2624 { 2625 int i; 2626 2627 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2628 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2629 qos->rate_limits[i].queue_io = NULL; 2630 qos->rate_limits[i].update_quota = NULL; 2631 continue; 2632 } 2633 2634 switch (i) { 2635 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2636 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2637 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2638 break; 2639 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2640 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2641 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2642 break; 2643 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2644 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2645 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2646 break; 2647 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2648 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2649 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2650 break; 2651 default: 2652 break; 2653 } 2654 } 2655 } 2656 2657 static void 2658 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2659 struct spdk_bdev_io *bdev_io, 2660 enum spdk_bdev_io_status status) 2661 { 2662 bdev_io->internal.in_submit_request = true; 2663 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2664 spdk_bdev_io_complete(bdev_io, status); 2665 bdev_io->internal.in_submit_request = false; 2666 } 2667 2668 static inline void 2669 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2670 { 2671 struct spdk_bdev *bdev = bdev_io->bdev; 2672 struct spdk_io_channel *ch = bdev_ch->channel; 2673 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2674 2675 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2676 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2677 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2678 2679 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2680 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2681 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2682 SPDK_BDEV_IO_STATUS_SUCCESS); 2683 return; 2684 } 2685 } 2686 2687 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2688 bdev_io->bdev->split_on_write_unit && 2689 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2690 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2691 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2692 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2693 return; 2694 } 2695 2696 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2697 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2698 bdev_io->internal.in_submit_request = true; 2699 bdev_submit_request(bdev, ch, bdev_io); 2700 bdev_io->internal.in_submit_request = false; 2701 } else { 2702 bdev_queue_nomem_io_tail(shared_resource, bdev_io, 
BDEV_IO_RETRY_STATE_SUBMIT); 2703 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2704 /* Special case when we have nomem IOs and no outstanding IOs which completions 2705 * could trigger retry of queued IOs */ 2706 bdev_shared_ch_retry_io(shared_resource); 2707 } 2708 } 2709 } 2710 2711 static bool 2712 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2713 { 2714 int i; 2715 2716 if (bdev_qos_io_to_limit(bdev_io) == true) { 2717 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2718 if (!qos->rate_limits[i].queue_io) { 2719 continue; 2720 } 2721 2722 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2723 bdev_io) == true) { 2724 return true; 2725 } 2726 } 2727 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2728 if (!qos->rate_limits[i].update_quota) { 2729 continue; 2730 } 2731 2732 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2733 } 2734 } 2735 2736 return false; 2737 } 2738 2739 static inline void 2740 _bdev_io_do_submit(void *ctx) 2741 { 2742 struct spdk_bdev_io *bdev_io = ctx; 2743 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2744 2745 bdev_io_do_submit(ch, bdev_io); 2746 } 2747 2748 static int 2749 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2750 { 2751 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2752 int submitted_ios = 0; 2753 2754 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2755 if (!bdev_qos_queue_io(qos, bdev_io)) { 2756 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2757 2758 if (bdev_io->internal.io_submit_ch) { 2759 /* Send back the IO to the original thread for the actual processing. */ 2760 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2761 bdev_io->internal.io_submit_ch = NULL; 2762 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2763 _bdev_io_do_submit, bdev_io); 2764 } else { 2765 bdev_io_do_submit(ch, bdev_io); 2766 } 2767 2768 submitted_ios++; 2769 } 2770 } 2771 2772 return submitted_ios; 2773 } 2774 2775 static void 2776 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2777 { 2778 int rc; 2779 2780 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2781 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2782 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2783 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2784 &bdev_io->internal.waitq_entry); 2785 if (rc != 0) { 2786 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2787 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2788 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2789 } 2790 } 2791 2792 static bool 2793 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2794 { 2795 uint32_t io_boundary; 2796 struct spdk_bdev *bdev = bdev_io->bdev; 2797 uint32_t max_size = bdev->max_segment_size; 2798 int max_segs = bdev->max_num_segments; 2799 2800 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2801 io_boundary = bdev->write_unit_size; 2802 } else if (bdev->split_on_optimal_io_boundary) { 2803 io_boundary = bdev->optimal_io_boundary; 2804 } else { 2805 io_boundary = 0; 2806 } 2807 2808 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2809 return false; 2810 } 2811 2812 if (io_boundary) { 2813 uint64_t start_stripe, end_stripe; 2814 2815 start_stripe = bdev_io->u.bdev.offset_blocks; 2816 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2817 /* Avoid expensive div operations if possible. 
These spdk_u32 functions are very cheap. */ 2818 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2819 start_stripe >>= spdk_u32log2(io_boundary); 2820 end_stripe >>= spdk_u32log2(io_boundary); 2821 } else { 2822 start_stripe /= io_boundary; 2823 end_stripe /= io_boundary; 2824 } 2825 2826 if (start_stripe != end_stripe) { 2827 return true; 2828 } 2829 } 2830 2831 if (max_segs) { 2832 if (bdev_io->u.bdev.iovcnt > max_segs) { 2833 return true; 2834 } 2835 } 2836 2837 if (max_size) { 2838 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2839 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2840 return true; 2841 } 2842 } 2843 } 2844 2845 return false; 2846 } 2847 2848 static bool 2849 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2850 { 2851 uint32_t num_unmap_segments; 2852 2853 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2854 return false; 2855 } 2856 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2857 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2858 return true; 2859 } 2860 2861 return false; 2862 } 2863 2864 static bool 2865 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2866 { 2867 if (!bdev_io->bdev->max_write_zeroes) { 2868 return false; 2869 } 2870 2871 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2872 return true; 2873 } 2874 2875 return false; 2876 } 2877 2878 static bool 2879 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2880 { 2881 if (bdev_io->bdev->max_copy != 0 && 2882 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2883 return true; 2884 } 2885 2886 return false; 2887 } 2888 2889 static bool 2890 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2891 { 2892 switch (bdev_io->type) { 2893 case SPDK_BDEV_IO_TYPE_READ: 2894 case SPDK_BDEV_IO_TYPE_WRITE: 2895 return bdev_rw_should_split(bdev_io); 2896 case SPDK_BDEV_IO_TYPE_UNMAP: 2897 return bdev_unmap_should_split(bdev_io); 2898 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2899 return bdev_write_zeroes_should_split(bdev_io); 2900 case SPDK_BDEV_IO_TYPE_COPY: 2901 return bdev_copy_should_split(bdev_io); 2902 default: 2903 return false; 2904 } 2905 } 2906 2907 static uint32_t 2908 _to_next_boundary(uint64_t offset, uint32_t boundary) 2909 { 2910 return (boundary - (offset % boundary)); 2911 } 2912 2913 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2914 2915 static void _bdev_rw_split(void *_bdev_io); 2916 2917 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2918 2919 static void 2920 _bdev_unmap_split(void *_bdev_io) 2921 { 2922 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2923 } 2924 2925 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2926 2927 static void 2928 _bdev_write_zeroes_split(void *_bdev_io) 2929 { 2930 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2931 } 2932 2933 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2934 2935 static void 2936 _bdev_copy_split(void *_bdev_io) 2937 { 2938 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2939 } 2940 2941 static int 2942 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2943 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2944 { 2945 int rc; 2946 uint64_t current_offset, current_remaining, current_src_offset; 2947 spdk_bdev_io_wait_cb io_wait_fn; 2948 2949 current_offset = *offset; 2950 current_remaining = *remaining; 2951 2952 
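	/* Added commentary (not in the original source): split_outstanding counts the child
	 * I/Os currently in flight for this parent bdev_io. It is incremented before each
	 * child submission below and decremented in bdev_io_split_done(); the parent I/O
	 * completes only once the count drops to zero and no blocks remain to be split. */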
bdev_io->u.bdev.split_outstanding++; 2953 2954 io_wait_fn = _bdev_rw_split; 2955 switch (bdev_io->type) { 2956 case SPDK_BDEV_IO_TYPE_READ: 2957 assert(bdev_io->u.bdev.accel_sequence == NULL); 2958 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2959 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2960 iov, iovcnt, md_buf, current_offset, 2961 num_blocks, bdev_io->internal.memory_domain, 2962 bdev_io->internal.memory_domain_ctx, NULL, 2963 bdev_io_split_done, bdev_io); 2964 break; 2965 case SPDK_BDEV_IO_TYPE_WRITE: 2966 assert(bdev_io->u.bdev.accel_sequence == NULL); 2967 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2968 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2969 iov, iovcnt, md_buf, current_offset, 2970 num_blocks, bdev_io->internal.memory_domain, 2971 bdev_io->internal.memory_domain_ctx, NULL, 2972 bdev_io_split_done, bdev_io); 2973 break; 2974 case SPDK_BDEV_IO_TYPE_UNMAP: 2975 io_wait_fn = _bdev_unmap_split; 2976 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2977 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2978 current_offset, num_blocks, 2979 bdev_io_split_done, bdev_io); 2980 break; 2981 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2982 io_wait_fn = _bdev_write_zeroes_split; 2983 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2984 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2985 current_offset, num_blocks, 2986 bdev_io_split_done, bdev_io); 2987 break; 2988 case SPDK_BDEV_IO_TYPE_COPY: 2989 io_wait_fn = _bdev_copy_split; 2990 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2991 (current_offset - bdev_io->u.bdev.offset_blocks); 2992 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2993 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2994 current_offset, current_src_offset, num_blocks, 2995 bdev_io_split_done, bdev_io); 2996 break; 2997 default: 2998 assert(false); 2999 rc = -EINVAL; 3000 break; 3001 } 3002 3003 if (rc == 0) { 3004 current_offset += num_blocks; 3005 current_remaining -= num_blocks; 3006 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 3007 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 3008 *offset = current_offset; 3009 *remaining = current_remaining; 3010 } else { 3011 bdev_io->u.bdev.split_outstanding--; 3012 if (rc == -ENOMEM) { 3013 if (bdev_io->u.bdev.split_outstanding == 0) { 3014 /* No I/O is outstanding. Hence we should wait here. 
*/ 3015 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3016 } 3017 } else { 3018 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3019 if (bdev_io->u.bdev.split_outstanding == 0) { 3020 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 3021 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 3022 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3023 } 3024 } 3025 } 3026 3027 return rc; 3028 } 3029 3030 static void 3031 _bdev_rw_split(void *_bdev_io) 3032 { 3033 struct iovec *parent_iov, *iov; 3034 struct spdk_bdev_io *bdev_io = _bdev_io; 3035 struct spdk_bdev *bdev = bdev_io->bdev; 3036 uint64_t parent_offset, current_offset, remaining; 3037 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3038 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3039 uint32_t iovcnt, iov_len, child_iovsize; 3040 uint32_t blocklen = bdev->blocklen; 3041 uint32_t io_boundary; 3042 uint32_t max_segment_size = bdev->max_segment_size; 3043 uint32_t max_child_iovcnt = bdev->max_num_segments; 3044 void *md_buf = NULL; 3045 int rc; 3046 3047 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3048 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3049 SPDK_BDEV_IO_NUM_CHILD_IOV; 3050 3051 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3052 io_boundary = bdev->write_unit_size; 3053 } else if (bdev->split_on_optimal_io_boundary) { 3054 io_boundary = bdev->optimal_io_boundary; 3055 } else { 3056 io_boundary = UINT32_MAX; 3057 } 3058 3059 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3060 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 3061 parent_offset = bdev_io->u.bdev.offset_blocks; 3062 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3063 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3064 3065 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3066 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3067 if (parent_iov_offset < parent_iov->iov_len) { 3068 break; 3069 } 3070 parent_iov_offset -= parent_iov->iov_len; 3071 } 3072 3073 child_iovcnt = 0; 3074 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3075 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3076 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3077 to_next_boundary = spdk_min(remaining, to_next_boundary); 3078 to_next_boundary_bytes = to_next_boundary * blocklen; 3079 3080 iov = &bdev_io->child_iov[child_iovcnt]; 3081 iovcnt = 0; 3082 3083 if (bdev_io->u.bdev.md_buf) { 3084 md_buf = (char *)bdev_io->u.bdev.md_buf + 3085 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3086 } 3087 3088 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3089 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3090 iovcnt < child_iovsize) { 3091 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3092 iov_len = parent_iov->iov_len - parent_iov_offset; 3093 3094 iov_len = spdk_min(iov_len, max_segment_size); 3095 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3096 to_next_boundary_bytes -= iov_len; 3097 3098 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3099 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3100 3101 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3102 parent_iov_offset += iov_len; 3103 } else { 3104 parent_iovpos++; 3105 
				parent_iov_offset = 0;
			}
			child_iovcnt++;
			iovcnt++;
		}

		if (to_next_boundary_bytes > 0) {
			/* We had to stop this child I/O early because we ran out of
			 * child_iov space or were limited by max_num_segments.
			 * Ensure the iovs are aligned with the block size and
			 * then adjust to_next_boundary before starting the
			 * child I/O.
			 */
			assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
			       iovcnt == child_iovsize);
			to_last_block_bytes = to_next_boundary_bytes % blocklen;
			if (to_last_block_bytes != 0) {
				uint32_t child_iovpos = child_iovcnt - 1;
				/* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV,
				 * so the loop will naturally end
				 */

				to_last_block_bytes = blocklen - to_last_block_bytes;
				to_next_boundary_bytes += to_last_block_bytes;
				while (to_last_block_bytes > 0 && iovcnt > 0) {
					iov_len = spdk_min(to_last_block_bytes,
							   bdev_io->child_iov[child_iovpos].iov_len);
					bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
					if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
						child_iovpos--;
						if (--iovcnt == 0) {
							/* If the child IO is less than a block size, just return.
							 * If the first child IO of any split round is less than
							 * a block size, exit with an error.
							 */
							if (bdev_io->u.bdev.split_outstanding == 0) {
								SPDK_ERRLOG("The first child io was less than a block size\n");
								bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
								spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
								TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
								bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
							}

							return;
						}
					}

					to_last_block_bytes -= iov_len;

					if (parent_iov_offset == 0) {
						parent_iovpos--;
						parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
					}
					parent_iov_offset -= iov_len;
				}

				assert(to_last_block_bytes == 0);
			}
			to_next_boundary -= to_next_boundary_bytes / blocklen;
		}

		rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
					  &current_offset, &remaining);
		if (spdk_unlikely(rc)) {
			return;
		}
	}
}

static void
bdev_unmap_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
	uint32_t num_children_reqs = 0;
	int rc;

	offset = bdev_io->u.bdev.split_current_offset_blocks;
	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
	max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;

	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
		unmap_blocks = spdk_min(remaining, max_unmap_blocks);

		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
					  &offset, &remaining);
		if (spdk_likely(rc == 0)) {
			num_children_reqs++;
		} else {
			return;
		}
	}
}

static void
bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset, write_zeroes_blocks, remaining;
	uint32_t num_children_reqs = 0;
	int rc;

	offset = bdev_io->u.bdev.split_current_offset_blocks;
	remaining = bdev_io->u.bdev.split_remaining_num_blocks;

	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
		write_zeroes_blocks =
spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3210 3211 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3212 &offset, &remaining); 3213 if (spdk_likely(rc == 0)) { 3214 num_children_reqs++; 3215 } else { 3216 return; 3217 } 3218 } 3219 } 3220 3221 static void 3222 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3223 { 3224 uint64_t offset, copy_blocks, remaining; 3225 uint32_t num_children_reqs = 0; 3226 int rc; 3227 3228 offset = bdev_io->u.bdev.split_current_offset_blocks; 3229 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3230 3231 assert(bdev_io->bdev->max_copy != 0); 3232 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3233 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3234 3235 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3236 &offset, &remaining); 3237 if (spdk_likely(rc == 0)) { 3238 num_children_reqs++; 3239 } else { 3240 return; 3241 } 3242 } 3243 } 3244 3245 static void 3246 parent_bdev_io_complete(void *ctx, int rc) 3247 { 3248 struct spdk_bdev_io *parent_io = ctx; 3249 3250 if (rc) { 3251 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3252 } 3253 3254 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3255 parent_io->internal.caller_ctx); 3256 } 3257 3258 static void 3259 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3260 { 3261 struct spdk_bdev_io *bdev_io = ctx; 3262 3263 /* u.bdev.accel_sequence should have already been cleared at this point */ 3264 assert(bdev_io->u.bdev.accel_sequence == NULL); 3265 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3266 bdev_io->internal.accel_sequence = NULL; 3267 3268 if (spdk_unlikely(status != 0)) { 3269 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3270 } 3271 3272 parent_bdev_io_complete(bdev_io, status); 3273 } 3274 3275 static void 3276 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3277 { 3278 struct spdk_bdev_io *parent_io = cb_arg; 3279 3280 spdk_bdev_free_io(bdev_io); 3281 3282 if (!success) { 3283 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3284 /* If any child I/O failed, stop further splitting process. */ 3285 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3286 parent_io->u.bdev.split_remaining_num_blocks = 0; 3287 } 3288 parent_io->u.bdev.split_outstanding--; 3289 if (parent_io->u.bdev.split_outstanding != 0) { 3290 return; 3291 } 3292 3293 /* 3294 * Parent I/O finishes when all blocks are consumed. 
3295 */ 3296 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3297 assert(parent_io->internal.cb != bdev_io_split_done); 3298 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3299 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3300 3301 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3302 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3303 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3304 return; 3305 } else if (parent_io->internal.orig_iovcnt != 0 && 3306 !bdev_io_use_accel_sequence(bdev_io)) { 3307 /* bdev IO will be completed in the callback */ 3308 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3309 return; 3310 } 3311 } 3312 3313 parent_bdev_io_complete(parent_io, 0); 3314 return; 3315 } 3316 3317 /* 3318 * Continue with the splitting process. This function will complete the parent I/O if the 3319 * splitting is done. 3320 */ 3321 switch (parent_io->type) { 3322 case SPDK_BDEV_IO_TYPE_READ: 3323 case SPDK_BDEV_IO_TYPE_WRITE: 3324 _bdev_rw_split(parent_io); 3325 break; 3326 case SPDK_BDEV_IO_TYPE_UNMAP: 3327 bdev_unmap_split(parent_io); 3328 break; 3329 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3330 bdev_write_zeroes_split(parent_io); 3331 break; 3332 case SPDK_BDEV_IO_TYPE_COPY: 3333 bdev_copy_split(parent_io); 3334 break; 3335 default: 3336 assert(false); 3337 break; 3338 } 3339 } 3340 3341 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3342 bool success); 3343 3344 static void 3345 bdev_io_split(struct spdk_bdev_io *bdev_io) 3346 { 3347 assert(bdev_io_should_split(bdev_io)); 3348 3349 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3350 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3351 bdev_io->u.bdev.split_outstanding = 0; 3352 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3353 3354 switch (bdev_io->type) { 3355 case SPDK_BDEV_IO_TYPE_READ: 3356 case SPDK_BDEV_IO_TYPE_WRITE: 3357 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3358 _bdev_rw_split(bdev_io); 3359 } else { 3360 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3361 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3362 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3363 } 3364 break; 3365 case SPDK_BDEV_IO_TYPE_UNMAP: 3366 bdev_unmap_split(bdev_io); 3367 break; 3368 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3369 bdev_write_zeroes_split(bdev_io); 3370 break; 3371 case SPDK_BDEV_IO_TYPE_COPY: 3372 bdev_copy_split(bdev_io); 3373 break; 3374 default: 3375 assert(false); 3376 break; 3377 } 3378 } 3379 3380 static void 3381 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3382 { 3383 if (!success) { 3384 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3385 return; 3386 } 3387 3388 _bdev_rw_split(bdev_io); 3389 } 3390 3391 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3392 * be inlined, at least on some compilers. 
3393 */ 3394 static inline void 3395 _bdev_io_submit(void *ctx) 3396 { 3397 struct spdk_bdev_io *bdev_io = ctx; 3398 struct spdk_bdev *bdev = bdev_io->bdev; 3399 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3400 3401 if (spdk_likely(bdev_ch->flags == 0)) { 3402 bdev_io_do_submit(bdev_ch, bdev_io); 3403 return; 3404 } 3405 3406 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3407 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3408 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3409 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3410 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 3411 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3412 } else { 3413 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 3414 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3415 } 3416 } else { 3417 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3418 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3419 } 3420 } 3421 3422 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3423 3424 bool 3425 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3426 { 3427 if (range1->length == 0 || range2->length == 0) { 3428 return false; 3429 } 3430 3431 if (range1->offset + range1->length <= range2->offset) { 3432 return false; 3433 } 3434 3435 if (range2->offset + range2->length <= range1->offset) { 3436 return false; 3437 } 3438 3439 return true; 3440 } 3441 3442 static bool 3443 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3444 { 3445 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3446 struct lba_range r; 3447 3448 switch (bdev_io->type) { 3449 case SPDK_BDEV_IO_TYPE_NVME_IO: 3450 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3451 /* Don't try to decode the NVMe command - just assume worst-case and that 3452 * it overlaps a locked range. 3453 */ 3454 return true; 3455 case SPDK_BDEV_IO_TYPE_WRITE: 3456 case SPDK_BDEV_IO_TYPE_UNMAP: 3457 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3458 case SPDK_BDEV_IO_TYPE_ZCOPY: 3459 case SPDK_BDEV_IO_TYPE_COPY: 3460 r.offset = bdev_io->u.bdev.offset_blocks; 3461 r.length = bdev_io->u.bdev.num_blocks; 3462 if (!bdev_lba_range_overlapped(range, &r)) { 3463 /* This I/O doesn't overlap the specified LBA range. */ 3464 return false; 3465 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3466 /* This I/O overlaps, but the I/O is on the same channel that locked this 3467 * range, and the caller_ctx is the same as the locked_ctx. This means 3468 * that this I/O is associated with the lock, and is allowed to execute. 
3469 */ 3470 return false; 3471 } else { 3472 return true; 3473 } 3474 default: 3475 return false; 3476 } 3477 } 3478 3479 void 3480 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3481 { 3482 struct spdk_bdev *bdev = bdev_io->bdev; 3483 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3484 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3485 3486 assert(thread != NULL); 3487 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3488 3489 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3490 struct lba_range *range; 3491 3492 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3493 if (bdev_io_range_is_locked(bdev_io, range)) { 3494 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3495 return; 3496 } 3497 } 3498 } 3499 3500 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3501 3502 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3503 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3504 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3505 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3506 spdk_bdev_get_name(bdev)); 3507 3508 if (bdev_io->internal.split) { 3509 bdev_io_split(bdev_io); 3510 return; 3511 } 3512 3513 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3514 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3515 _bdev_io_submit(bdev_io); 3516 } else { 3517 bdev_io->internal.io_submit_ch = ch; 3518 bdev_io->internal.ch = bdev->internal.qos->ch; 3519 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3520 } 3521 } else { 3522 _bdev_io_submit(bdev_io); 3523 } 3524 } 3525 3526 static inline void 3527 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3528 { 3529 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3530 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3531 * For write operation we need to pull buffers from memory domain before submitting IO. 3532 * Once read operation completes, we need to use memory_domain push functionality to 3533 * update data in original memory domain IO buffer 3534 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3535 bdev_io->u.bdev.memory_domain = NULL; 3536 bdev_io->u.bdev.memory_domain_ctx = NULL; 3537 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3538 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3539 } 3540 3541 static inline void 3542 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3543 { 3544 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3545 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3546 3547 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3548 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3549 bdev_io_complete_unsubmitted(bdev_io); 3550 return; 3551 } 3552 3553 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3554 * support them, but we need to execute an accel sequence and the data buffer is from accel 3555 * memory domain (to avoid doing a push/pull from that domain). 
3556 */ 3557 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3558 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3559 _bdev_io_ext_use_bounce_buffer(bdev_io); 3560 return; 3561 } 3562 3563 if (needs_exec) { 3564 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3565 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3566 return; 3567 } 3568 /* For reads we'll execute the sequence after the data is read, so, for now, only 3569 * clear out accel_sequence pointer and submit the IO */ 3570 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3571 bdev_io->u.bdev.accel_sequence = NULL; 3572 } 3573 3574 bdev_io_submit(bdev_io); 3575 } 3576 3577 static void 3578 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3579 { 3580 struct spdk_bdev *bdev = bdev_io->bdev; 3581 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3582 struct spdk_io_channel *ch = bdev_ch->channel; 3583 3584 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3585 3586 bdev_io->internal.in_submit_request = true; 3587 bdev_submit_request(bdev, ch, bdev_io); 3588 bdev_io->internal.in_submit_request = false; 3589 } 3590 3591 void 3592 bdev_io_init(struct spdk_bdev_io *bdev_io, 3593 struct spdk_bdev *bdev, void *cb_arg, 3594 spdk_bdev_io_completion_cb cb) 3595 { 3596 bdev_io->bdev = bdev; 3597 bdev_io->internal.caller_ctx = cb_arg; 3598 bdev_io->internal.cb = cb; 3599 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3600 bdev_io->internal.in_submit_request = false; 3601 bdev_io->internal.buf = NULL; 3602 bdev_io->internal.io_submit_ch = NULL; 3603 bdev_io->internal.orig_iovs = NULL; 3604 bdev_io->internal.orig_iovcnt = 0; 3605 bdev_io->internal.orig_md_iov.iov_base = NULL; 3606 bdev_io->internal.error.nvme.cdw0 = 0; 3607 bdev_io->num_retries = 0; 3608 bdev_io->internal.get_buf_cb = NULL; 3609 bdev_io->internal.get_aux_buf_cb = NULL; 3610 bdev_io->internal.memory_domain = NULL; 3611 bdev_io->internal.memory_domain_ctx = NULL; 3612 bdev_io->internal.data_transfer_cpl = NULL; 3613 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3614 bdev_io->internal.accel_sequence = NULL; 3615 bdev_io->internal.has_accel_sequence = false; 3616 } 3617 3618 static bool 3619 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3620 { 3621 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3622 } 3623 3624 bool 3625 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3626 { 3627 bool supported; 3628 3629 supported = bdev_io_type_supported(bdev, io_type); 3630 3631 if (!supported) { 3632 switch (io_type) { 3633 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3634 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ 3635 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3636 break; 3637 default: 3638 break; 3639 } 3640 } 3641 3642 return supported; 3643 } 3644 3645 uint64_t 3646 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3647 { 3648 return bdev_io->internal.submit_tsc; 3649 } 3650 3651 int 3652 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3653 { 3654 if (bdev->fn_table->dump_info_json) { 3655 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3656 } 3657 3658 return 0; 3659 } 3660 3661 static void 3662 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3663 { 3664 uint32_t max_per_timeslice = 0; 3665 int i; 3666 3667 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3668 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3669 qos->rate_limits[i].max_per_timeslice = 0; 3670 continue; 3671 } 3672 3673 max_per_timeslice = qos->rate_limits[i].limit * 3674 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3675 3676 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3677 qos->rate_limits[i].min_per_timeslice); 3678 3679 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3680 } 3681 3682 bdev_qos_set_ops(qos); 3683 } 3684 3685 static int 3686 bdev_channel_poll_qos(void *arg) 3687 { 3688 struct spdk_bdev_qos *qos = arg; 3689 uint64_t now = spdk_get_ticks(); 3690 int i; 3691 3692 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3693 /* We received our callback earlier than expected - return 3694 * immediately and wait to do accounting until at least one 3695 * timeslice has actually expired. This should never happen 3696 * with a well-behaved timer implementation. 3697 */ 3698 return SPDK_POLLER_IDLE; 3699 } 3700 3701 /* Reset for next round of rate limiting */ 3702 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3703 /* We may have allowed the IOs or bytes to slightly overrun in the last 3704 * timeslice. remaining_this_timeslice is signed, so if it's negative 3705 * here, we'll account for the overrun so that the next timeslice will 3706 * be appropriately reduced. 
3707 */ 3708 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3709 qos->rate_limits[i].remaining_this_timeslice = 0; 3710 } 3711 } 3712 3713 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3714 qos->last_timeslice += qos->timeslice_size; 3715 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3716 qos->rate_limits[i].remaining_this_timeslice += 3717 qos->rate_limits[i].max_per_timeslice; 3718 } 3719 } 3720 3721 return bdev_qos_io_submit(qos->ch, qos); 3722 } 3723 3724 static void 3725 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3726 { 3727 struct spdk_bdev_shared_resource *shared_resource; 3728 struct lba_range *range; 3729 3730 bdev_free_io_stat(ch->stat); 3731 #ifdef SPDK_CONFIG_VTUNE 3732 bdev_free_io_stat(ch->prev_stat); 3733 #endif 3734 3735 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3736 range = TAILQ_FIRST(&ch->locked_ranges); 3737 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3738 free(range); 3739 } 3740 3741 spdk_put_io_channel(ch->channel); 3742 spdk_put_io_channel(ch->accel_channel); 3743 3744 shared_resource = ch->shared_resource; 3745 3746 assert(TAILQ_EMPTY(&ch->io_locked)); 3747 assert(TAILQ_EMPTY(&ch->io_submitted)); 3748 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3749 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3750 assert(ch->io_outstanding == 0); 3751 assert(shared_resource->ref > 0); 3752 shared_resource->ref--; 3753 if (shared_resource->ref == 0) { 3754 assert(shared_resource->io_outstanding == 0); 3755 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3756 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3757 spdk_poller_unregister(&shared_resource->nomem_poller); 3758 free(shared_resource); 3759 } 3760 } 3761 3762 static void 3763 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3764 { 3765 struct spdk_bdev_qos *qos = bdev->internal.qos; 3766 int i; 3767 3768 assert(spdk_spin_held(&bdev->internal.spinlock)); 3769 3770 /* Rate limiting on this bdev enabled */ 3771 if (qos) { 3772 if (qos->ch == NULL) { 3773 struct spdk_io_channel *io_ch; 3774 3775 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3776 bdev->name, spdk_get_thread()); 3777 3778 /* No qos channel has been selected, so set one up */ 3779 3780 /* Take another reference to ch */ 3781 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3782 assert(io_ch != NULL); 3783 qos->ch = ch; 3784 3785 qos->thread = spdk_io_channel_get_thread(io_ch); 3786 3787 TAILQ_INIT(&qos->queued); 3788 3789 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3790 if (bdev_qos_is_iops_rate_limit(i) == true) { 3791 qos->rate_limits[i].min_per_timeslice = 3792 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3793 } else { 3794 qos->rate_limits[i].min_per_timeslice = 3795 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3796 } 3797 3798 if (qos->rate_limits[i].limit == 0) { 3799 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3800 } 3801 } 3802 bdev_qos_update_max_quota_per_timeslice(qos); 3803 qos->timeslice_size = 3804 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3805 qos->last_timeslice = spdk_get_ticks(); 3806 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3807 qos, 3808 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3809 } 3810 3811 ch->flags |= BDEV_CH_QOS_ENABLED; 3812 } 3813 } 3814 3815 struct poll_timeout_ctx { 3816 struct spdk_bdev_desc *desc; 3817 uint64_t timeout_in_sec; 3818 spdk_bdev_io_timeout_cb cb_fn; 3819 void *cb_arg; 3820 }; 3821 
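/*
 * Worked example for bdev_qos_update_max_quota_per_timeslice() above
 * (illustrative numbers only): with SPDK_BDEV_QOS_TIMESLICE_IN_USEC = 1000,
 * an IOPS limit of 10000 yields 10000 * 1000 / 1000000 = 10 I/Os per
 * timeslice, while a bandwidth limit of 100 MiB/s (104857600 bytes/s) yields
 * 104857 bytes per timeslice. Either result is raised to the per-type minimum
 * if it would otherwise be smaller.
 */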
3822 static void 3823 bdev_desc_free(struct spdk_bdev_desc *desc) 3824 { 3825 spdk_spin_destroy(&desc->spinlock); 3826 free(desc->media_events_buffer); 3827 free(desc); 3828 } 3829 3830 static void 3831 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3832 { 3833 struct poll_timeout_ctx *ctx = _ctx; 3834 struct spdk_bdev_desc *desc = ctx->desc; 3835 3836 free(ctx); 3837 3838 spdk_spin_lock(&desc->spinlock); 3839 desc->refs--; 3840 if (desc->closed == true && desc->refs == 0) { 3841 spdk_spin_unlock(&desc->spinlock); 3842 bdev_desc_free(desc); 3843 return; 3844 } 3845 spdk_spin_unlock(&desc->spinlock); 3846 } 3847 3848 static void 3849 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3850 struct spdk_io_channel *io_ch, void *_ctx) 3851 { 3852 struct poll_timeout_ctx *ctx = _ctx; 3853 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3854 struct spdk_bdev_desc *desc = ctx->desc; 3855 struct spdk_bdev_io *bdev_io; 3856 uint64_t now; 3857 3858 spdk_spin_lock(&desc->spinlock); 3859 if (desc->closed == true) { 3860 spdk_spin_unlock(&desc->spinlock); 3861 spdk_bdev_for_each_channel_continue(i, -1); 3862 return; 3863 } 3864 spdk_spin_unlock(&desc->spinlock); 3865 3866 now = spdk_get_ticks(); 3867 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3868 /* Exclude any I/O that are generated via splitting. */ 3869 if (bdev_io->internal.cb == bdev_io_split_done) { 3870 continue; 3871 } 3872 3873 /* Once we find an I/O that has not timed out, we can immediately 3874 * exit the loop. 3875 */ 3876 if (now < (bdev_io->internal.submit_tsc + 3877 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3878 goto end; 3879 } 3880 3881 if (bdev_io->internal.desc == desc) { 3882 ctx->cb_fn(ctx->cb_arg, bdev_io); 3883 } 3884 } 3885 3886 end: 3887 spdk_bdev_for_each_channel_continue(i, 0); 3888 } 3889 3890 static int 3891 bdev_poll_timeout_io(void *arg) 3892 { 3893 struct spdk_bdev_desc *desc = arg; 3894 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3895 struct poll_timeout_ctx *ctx; 3896 3897 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3898 if (!ctx) { 3899 SPDK_ERRLOG("failed to allocate memory\n"); 3900 return SPDK_POLLER_BUSY; 3901 } 3902 ctx->desc = desc; 3903 ctx->cb_arg = desc->cb_arg; 3904 ctx->cb_fn = desc->cb_fn; 3905 ctx->timeout_in_sec = desc->timeout_in_sec; 3906 3907 /* Take a ref on the descriptor in case it gets closed while we are checking 3908 * all of the channels. 
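 * The reference is dropped in bdev_channel_poll_timeout_io_done(), which also frees the descriptor if it was closed while the iteration was in flight.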
3909 */ 3910 spdk_spin_lock(&desc->spinlock); 3911 desc->refs++; 3912 spdk_spin_unlock(&desc->spinlock); 3913 3914 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3915 bdev_channel_poll_timeout_io_done); 3916 3917 return SPDK_POLLER_BUSY; 3918 } 3919 3920 int 3921 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3922 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3923 { 3924 assert(desc->thread == spdk_get_thread()); 3925 3926 spdk_poller_unregister(&desc->io_timeout_poller); 3927 3928 if (timeout_in_sec) { 3929 assert(cb_fn != NULL); 3930 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3931 desc, 3932 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3933 1000); 3934 if (desc->io_timeout_poller == NULL) { 3935 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3936 return -1; 3937 } 3938 } 3939 3940 desc->cb_fn = cb_fn; 3941 desc->cb_arg = cb_arg; 3942 desc->timeout_in_sec = timeout_in_sec; 3943 3944 return 0; 3945 } 3946 3947 static int 3948 bdev_channel_create(void *io_device, void *ctx_buf) 3949 { 3950 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3951 struct spdk_bdev_channel *ch = ctx_buf; 3952 struct spdk_io_channel *mgmt_io_ch; 3953 struct spdk_bdev_mgmt_channel *mgmt_ch; 3954 struct spdk_bdev_shared_resource *shared_resource; 3955 struct lba_range *range; 3956 3957 ch->bdev = bdev; 3958 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3959 if (!ch->channel) { 3960 return -1; 3961 } 3962 3963 ch->accel_channel = spdk_accel_get_io_channel(); 3964 if (!ch->accel_channel) { 3965 spdk_put_io_channel(ch->channel); 3966 return -1; 3967 } 3968 3969 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3970 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3971 3972 assert(ch->histogram == NULL); 3973 if (bdev->internal.histogram_enabled) { 3974 ch->histogram = spdk_histogram_data_alloc(); 3975 if (ch->histogram == NULL) { 3976 SPDK_ERRLOG("Could not allocate histogram\n"); 3977 } 3978 } 3979 3980 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3981 if (!mgmt_io_ch) { 3982 spdk_put_io_channel(ch->channel); 3983 spdk_put_io_channel(ch->accel_channel); 3984 return -1; 3985 } 3986 3987 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3988 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3989 if (shared_resource->shared_ch == ch->channel) { 3990 spdk_put_io_channel(mgmt_io_ch); 3991 shared_resource->ref++; 3992 break; 3993 } 3994 } 3995 3996 if (shared_resource == NULL) { 3997 shared_resource = calloc(1, sizeof(*shared_resource)); 3998 if (shared_resource == NULL) { 3999 spdk_put_io_channel(ch->channel); 4000 spdk_put_io_channel(ch->accel_channel); 4001 spdk_put_io_channel(mgmt_io_ch); 4002 return -1; 4003 } 4004 4005 shared_resource->mgmt_ch = mgmt_ch; 4006 shared_resource->io_outstanding = 0; 4007 TAILQ_INIT(&shared_resource->nomem_io); 4008 shared_resource->nomem_threshold = 0; 4009 shared_resource->shared_ch = ch->channel; 4010 shared_resource->ref = 1; 4011 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4012 } 4013 4014 ch->io_outstanding = 0; 4015 TAILQ_INIT(&ch->queued_resets); 4016 TAILQ_INIT(&ch->locked_ranges); 4017 ch->flags = 0; 4018 ch->shared_resource = shared_resource; 4019 4020 TAILQ_INIT(&ch->io_submitted); 4021 TAILQ_INIT(&ch->io_locked); 4022 TAILQ_INIT(&ch->io_accel_exec); 4023 TAILQ_INIT(&ch->io_memory_domain); 4024 4025 ch->stat = bdev_alloc_io_stat(false); 4026 if (ch->stat == NULL) { 4027 
bdev_channel_destroy_resource(ch); 4028 return -1; 4029 } 4030 4031 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4032 4033 #ifdef SPDK_CONFIG_VTUNE 4034 { 4035 char *name; 4036 __itt_init_ittlib(NULL, 0); 4037 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4038 if (!name) { 4039 bdev_channel_destroy_resource(ch); 4040 return -1; 4041 } 4042 ch->handle = __itt_string_handle_create(name); 4043 free(name); 4044 ch->start_tsc = spdk_get_ticks(); 4045 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4046 ch->prev_stat = bdev_alloc_io_stat(false); 4047 if (ch->prev_stat == NULL) { 4048 bdev_channel_destroy_resource(ch); 4049 return -1; 4050 } 4051 } 4052 #endif 4053 4054 spdk_spin_lock(&bdev->internal.spinlock); 4055 bdev_enable_qos(bdev, ch); 4056 4057 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4058 struct lba_range *new_range; 4059 4060 new_range = calloc(1, sizeof(*new_range)); 4061 if (new_range == NULL) { 4062 spdk_spin_unlock(&bdev->internal.spinlock); 4063 bdev_channel_destroy_resource(ch); 4064 return -1; 4065 } 4066 new_range->length = range->length; 4067 new_range->offset = range->offset; 4068 new_range->locked_ctx = range->locked_ctx; 4069 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4070 } 4071 4072 spdk_spin_unlock(&bdev->internal.spinlock); 4073 4074 return 0; 4075 } 4076 4077 static int 4078 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4079 void *cb_ctx) 4080 { 4081 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4082 struct spdk_bdev_io *bdev_io; 4083 uint64_t buf_len; 4084 4085 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4086 if (bdev_io->internal.ch == bdev_ch) { 4087 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4088 spdk_iobuf_entry_abort(ch, entry, buf_len); 4089 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4090 } 4091 4092 return 0; 4093 } 4094 4095 /* 4096 * Abort I/O that are waiting on a data buffer. 4097 */ 4098 static void 4099 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4100 { 4101 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4102 bdev_abort_all_buf_io_cb, ch); 4103 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4104 bdev_abort_all_buf_io_cb, ch); 4105 } 4106 4107 /* 4108 * Abort I/O that are queued waiting for submission. These types of I/O are 4109 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4110 */ 4111 static void 4112 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4113 { 4114 struct spdk_bdev_io *bdev_io, *tmp; 4115 4116 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4117 if (bdev_io->internal.ch == ch) { 4118 TAILQ_REMOVE(queue, bdev_io, internal.link); 4119 /* 4120 * spdk_bdev_io_complete() assumes that the completed I/O had 4121 * been submitted to the bdev module. Since in this case it 4122 * hadn't, bump io_outstanding to account for the decrement 4123 * that spdk_bdev_io_complete() will do. 
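 * Reset I/Os are not counted in io_outstanding at submission time, hence the type check below.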
4124 */ 4125 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4126 bdev_io_increment_outstanding(ch, ch->shared_resource); 4127 } 4128 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4129 } 4130 } 4131 } 4132 4133 static bool 4134 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4135 { 4136 struct spdk_bdev_io *bdev_io; 4137 4138 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4139 if (bdev_io == bio_to_abort) { 4140 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4141 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4142 return true; 4143 } 4144 } 4145 4146 return false; 4147 } 4148 4149 static int 4150 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4151 { 4152 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4153 uint64_t buf_len; 4154 4155 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4156 if (bdev_io == bio_to_abort) { 4157 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4158 spdk_iobuf_entry_abort(ch, entry, buf_len); 4159 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4160 return 1; 4161 } 4162 4163 return 0; 4164 } 4165 4166 static bool 4167 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4168 { 4169 int rc; 4170 4171 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4172 bdev_abort_buf_io_cb, bio_to_abort); 4173 if (rc == 1) { 4174 return true; 4175 } 4176 4177 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4178 bdev_abort_buf_io_cb, bio_to_abort); 4179 return rc == 1; 4180 } 4181 4182 static void 4183 bdev_qos_channel_destroy(void *cb_arg) 4184 { 4185 struct spdk_bdev_qos *qos = cb_arg; 4186 4187 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4188 spdk_poller_unregister(&qos->poller); 4189 4190 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4191 4192 free(qos); 4193 } 4194 4195 static int 4196 bdev_qos_destroy(struct spdk_bdev *bdev) 4197 { 4198 int i; 4199 4200 /* 4201 * Cleanly shutting down the QoS poller is tricky, because 4202 * during the asynchronous operation the user could open 4203 * a new descriptor and create a new channel, spawning 4204 * a new QoS poller. 4205 * 4206 * The strategy is to create a new QoS structure here and swap it 4207 * in. The shutdown path then continues to refer to the old one 4208 * until it completes and then releases it. 4209 */ 4210 struct spdk_bdev_qos *new_qos, *old_qos; 4211 4212 old_qos = bdev->internal.qos; 4213 4214 new_qos = calloc(1, sizeof(*new_qos)); 4215 if (!new_qos) { 4216 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4217 return -ENOMEM; 4218 } 4219 4220 /* Copy the old QoS data into the newly allocated structure */ 4221 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4222 4223 /* Zero out the key parts of the QoS structure */ 4224 new_qos->ch = NULL; 4225 new_qos->thread = NULL; 4226 new_qos->poller = NULL; 4227 TAILQ_INIT(&new_qos->queued); 4228 /* 4229 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4230 * It will be used later for the new QoS structure. 
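 * Only the per-timeslice bookkeeping below is reset; the configured limits copied above carry over to the new structure.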
4231 */ 4232 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4233 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4234 new_qos->rate_limits[i].min_per_timeslice = 0; 4235 new_qos->rate_limits[i].max_per_timeslice = 0; 4236 } 4237 4238 bdev->internal.qos = new_qos; 4239 4240 if (old_qos->thread == NULL) { 4241 free(old_qos); 4242 } else { 4243 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4244 } 4245 4246 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4247 * been destroyed yet. The destruction path will end up waiting for the final 4248 * channel to be put before it releases resources. */ 4249 4250 return 0; 4251 } 4252 4253 void 4254 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4255 { 4256 total->bytes_read += add->bytes_read; 4257 total->num_read_ops += add->num_read_ops; 4258 total->bytes_written += add->bytes_written; 4259 total->num_write_ops += add->num_write_ops; 4260 total->bytes_unmapped += add->bytes_unmapped; 4261 total->num_unmap_ops += add->num_unmap_ops; 4262 total->bytes_copied += add->bytes_copied; 4263 total->num_copy_ops += add->num_copy_ops; 4264 total->read_latency_ticks += add->read_latency_ticks; 4265 total->write_latency_ticks += add->write_latency_ticks; 4266 total->unmap_latency_ticks += add->unmap_latency_ticks; 4267 total->copy_latency_ticks += add->copy_latency_ticks; 4268 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4269 total->max_read_latency_ticks = add->max_read_latency_ticks; 4270 } 4271 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4272 total->min_read_latency_ticks = add->min_read_latency_ticks; 4273 } 4274 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4275 total->max_write_latency_ticks = add->max_write_latency_ticks; 4276 } 4277 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4278 total->min_write_latency_ticks = add->min_write_latency_ticks; 4279 } 4280 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4281 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4282 } 4283 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4284 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4285 } 4286 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4287 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4288 } 4289 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4290 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4291 } 4292 } 4293 4294 static void 4295 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4296 { 4297 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4298 4299 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4300 memcpy(to_stat->io_error, from_stat->io_error, 4301 sizeof(struct spdk_bdev_io_error_stat)); 4302 } 4303 } 4304 4305 void 4306 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4307 { 4308 stat->max_read_latency_ticks = 0; 4309 stat->min_read_latency_ticks = UINT64_MAX; 4310 stat->max_write_latency_ticks = 0; 4311 stat->min_write_latency_ticks = UINT64_MAX; 4312 stat->max_unmap_latency_ticks = 0; 4313 stat->min_unmap_latency_ticks = UINT64_MAX; 4314 stat->max_copy_latency_ticks = 0; 4315 stat->min_copy_latency_ticks = UINT64_MAX; 4316 4317 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4318 return; 4319 } 
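/* For SPDK_BDEV_RESET_STAT_ALL, additionally clear the cumulative byte/op counters, the accumulated latencies, and the per-status error counts below. */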
4320 4321 stat->bytes_read = 0; 4322 stat->num_read_ops = 0; 4323 stat->bytes_written = 0; 4324 stat->num_write_ops = 0; 4325 stat->bytes_unmapped = 0; 4326 stat->num_unmap_ops = 0; 4327 stat->bytes_copied = 0; 4328 stat->num_copy_ops = 0; 4329 stat->read_latency_ticks = 0; 4330 stat->write_latency_ticks = 0; 4331 stat->unmap_latency_ticks = 0; 4332 stat->copy_latency_ticks = 0; 4333 4334 if (stat->io_error != NULL) { 4335 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4336 } 4337 } 4338 4339 struct spdk_bdev_io_stat * 4340 bdev_alloc_io_stat(bool io_error_stat) 4341 { 4342 struct spdk_bdev_io_stat *stat; 4343 4344 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4345 if (stat == NULL) { 4346 return NULL; 4347 } 4348 4349 if (io_error_stat) { 4350 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4351 if (stat->io_error == NULL) { 4352 free(stat); 4353 return NULL; 4354 } 4355 } else { 4356 stat->io_error = NULL; 4357 } 4358 4359 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4360 4361 return stat; 4362 } 4363 4364 void 4365 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4366 { 4367 if (stat != NULL) { 4368 free(stat->io_error); 4369 free(stat); 4370 } 4371 } 4372 4373 void 4374 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4375 { 4376 int i; 4377 4378 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4379 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4380 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4381 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4382 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4383 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4384 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4385 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4386 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4387 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4388 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4389 stat->min_read_latency_ticks != UINT64_MAX ? 4390 stat->min_read_latency_ticks : 0); 4391 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4392 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4393 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4394 stat->min_write_latency_ticks != UINT64_MAX ? 4395 stat->min_write_latency_ticks : 0); 4396 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4397 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4398 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4399 stat->min_unmap_latency_ticks != UINT64_MAX ? 4400 stat->min_unmap_latency_ticks : 0); 4401 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4402 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4403 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4404 stat->min_copy_latency_ticks != UINT64_MAX ? 
4405 stat->min_copy_latency_ticks : 0); 4406 4407 if (stat->io_error != NULL) { 4408 spdk_json_write_named_object_begin(w, "io_error"); 4409 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4410 if (stat->io_error->error_status[i] != 0) { 4411 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4412 stat->io_error->error_status[i]); 4413 } 4414 } 4415 spdk_json_write_object_end(w); 4416 } 4417 } 4418 4419 static void 4420 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4421 { 4422 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4423 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4424 4425 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4426 bdev_abort_all_buf_io(mgmt_ch, ch); 4427 } 4428 4429 static void 4430 bdev_channel_destroy(void *io_device, void *ctx_buf) 4431 { 4432 struct spdk_bdev_channel *ch = ctx_buf; 4433 4434 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4435 spdk_get_thread()); 4436 4437 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 4438 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4439 4440 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4441 spdk_spin_lock(&ch->bdev->internal.spinlock); 4442 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4443 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4444 4445 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4446 4447 bdev_channel_abort_queued_ios(ch); 4448 4449 if (ch->histogram) { 4450 spdk_histogram_data_free(ch->histogram); 4451 } 4452 4453 bdev_channel_destroy_resource(ch); 4454 } 4455 4456 /* 4457 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4458 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
4459 */ 4460 static int 4461 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4462 { 4463 struct spdk_bdev_name *tmp; 4464 4465 bdev_name->name = strdup(name); 4466 if (bdev_name->name == NULL) { 4467 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4468 return -ENOMEM; 4469 } 4470 4471 bdev_name->bdev = bdev; 4472 4473 spdk_spin_lock(&g_bdev_mgr.spinlock); 4474 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4475 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4476 4477 if (tmp != NULL) { 4478 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4479 free(bdev_name->name); 4480 return -EEXIST; 4481 } 4482 4483 return 0; 4484 } 4485 4486 static void 4487 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4488 { 4489 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4490 free(bdev_name->name); 4491 } 4492 4493 static void 4494 bdev_name_del(struct spdk_bdev_name *bdev_name) 4495 { 4496 spdk_spin_lock(&g_bdev_mgr.spinlock); 4497 bdev_name_del_unsafe(bdev_name); 4498 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4499 } 4500 4501 int 4502 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4503 { 4504 struct spdk_bdev_alias *tmp; 4505 int ret; 4506 4507 if (alias == NULL) { 4508 SPDK_ERRLOG("Empty alias passed\n"); 4509 return -EINVAL; 4510 } 4511 4512 tmp = calloc(1, sizeof(*tmp)); 4513 if (tmp == NULL) { 4514 SPDK_ERRLOG("Unable to allocate alias\n"); 4515 return -ENOMEM; 4516 } 4517 4518 ret = bdev_name_add(&tmp->alias, bdev, alias); 4519 if (ret != 0) { 4520 free(tmp); 4521 return ret; 4522 } 4523 4524 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4525 4526 return 0; 4527 } 4528 4529 static int 4530 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4531 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4532 { 4533 struct spdk_bdev_alias *tmp; 4534 4535 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4536 if (strcmp(alias, tmp->alias.name) == 0) { 4537 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4538 alias_del_fn(&tmp->alias); 4539 free(tmp); 4540 return 0; 4541 } 4542 } 4543 4544 return -ENOENT; 4545 } 4546 4547 int 4548 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4549 { 4550 int rc; 4551 4552 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4553 if (rc == -ENOENT) { 4554 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4555 } 4556 4557 return rc; 4558 } 4559 4560 void 4561 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4562 { 4563 struct spdk_bdev_alias *p, *tmp; 4564 4565 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4566 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4567 bdev_name_del(&p->alias); 4568 free(p); 4569 } 4570 } 4571 4572 struct spdk_io_channel * 4573 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4574 { 4575 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4576 } 4577 4578 void * 4579 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4580 { 4581 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4582 void *ctx = NULL; 4583 4584 if (bdev->fn_table->get_module_ctx) { 4585 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4586 } 4587 4588 return ctx; 4589 } 4590 4591 const char * 4592 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4593 { 4594 return bdev->module->name; 4595 } 4596 4597 const char * 4598 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4599 { 4600 return bdev->name; 4601 } 4602 4603 const char * 4604 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4605 { 4606 return bdev->product_name; 4607 } 4608 4609 
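/*
 * Usage sketch for the alias and channel helpers above (illustrative only; the
 * bdev/desc variables and the alias name below are hypothetical and not part of
 * this file):
 *
 *	struct spdk_io_channel *io_ch;
 *	int rc;
 *
 *	rc = spdk_bdev_alias_add(bdev, "mydisk_alias0");
 *	if (rc != 0) {
 *		// -EEXIST if the name is already registered, -ENOMEM on allocation failure
 *	}
 *
 *	io_ch = spdk_bdev_get_io_channel(desc);
 *	if (io_ch == NULL) {
 *		// no per-thread I/O channel could be created
 *	}
 *	...
 *	spdk_put_io_channel(io_ch);
 *	spdk_bdev_alias_del(bdev, "mydisk_alias0");
 */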
const struct spdk_bdev_aliases_list * 4610 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4611 { 4612 return &bdev->aliases; 4613 } 4614 4615 uint32_t 4616 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4617 { 4618 return bdev->blocklen; 4619 } 4620 4621 uint32_t 4622 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4623 { 4624 return bdev->write_unit_size; 4625 } 4626 4627 uint64_t 4628 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4629 { 4630 return bdev->blockcnt; 4631 } 4632 4633 const char * 4634 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4635 { 4636 return qos_rpc_type[type]; 4637 } 4638 4639 void 4640 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4641 { 4642 int i; 4643 4644 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4645 4646 spdk_spin_lock(&bdev->internal.spinlock); 4647 if (bdev->internal.qos) { 4648 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4649 if (bdev->internal.qos->rate_limits[i].limit != 4650 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4651 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4652 if (bdev_qos_is_iops_rate_limit(i) == false) { 4653 /* Change from Byte to Megabyte which is user visible. */ 4654 limits[i] = limits[i] / 1024 / 1024; 4655 } 4656 } 4657 } 4658 } 4659 spdk_spin_unlock(&bdev->internal.spinlock); 4660 } 4661 4662 size_t 4663 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4664 { 4665 return 1 << bdev->required_alignment; 4666 } 4667 4668 uint32_t 4669 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4670 { 4671 return bdev->optimal_io_boundary; 4672 } 4673 4674 bool 4675 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4676 { 4677 return bdev->write_cache; 4678 } 4679 4680 const struct spdk_uuid * 4681 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4682 { 4683 return &bdev->uuid; 4684 } 4685 4686 uint16_t 4687 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4688 { 4689 return bdev->acwu; 4690 } 4691 4692 uint32_t 4693 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4694 { 4695 return bdev->md_len; 4696 } 4697 4698 bool 4699 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4700 { 4701 return (bdev->md_len != 0) && bdev->md_interleave; 4702 } 4703 4704 bool 4705 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4706 { 4707 return (bdev->md_len != 0) && !bdev->md_interleave; 4708 } 4709 4710 bool 4711 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4712 { 4713 return bdev->zoned; 4714 } 4715 4716 uint32_t 4717 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4718 { 4719 if (spdk_bdev_is_md_interleaved(bdev)) { 4720 return bdev->blocklen - bdev->md_len; 4721 } else { 4722 return bdev->blocklen; 4723 } 4724 } 4725 4726 uint32_t 4727 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4728 { 4729 return bdev->phys_blocklen; 4730 } 4731 4732 static uint32_t 4733 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4734 { 4735 if (!spdk_bdev_is_md_interleaved(bdev)) { 4736 return bdev->blocklen + bdev->md_len; 4737 } else { 4738 return bdev->blocklen; 4739 } 4740 } 4741 4742 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4743 typedef enum spdk_dif_type spdk_dif_type_t; 4744 4745 spdk_dif_type_t 4746 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4747 { 4748 if (bdev->md_len != 0) { 4749 return bdev->dif_type; 4750 } else { 4751 return SPDK_DIF_DISABLE; 4752 } 4753 } 4754 4755 bool 4756 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4757 { 4758 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4759 return bdev->dif_is_head_of_md; 4760 } else { 4761 return false; 4762 } 4763 } 4764 4765 bool 4766 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4767 enum spdk_dif_check_type check_type) 4768 { 4769 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4770 return false; 4771 } 4772 4773 switch (check_type) { 4774 case SPDK_DIF_CHECK_TYPE_REFTAG: 4775 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4776 case SPDK_DIF_CHECK_TYPE_APPTAG: 4777 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4778 case SPDK_DIF_CHECK_TYPE_GUARD: 4779 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4780 default: 4781 return false; 4782 } 4783 } 4784 4785 static uint32_t 4786 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4787 { 4788 uint64_t aligned_length, max_write_blocks; 4789 4790 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4791 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4792 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4793 4794 return max_write_blocks; 4795 } 4796 4797 uint32_t 4798 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4799 { 4800 return bdev->max_copy; 4801 } 4802 4803 uint64_t 4804 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4805 { 4806 return bdev->internal.measured_queue_depth; 4807 } 4808 4809 uint64_t 4810 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4811 { 4812 return bdev->internal.period; 4813 } 4814 4815 uint64_t 4816 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4817 { 4818 return bdev->internal.weighted_io_time; 4819 } 4820 4821 uint64_t 4822 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4823 { 4824 return bdev->internal.io_time; 4825 } 4826 4827 static void bdev_update_qd_sampling_period(void *ctx); 4828 4829 static void 4830 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4831 { 4832 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4833 4834 if (bdev->internal.measured_queue_depth) { 4835 bdev->internal.io_time += bdev->internal.period; 4836 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4837 } 4838 4839 bdev->internal.qd_poll_in_progress = false; 4840 4841 bdev_update_qd_sampling_period(bdev); 4842 } 4843 4844 static void 4845 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4846 struct spdk_io_channel *io_ch, void *_ctx) 4847 { 4848 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4849 4850 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4851 spdk_bdev_for_each_channel_continue(i, 0); 4852 } 4853 4854 static int 4855 bdev_calculate_measured_queue_depth(void *ctx) 4856 { 4857 struct spdk_bdev *bdev = ctx; 4858 4859 bdev->internal.qd_poll_in_progress = true; 4860 bdev->internal.temporary_queue_depth = 0; 4861 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4862 return SPDK_POLLER_BUSY; 4863 } 4864 4865 static void 4866 bdev_update_qd_sampling_period(void *ctx) 4867 { 4868 
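/* If a queue-depth poll is still in flight, defer the period switch; _calculate_measured_qd_cpl() will call this function again once the poll completes. */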
struct spdk_bdev *bdev = ctx; 4869 4870 if (bdev->internal.period == bdev->internal.new_period) { 4871 return; 4872 } 4873 4874 if (bdev->internal.qd_poll_in_progress) { 4875 return; 4876 } 4877 4878 bdev->internal.period = bdev->internal.new_period; 4879 4880 spdk_poller_unregister(&bdev->internal.qd_poller); 4881 if (bdev->internal.period != 0) { 4882 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4883 bdev, bdev->internal.period); 4884 } else { 4885 spdk_bdev_close(bdev->internal.qd_desc); 4886 bdev->internal.qd_desc = NULL; 4887 } 4888 } 4889 4890 static void 4891 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4892 { 4893 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4894 } 4895 4896 void 4897 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4898 { 4899 int rc; 4900 4901 if (bdev->internal.new_period == period) { 4902 return; 4903 } 4904 4905 bdev->internal.new_period = period; 4906 4907 if (bdev->internal.qd_desc != NULL) { 4908 assert(bdev->internal.period != 0); 4909 4910 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4911 bdev_update_qd_sampling_period, bdev); 4912 return; 4913 } 4914 4915 assert(bdev->internal.period == 0); 4916 4917 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4918 NULL, &bdev->internal.qd_desc); 4919 if (rc != 0) { 4920 return; 4921 } 4922 4923 bdev->internal.period = period; 4924 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4925 bdev, period); 4926 } 4927 4928 struct bdev_get_current_qd_ctx { 4929 uint64_t current_qd; 4930 spdk_bdev_get_current_qd_cb cb_fn; 4931 void *cb_arg; 4932 }; 4933 4934 static void 4935 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4936 { 4937 struct bdev_get_current_qd_ctx *ctx = _ctx; 4938 4939 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4940 4941 free(ctx); 4942 } 4943 4944 static void 4945 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4946 struct spdk_io_channel *io_ch, void *_ctx) 4947 { 4948 struct bdev_get_current_qd_ctx *ctx = _ctx; 4949 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4950 4951 ctx->current_qd += bdev_ch->io_outstanding; 4952 4953 spdk_bdev_for_each_channel_continue(i, 0); 4954 } 4955 4956 void 4957 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4958 void *cb_arg) 4959 { 4960 struct bdev_get_current_qd_ctx *ctx; 4961 4962 assert(cb_fn != NULL); 4963 4964 ctx = calloc(1, sizeof(*ctx)); 4965 if (ctx == NULL) { 4966 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4967 return; 4968 } 4969 4970 ctx->cb_fn = cb_fn; 4971 ctx->cb_arg = cb_arg; 4972 4973 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4974 } 4975 4976 static void 4977 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4978 { 4979 assert(desc->thread == spdk_get_thread()); 4980 4981 spdk_spin_lock(&desc->spinlock); 4982 desc->refs--; 4983 if (!desc->closed) { 4984 spdk_spin_unlock(&desc->spinlock); 4985 desc->callback.event_fn(type, 4986 desc->bdev, 4987 desc->callback.ctx); 4988 return; 4989 } else if (desc->refs == 0) { 4990 /* This descriptor was closed after this event_notify message was sent. 4991 * spdk_bdev_close() could not free the descriptor since this message was 4992 * in flight, so we free it now using bdev_desc_free(). 
4993 */ 4994 spdk_spin_unlock(&desc->spinlock); 4995 bdev_desc_free(desc); 4996 return; 4997 } 4998 spdk_spin_unlock(&desc->spinlock); 4999 } 5000 5001 static void 5002 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5003 { 5004 spdk_spin_lock(&desc->spinlock); 5005 desc->refs++; 5006 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5007 spdk_spin_unlock(&desc->spinlock); 5008 } 5009 5010 static void 5011 _resize_notify(void *ctx) 5012 { 5013 struct spdk_bdev_desc *desc = ctx; 5014 5015 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5016 } 5017 5018 int 5019 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5020 { 5021 struct spdk_bdev_desc *desc; 5022 int ret; 5023 5024 if (size == bdev->blockcnt) { 5025 return 0; 5026 } 5027 5028 spdk_spin_lock(&bdev->internal.spinlock); 5029 5030 /* bdev has open descriptors */ 5031 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5032 bdev->blockcnt > size) { 5033 ret = -EBUSY; 5034 } else { 5035 bdev->blockcnt = size; 5036 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5037 event_notify(desc, _resize_notify); 5038 } 5039 ret = 0; 5040 } 5041 5042 spdk_spin_unlock(&bdev->internal.spinlock); 5043 5044 return ret; 5045 } 5046 5047 /* 5048 * Convert I/O offset and length from bytes to blocks. 5049 * 5050 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5051 */ 5052 static uint64_t 5053 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 5054 uint64_t num_bytes, uint64_t *num_blocks) 5055 { 5056 uint32_t block_size = bdev->blocklen; 5057 uint8_t shift_cnt; 5058 5059 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 5060 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5061 shift_cnt = spdk_u32log2(block_size); 5062 *offset_blocks = offset_bytes >> shift_cnt; 5063 *num_blocks = num_bytes >> shift_cnt; 5064 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5065 (num_bytes - (*num_blocks << shift_cnt)); 5066 } else { 5067 *offset_blocks = offset_bytes / block_size; 5068 *num_blocks = num_bytes / block_size; 5069 return (offset_bytes % block_size) | (num_bytes % block_size); 5070 } 5071 } 5072 5073 static bool 5074 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5075 { 5076 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5077 * has been an overflow and hence the offset has been wrapped around */ 5078 if (offset_blocks + num_blocks < offset_blocks) { 5079 return false; 5080 } 5081 5082 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5083 if (offset_blocks + num_blocks > bdev->blockcnt) { 5084 return false; 5085 } 5086 5087 return true; 5088 } 5089 5090 static void 5091 bdev_seek_complete_cb(void *ctx) 5092 { 5093 struct spdk_bdev_io *bdev_io = ctx; 5094 5095 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5096 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5097 } 5098 5099 static int 5100 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5101 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5102 spdk_bdev_io_completion_cb cb, void *cb_arg) 5103 { 5104 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5105 struct spdk_bdev_io *bdev_io; 5106 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5107 5108 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == 
SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5109 5110 /* Check if offset_blocks is valid looking at the validity of one block */ 5111 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5112 return -EINVAL; 5113 } 5114 5115 bdev_io = bdev_channel_get_io(channel); 5116 if (!bdev_io) { 5117 return -ENOMEM; 5118 } 5119 5120 bdev_io->internal.ch = channel; 5121 bdev_io->internal.desc = desc; 5122 bdev_io->type = io_type; 5123 bdev_io->u.bdev.offset_blocks = offset_blocks; 5124 bdev_io->u.bdev.memory_domain = NULL; 5125 bdev_io->u.bdev.memory_domain_ctx = NULL; 5126 bdev_io->u.bdev.accel_sequence = NULL; 5127 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5128 5129 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5130 /* In case bdev doesn't support seek to next data/hole offset, 5131 * it is assumed that only data and no holes are present */ 5132 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5133 bdev_io->u.bdev.seek.offset = offset_blocks; 5134 } else { 5135 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5136 } 5137 5138 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5139 return 0; 5140 } 5141 5142 bdev_io_submit(bdev_io); 5143 return 0; 5144 } 5145 5146 int 5147 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5148 uint64_t offset_blocks, 5149 spdk_bdev_io_completion_cb cb, void *cb_arg) 5150 { 5151 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5152 } 5153 5154 int 5155 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5156 uint64_t offset_blocks, 5157 spdk_bdev_io_completion_cb cb, void *cb_arg) 5158 { 5159 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5160 } 5161 5162 uint64_t 5163 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 5164 { 5165 return bdev_io->u.bdev.seek.offset; 5166 } 5167 5168 static int 5169 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5170 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5171 spdk_bdev_io_completion_cb cb, void *cb_arg) 5172 { 5173 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5174 struct spdk_bdev_io *bdev_io; 5175 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5176 5177 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5178 return -EINVAL; 5179 } 5180 5181 bdev_io = bdev_channel_get_io(channel); 5182 if (!bdev_io) { 5183 return -ENOMEM; 5184 } 5185 5186 bdev_io->internal.ch = channel; 5187 bdev_io->internal.desc = desc; 5188 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5189 bdev_io->u.bdev.iovs = &bdev_io->iov; 5190 bdev_io->u.bdev.iovs[0].iov_base = buf; 5191 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5192 bdev_io->u.bdev.iovcnt = 1; 5193 bdev_io->u.bdev.md_buf = md_buf; 5194 bdev_io->u.bdev.num_blocks = num_blocks; 5195 bdev_io->u.bdev.offset_blocks = offset_blocks; 5196 bdev_io->u.bdev.memory_domain = NULL; 5197 bdev_io->u.bdev.memory_domain_ctx = NULL; 5198 bdev_io->u.bdev.accel_sequence = NULL; 5199 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5200 5201 bdev_io_submit(bdev_io); 5202 return 0; 5203 } 5204 5205 int 5206 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5207 void *buf, uint64_t offset, uint64_t nbytes, 5208 spdk_bdev_io_completion_cb cb, void *cb_arg) 5209 { 5210 uint64_t offset_blocks, num_blocks; 5211 5212 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5213 nbytes, &num_blocks) != 0) { 5214 return -EINVAL; 5215 } 5216 5217 
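/* Illustrative example of the conversion above: with a 512-byte block size, offset = 4096 and nbytes = 8192 map to offset_blocks = 8 and num_blocks = 16; a non-zero remainder in either division would have failed the check above. */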
return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5218 } 5219 5220 int 5221 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5222 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5223 spdk_bdev_io_completion_cb cb, void *cb_arg) 5224 { 5225 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5226 } 5227 5228 int 5229 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5230 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5231 spdk_bdev_io_completion_cb cb, void *cb_arg) 5232 { 5233 struct iovec iov = { 5234 .iov_base = buf, 5235 }; 5236 5237 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5238 return -EINVAL; 5239 } 5240 5241 if (md_buf && !_is_buf_allocated(&iov)) { 5242 return -EINVAL; 5243 } 5244 5245 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5246 cb, cb_arg); 5247 } 5248 5249 int 5250 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5251 struct iovec *iov, int iovcnt, 5252 uint64_t offset, uint64_t nbytes, 5253 spdk_bdev_io_completion_cb cb, void *cb_arg) 5254 { 5255 uint64_t offset_blocks, num_blocks; 5256 5257 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5258 nbytes, &num_blocks) != 0) { 5259 return -EINVAL; 5260 } 5261 5262 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5263 } 5264 5265 static int 5266 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5267 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5268 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5269 struct spdk_accel_sequence *seq, 5270 spdk_bdev_io_completion_cb cb, void *cb_arg) 5271 { 5272 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5273 struct spdk_bdev_io *bdev_io; 5274 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5275 5276 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5277 return -EINVAL; 5278 } 5279 5280 bdev_io = bdev_channel_get_io(channel); 5281 if (!bdev_io) { 5282 return -ENOMEM; 5283 } 5284 5285 bdev_io->internal.ch = channel; 5286 bdev_io->internal.desc = desc; 5287 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5288 bdev_io->u.bdev.iovs = iov; 5289 bdev_io->u.bdev.iovcnt = iovcnt; 5290 bdev_io->u.bdev.md_buf = md_buf; 5291 bdev_io->u.bdev.num_blocks = num_blocks; 5292 bdev_io->u.bdev.offset_blocks = offset_blocks; 5293 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5294 bdev_io->internal.memory_domain = domain; 5295 bdev_io->internal.memory_domain_ctx = domain_ctx; 5296 bdev_io->internal.accel_sequence = seq; 5297 bdev_io->internal.has_accel_sequence = seq != NULL; 5298 bdev_io->u.bdev.memory_domain = domain; 5299 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5300 bdev_io->u.bdev.accel_sequence = seq; 5301 5302 _bdev_io_submit_ext(desc, bdev_io); 5303 5304 return 0; 5305 } 5306 5307 int 5308 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5309 struct iovec *iov, int iovcnt, 5310 uint64_t offset_blocks, uint64_t num_blocks, 5311 spdk_bdev_io_completion_cb cb, void *cb_arg) 5312 { 5313 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5314 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5315 } 5316 5317 int 5318 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5319 struct 
iovec *iov, int iovcnt, void *md_buf, 5320 uint64_t offset_blocks, uint64_t num_blocks, 5321 spdk_bdev_io_completion_cb cb, void *cb_arg) 5322 { 5323 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5324 return -EINVAL; 5325 } 5326 5327 if (md_buf && !_is_buf_allocated(iov)) { 5328 return -EINVAL; 5329 } 5330 5331 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5332 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5333 } 5334 5335 static inline bool 5336 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5337 { 5338 /* 5339 * Check that opts->size is at least as large as spdk_bdev_ext_io_opts was when it 5340 * was first introduced (ac6f2bdd8d), since access to the members added later 5341 * is not otherwise validated internally. 5342 */ 5343 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5344 sizeof(opts->metadata) && 5345 opts->size <= sizeof(*opts) && 5346 /* When a memory domain is used, the user must provide data buffers */ 5347 (!opts->memory_domain || (iov && iov[0].iov_base)); 5348 } 5349 5350 int 5351 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5352 struct iovec *iov, int iovcnt, 5353 uint64_t offset_blocks, uint64_t num_blocks, 5354 spdk_bdev_io_completion_cb cb, void *cb_arg, 5355 struct spdk_bdev_ext_io_opts *opts) 5356 { 5357 void *md = NULL; 5358 5359 if (opts) { 5360 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5361 return -EINVAL; 5362 } 5363 md = opts->metadata; 5364 } 5365 5366 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5367 return -EINVAL; 5368 } 5369 5370 if (md && !_is_buf_allocated(iov)) { 5371 return -EINVAL; 5372 } 5373 5374 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5375 num_blocks, 5376 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5377 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5378 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5379 cb, cb_arg); 5380 } 5381 5382 static int 5383 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5384 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5385 spdk_bdev_io_completion_cb cb, void *cb_arg) 5386 { 5387 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5388 struct spdk_bdev_io *bdev_io; 5389 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5390 5391 if (!desc->write) { 5392 return -EBADF; 5393 } 5394 5395 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5396 return -EINVAL; 5397 } 5398 5399 bdev_io = bdev_channel_get_io(channel); 5400 if (!bdev_io) { 5401 return -ENOMEM; 5402 } 5403 5404 bdev_io->internal.ch = channel; 5405 bdev_io->internal.desc = desc; 5406 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5407 bdev_io->u.bdev.iovs = &bdev_io->iov; 5408 bdev_io->u.bdev.iovs[0].iov_base = buf; 5409 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5410 bdev_io->u.bdev.iovcnt = 1; 5411 bdev_io->u.bdev.md_buf = md_buf; 5412 bdev_io->u.bdev.num_blocks = num_blocks; 5413 bdev_io->u.bdev.offset_blocks = offset_blocks; 5414 bdev_io->u.bdev.memory_domain = NULL; 5415 bdev_io->u.bdev.memory_domain_ctx = NULL; 5416 bdev_io->u.bdev.accel_sequence = NULL; 5417 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5418 5419 bdev_io_submit(bdev_io); 5420 return 0; 5421 } 5422 5423 int 5424 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5425 void *buf, uint64_t offset, uint64_t nbytes, 5426 spdk_bdev_io_completion_cb cb, void *cb_arg) 5427 { 5428 uint64_t
offset_blocks, num_blocks; 5429 5430 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5431 nbytes, &num_blocks) != 0) { 5432 return -EINVAL; 5433 } 5434 5435 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5436 } 5437 5438 int 5439 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5440 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5441 spdk_bdev_io_completion_cb cb, void *cb_arg) 5442 { 5443 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5444 cb, cb_arg); 5445 } 5446 5447 int 5448 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5449 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5450 spdk_bdev_io_completion_cb cb, void *cb_arg) 5451 { 5452 struct iovec iov = { 5453 .iov_base = buf, 5454 }; 5455 5456 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5457 return -EINVAL; 5458 } 5459 5460 if (md_buf && !_is_buf_allocated(&iov)) { 5461 return -EINVAL; 5462 } 5463 5464 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5465 cb, cb_arg); 5466 } 5467 5468 static int 5469 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5470 struct iovec *iov, int iovcnt, void *md_buf, 5471 uint64_t offset_blocks, uint64_t num_blocks, 5472 struct spdk_memory_domain *domain, void *domain_ctx, 5473 struct spdk_accel_sequence *seq, 5474 spdk_bdev_io_completion_cb cb, void *cb_arg) 5475 { 5476 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5477 struct spdk_bdev_io *bdev_io; 5478 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5479 5480 if (!desc->write) { 5481 return -EBADF; 5482 } 5483 5484 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5485 return -EINVAL; 5486 } 5487 5488 bdev_io = bdev_channel_get_io(channel); 5489 if (!bdev_io) { 5490 return -ENOMEM; 5491 } 5492 5493 bdev_io->internal.ch = channel; 5494 bdev_io->internal.desc = desc; 5495 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5496 bdev_io->u.bdev.iovs = iov; 5497 bdev_io->u.bdev.iovcnt = iovcnt; 5498 bdev_io->u.bdev.md_buf = md_buf; 5499 bdev_io->u.bdev.num_blocks = num_blocks; 5500 bdev_io->u.bdev.offset_blocks = offset_blocks; 5501 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5502 bdev_io->internal.memory_domain = domain; 5503 bdev_io->internal.memory_domain_ctx = domain_ctx; 5504 bdev_io->internal.accel_sequence = seq; 5505 bdev_io->internal.has_accel_sequence = seq != NULL; 5506 bdev_io->u.bdev.memory_domain = domain; 5507 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5508 bdev_io->u.bdev.accel_sequence = seq; 5509 5510 _bdev_io_submit_ext(desc, bdev_io); 5511 5512 return 0; 5513 } 5514 5515 int 5516 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5517 struct iovec *iov, int iovcnt, 5518 uint64_t offset, uint64_t len, 5519 spdk_bdev_io_completion_cb cb, void *cb_arg) 5520 { 5521 uint64_t offset_blocks, num_blocks; 5522 5523 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5524 len, &num_blocks) != 0) { 5525 return -EINVAL; 5526 } 5527 5528 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5529 } 5530 5531 int 5532 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5533 struct iovec *iov, int iovcnt, 5534 uint64_t offset_blocks, uint64_t num_blocks, 5535 spdk_bdev_io_completion_cb cb, void *cb_arg) 5536 { 
5537 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5538 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5539 } 5540 5541 int 5542 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5543 struct iovec *iov, int iovcnt, void *md_buf, 5544 uint64_t offset_blocks, uint64_t num_blocks, 5545 spdk_bdev_io_completion_cb cb, void *cb_arg) 5546 { 5547 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5548 return -EINVAL; 5549 } 5550 5551 if (md_buf && !_is_buf_allocated(iov)) { 5552 return -EINVAL; 5553 } 5554 5555 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5556 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5557 } 5558 5559 int 5560 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5561 struct iovec *iov, int iovcnt, 5562 uint64_t offset_blocks, uint64_t num_blocks, 5563 spdk_bdev_io_completion_cb cb, void *cb_arg, 5564 struct spdk_bdev_ext_io_opts *opts) 5565 { 5566 void *md = NULL; 5567 5568 if (opts) { 5569 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5570 return -EINVAL; 5571 } 5572 md = opts->metadata; 5573 } 5574 5575 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5576 return -EINVAL; 5577 } 5578 5579 if (md && !_is_buf_allocated(iov)) { 5580 return -EINVAL; 5581 } 5582 5583 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5584 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5585 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5586 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5587 cb, cb_arg); 5588 } 5589 5590 static void 5591 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5592 { 5593 struct spdk_bdev_io *parent_io = cb_arg; 5594 struct spdk_bdev *bdev = parent_io->bdev; 5595 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5596 int i, rc = 0; 5597 5598 if (!success) { 5599 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5600 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5601 spdk_bdev_free_io(bdev_io); 5602 return; 5603 } 5604 5605 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5606 rc = memcmp(read_buf, 5607 parent_io->u.bdev.iovs[i].iov_base, 5608 parent_io->u.bdev.iovs[i].iov_len); 5609 if (rc) { 5610 break; 5611 } 5612 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5613 } 5614 5615 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5616 rc = memcmp(bdev_io->u.bdev.md_buf, 5617 parent_io->u.bdev.md_buf, 5618 spdk_bdev_get_md_size(bdev)); 5619 } 5620 5621 spdk_bdev_free_io(bdev_io); 5622 5623 if (rc == 0) { 5624 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5625 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5626 } else { 5627 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5628 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5629 } 5630 } 5631 5632 static void 5633 bdev_compare_do_read(void *_bdev_io) 5634 { 5635 struct spdk_bdev_io *bdev_io = _bdev_io; 5636 int rc; 5637 5638 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5639 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5640 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5641 bdev_compare_do_read_done, bdev_io); 5642 5643 if (rc == -ENOMEM) { 5644 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5645 } else if (rc != 0) { 5646 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5647 
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5648 } 5649 } 5650 5651 static int 5652 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5653 struct iovec *iov, int iovcnt, void *md_buf, 5654 uint64_t offset_blocks, uint64_t num_blocks, 5655 spdk_bdev_io_completion_cb cb, void *cb_arg) 5656 { 5657 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5658 struct spdk_bdev_io *bdev_io; 5659 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5660 5661 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5662 return -EINVAL; 5663 } 5664 5665 bdev_io = bdev_channel_get_io(channel); 5666 if (!bdev_io) { 5667 return -ENOMEM; 5668 } 5669 5670 bdev_io->internal.ch = channel; 5671 bdev_io->internal.desc = desc; 5672 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5673 bdev_io->u.bdev.iovs = iov; 5674 bdev_io->u.bdev.iovcnt = iovcnt; 5675 bdev_io->u.bdev.md_buf = md_buf; 5676 bdev_io->u.bdev.num_blocks = num_blocks; 5677 bdev_io->u.bdev.offset_blocks = offset_blocks; 5678 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5679 bdev_io->u.bdev.memory_domain = NULL; 5680 bdev_io->u.bdev.memory_domain_ctx = NULL; 5681 bdev_io->u.bdev.accel_sequence = NULL; 5682 5683 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5684 bdev_io_submit(bdev_io); 5685 return 0; 5686 } 5687 5688 bdev_compare_do_read(bdev_io); 5689 5690 return 0; 5691 } 5692 5693 int 5694 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5695 struct iovec *iov, int iovcnt, 5696 uint64_t offset_blocks, uint64_t num_blocks, 5697 spdk_bdev_io_completion_cb cb, void *cb_arg) 5698 { 5699 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5700 num_blocks, cb, cb_arg); 5701 } 5702 5703 int 5704 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5705 struct iovec *iov, int iovcnt, void *md_buf, 5706 uint64_t offset_blocks, uint64_t num_blocks, 5707 spdk_bdev_io_completion_cb cb, void *cb_arg) 5708 { 5709 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5710 return -EINVAL; 5711 } 5712 5713 if (md_buf && !_is_buf_allocated(iov)) { 5714 return -EINVAL; 5715 } 5716 5717 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5718 num_blocks, cb, cb_arg); 5719 } 5720 5721 static int 5722 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5723 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5724 spdk_bdev_io_completion_cb cb, void *cb_arg) 5725 { 5726 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5727 struct spdk_bdev_io *bdev_io; 5728 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5729 5730 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5731 return -EINVAL; 5732 } 5733 5734 bdev_io = bdev_channel_get_io(channel); 5735 if (!bdev_io) { 5736 return -ENOMEM; 5737 } 5738 5739 bdev_io->internal.ch = channel; 5740 bdev_io->internal.desc = desc; 5741 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5742 bdev_io->u.bdev.iovs = &bdev_io->iov; 5743 bdev_io->u.bdev.iovs[0].iov_base = buf; 5744 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5745 bdev_io->u.bdev.iovcnt = 1; 5746 bdev_io->u.bdev.md_buf = md_buf; 5747 bdev_io->u.bdev.num_blocks = num_blocks; 5748 bdev_io->u.bdev.offset_blocks = offset_blocks; 5749 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5750 bdev_io->u.bdev.memory_domain = NULL; 5751 bdev_io->u.bdev.memory_domain_ctx = 
NULL; 5752 bdev_io->u.bdev.accel_sequence = NULL; 5753 5754 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5755 bdev_io_submit(bdev_io); 5756 return 0; 5757 } 5758 5759 bdev_compare_do_read(bdev_io); 5760 5761 return 0; 5762 } 5763 5764 int 5765 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5766 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5767 spdk_bdev_io_completion_cb cb, void *cb_arg) 5768 { 5769 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5770 cb, cb_arg); 5771 } 5772 5773 int 5774 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5775 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5776 spdk_bdev_io_completion_cb cb, void *cb_arg) 5777 { 5778 struct iovec iov = { 5779 .iov_base = buf, 5780 }; 5781 5782 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5783 return -EINVAL; 5784 } 5785 5786 if (md_buf && !_is_buf_allocated(&iov)) { 5787 return -EINVAL; 5788 } 5789 5790 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5791 cb, cb_arg); 5792 } 5793 5794 static void 5795 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 5796 { 5797 struct spdk_bdev_io *bdev_io = ctx; 5798 5799 if (unlock_status) { 5800 SPDK_ERRLOG("LBA range unlock failed\n"); 5801 } 5802 5803 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5804 false, bdev_io->internal.caller_ctx); 5805 } 5806 5807 static void 5808 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5809 { 5810 bdev_io->internal.status = status; 5811 5812 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5813 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5814 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5815 } 5816 5817 static void 5818 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5819 { 5820 struct spdk_bdev_io *parent_io = cb_arg; 5821 5822 if (!success) { 5823 SPDK_ERRLOG("Compare and write operation failed\n"); 5824 } 5825 5826 spdk_bdev_free_io(bdev_io); 5827 5828 bdev_comparev_and_writev_blocks_unlock(parent_io, 5829 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5830 } 5831 5832 static void 5833 bdev_compare_and_write_do_write(void *_bdev_io) 5834 { 5835 struct spdk_bdev_io *bdev_io = _bdev_io; 5836 int rc; 5837 5838 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5839 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5840 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5841 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5842 bdev_compare_and_write_do_write_done, bdev_io); 5843 5844 5845 if (rc == -ENOMEM) { 5846 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5847 } else if (rc != 0) { 5848 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5849 } 5850 } 5851 5852 static void 5853 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5854 { 5855 struct spdk_bdev_io *parent_io = cb_arg; 5856 5857 spdk_bdev_free_io(bdev_io); 5858 5859 if (!success) { 5860 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5861 return; 5862 } 5863 5864 bdev_compare_and_write_do_write(parent_io); 5865 } 5866 5867 static void 5868 bdev_compare_and_write_do_compare(void *_bdev_io) 5869 { 5870 struct spdk_bdev_io *bdev_io = _bdev_io; 5871 int rc; 5872 5873 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5874 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5875 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5876 bdev_compare_and_write_do_compare_done, bdev_io); 5877 5878 if (rc == -ENOMEM) { 5879 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5880 } else if (rc != 0) { 5881 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5882 } 5883 } 5884 5885 static void 5886 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 5887 { 5888 struct spdk_bdev_io *bdev_io = ctx; 5889 5890 if (status) { 5891 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5892 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5893 return; 5894 } 5895 5896 bdev_compare_and_write_do_compare(bdev_io); 5897 } 5898 5899 int 5900 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5901 struct iovec *compare_iov, int compare_iovcnt, 5902 struct iovec *write_iov, int write_iovcnt, 5903 uint64_t offset_blocks, uint64_t num_blocks, 5904 spdk_bdev_io_completion_cb cb, void *cb_arg) 5905 { 5906 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5907 struct spdk_bdev_io *bdev_io; 5908 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5909 5910 if (!desc->write) { 5911 return -EBADF; 5912 } 5913 5914 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5915 return -EINVAL; 5916 } 5917 5918 if (num_blocks > bdev->acwu) { 5919 return -EINVAL; 5920 } 5921 5922 bdev_io = bdev_channel_get_io(channel); 5923 if (!bdev_io) { 5924 return -ENOMEM; 5925 } 5926 5927 bdev_io->internal.ch = channel; 5928 bdev_io->internal.desc = desc; 5929 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5930 bdev_io->u.bdev.iovs = compare_iov; 5931 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5932 bdev_io->u.bdev.fused_iovs = write_iov; 5933 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5934 bdev_io->u.bdev.md_buf = NULL; 5935 bdev_io->u.bdev.num_blocks = num_blocks; 5936 bdev_io->u.bdev.offset_blocks = offset_blocks; 5937 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5938 
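	/* Compare-and-write never uses the ext I/O path, so the memory-domain and
	 * accel-sequence fields below are cleared explicitly. If the module lacks
	 * native COMPARE_AND_WRITE support, the request is emulated further down
	 * by locking the LBA range, comparing, and then writing. */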
bdev_io->u.bdev.memory_domain = NULL; 5939 bdev_io->u.bdev.memory_domain_ctx = NULL; 5940 bdev_io->u.bdev.accel_sequence = NULL; 5941 5942 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5943 bdev_io_submit(bdev_io); 5944 return 0; 5945 } 5946 5947 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5948 bdev_comparev_and_writev_blocks_locked, bdev_io); 5949 } 5950 5951 int 5952 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5953 struct iovec *iov, int iovcnt, 5954 uint64_t offset_blocks, uint64_t num_blocks, 5955 bool populate, 5956 spdk_bdev_io_completion_cb cb, void *cb_arg) 5957 { 5958 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5959 struct spdk_bdev_io *bdev_io; 5960 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5961 5962 if (!desc->write) { 5963 return -EBADF; 5964 } 5965 5966 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5967 return -EINVAL; 5968 } 5969 5970 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5971 return -ENOTSUP; 5972 } 5973 5974 bdev_io = bdev_channel_get_io(channel); 5975 if (!bdev_io) { 5976 return -ENOMEM; 5977 } 5978 5979 bdev_io->internal.ch = channel; 5980 bdev_io->internal.desc = desc; 5981 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5982 bdev_io->u.bdev.num_blocks = num_blocks; 5983 bdev_io->u.bdev.offset_blocks = offset_blocks; 5984 bdev_io->u.bdev.iovs = iov; 5985 bdev_io->u.bdev.iovcnt = iovcnt; 5986 bdev_io->u.bdev.md_buf = NULL; 5987 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5988 bdev_io->u.bdev.zcopy.commit = 0; 5989 bdev_io->u.bdev.zcopy.start = 1; 5990 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5991 bdev_io->u.bdev.memory_domain = NULL; 5992 bdev_io->u.bdev.memory_domain_ctx = NULL; 5993 bdev_io->u.bdev.accel_sequence = NULL; 5994 5995 bdev_io_submit(bdev_io); 5996 5997 return 0; 5998 } 5999 6000 int 6001 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6002 spdk_bdev_io_completion_cb cb, void *cb_arg) 6003 { 6004 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6005 return -EINVAL; 6006 } 6007 6008 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 6009 bdev_io->u.bdev.zcopy.start = 0; 6010 bdev_io->internal.caller_ctx = cb_arg; 6011 bdev_io->internal.cb = cb; 6012 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 6013 6014 bdev_io_submit(bdev_io); 6015 6016 return 0; 6017 } 6018 6019 int 6020 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6021 uint64_t offset, uint64_t len, 6022 spdk_bdev_io_completion_cb cb, void *cb_arg) 6023 { 6024 uint64_t offset_blocks, num_blocks; 6025 6026 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6027 len, &num_blocks) != 0) { 6028 return -EINVAL; 6029 } 6030 6031 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6032 } 6033 6034 int 6035 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6036 uint64_t offset_blocks, uint64_t num_blocks, 6037 spdk_bdev_io_completion_cb cb, void *cb_arg) 6038 { 6039 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6040 struct spdk_bdev_io *bdev_io; 6041 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6042 6043 if (!desc->write) { 6044 return -EBADF; 6045 } 6046 6047 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6048 return -EINVAL; 6049 } 6050 6051 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 6052 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 6053 return -ENOTSUP; 6054 } 6055 6056 bdev_io = bdev_channel_get_io(channel); 6057 6058 if (!bdev_io) { 6059 return -ENOMEM; 6060 } 6061 6062 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 6063 bdev_io->internal.ch = channel; 6064 bdev_io->internal.desc = desc; 6065 bdev_io->u.bdev.offset_blocks = offset_blocks; 6066 bdev_io->u.bdev.num_blocks = num_blocks; 6067 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6068 bdev_io->u.bdev.memory_domain = NULL; 6069 bdev_io->u.bdev.memory_domain_ctx = NULL; 6070 bdev_io->u.bdev.accel_sequence = NULL; 6071 6072 /* If the write_zeroes size is large and should be split, use the generic split 6073 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not. 6074 * 6075 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 6076 * or emulate it using a regular write request otherwise. 
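 *
 * Note: the emulated path below services the request with regular WRITE(s)
 * whose payload comes from a preallocated zero buffer, which is why a single
 * block (including any metadata) is asserted to fit within ZERO_BUFFER_SIZE.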
6077 */ 6078 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6079 bdev_io->internal.split) { 6080 bdev_io_submit(bdev_io); 6081 return 0; 6082 } 6083 6084 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6085 6086 return bdev_write_zero_buffer(bdev_io); 6087 } 6088 6089 int 6090 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6091 uint64_t offset, uint64_t nbytes, 6092 spdk_bdev_io_completion_cb cb, void *cb_arg) 6093 { 6094 uint64_t offset_blocks, num_blocks; 6095 6096 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6097 nbytes, &num_blocks) != 0) { 6098 return -EINVAL; 6099 } 6100 6101 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6102 } 6103 6104 int 6105 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6106 uint64_t offset_blocks, uint64_t num_blocks, 6107 spdk_bdev_io_completion_cb cb, void *cb_arg) 6108 { 6109 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6110 struct spdk_bdev_io *bdev_io; 6111 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6112 6113 if (!desc->write) { 6114 return -EBADF; 6115 } 6116 6117 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6118 return -EINVAL; 6119 } 6120 6121 if (num_blocks == 0) { 6122 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 6123 return -EINVAL; 6124 } 6125 6126 bdev_io = bdev_channel_get_io(channel); 6127 if (!bdev_io) { 6128 return -ENOMEM; 6129 } 6130 6131 bdev_io->internal.ch = channel; 6132 bdev_io->internal.desc = desc; 6133 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6134 6135 bdev_io->u.bdev.iovs = &bdev_io->iov; 6136 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6137 bdev_io->u.bdev.iovs[0].iov_len = 0; 6138 bdev_io->u.bdev.iovcnt = 1; 6139 6140 bdev_io->u.bdev.offset_blocks = offset_blocks; 6141 bdev_io->u.bdev.num_blocks = num_blocks; 6142 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6143 bdev_io->u.bdev.memory_domain = NULL; 6144 bdev_io->u.bdev.memory_domain_ctx = NULL; 6145 bdev_io->u.bdev.accel_sequence = NULL; 6146 6147 bdev_io_submit(bdev_io); 6148 return 0; 6149 } 6150 6151 int 6152 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6153 uint64_t offset, uint64_t length, 6154 spdk_bdev_io_completion_cb cb, void *cb_arg) 6155 { 6156 uint64_t offset_blocks, num_blocks; 6157 6158 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6159 length, &num_blocks) != 0) { 6160 return -EINVAL; 6161 } 6162 6163 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6164 } 6165 6166 int 6167 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6168 uint64_t offset_blocks, uint64_t num_blocks, 6169 spdk_bdev_io_completion_cb cb, void *cb_arg) 6170 { 6171 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6172 struct spdk_bdev_io *bdev_io; 6173 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6174 6175 if (!desc->write) { 6176 return -EBADF; 6177 } 6178 6179 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6180 return -EINVAL; 6181 } 6182 6183 bdev_io = bdev_channel_get_io(channel); 6184 if (!bdev_io) { 6185 return -ENOMEM; 6186 } 6187 6188 bdev_io->internal.ch = channel; 6189 bdev_io->internal.desc = desc; 6190 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6191 bdev_io->u.bdev.iovs = NULL; 6192 bdev_io->u.bdev.iovcnt = 0; 6193 bdev_io->u.bdev.offset_blocks = offset_blocks; 6194 bdev_io->u.bdev.num_blocks = num_blocks; 6195 
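	/* A flush carries no data buffers (iovs were cleared above), and the ext
	 * I/O fields below are zeroed so the request never takes the
	 * memory-domain or accel-sequence paths. */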
bdev_io->u.bdev.memory_domain = NULL; 6196 bdev_io->u.bdev.memory_domain_ctx = NULL; 6197 bdev_io->u.bdev.accel_sequence = NULL; 6198 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6199 6200 bdev_io_submit(bdev_io); 6201 return 0; 6202 } 6203 6204 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6205 6206 static void 6207 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6208 { 6209 struct spdk_bdev_channel *ch = _ctx; 6210 struct spdk_bdev_io *bdev_io; 6211 6212 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6213 6214 if (status == -EBUSY) { 6215 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6216 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6217 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6218 } else { 6219 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6220 6221 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6222 /* If outstanding IOs are still present and reset_io_drain_timeout 6223 * seconds have passed, start the reset. */ 6224 bdev_io_submit_reset(bdev_io); 6225 } else { 6226 /* We still have an in-progress memory domain pull/push or we're 6227 * executing an accel sequence. Since we cannot abort either of those 6228 * operations, fail the reset request. */ 6229 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6230 } 6231 } 6232 } else { 6233 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6234 SPDK_DEBUGLOG(bdev, 6235 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6236 ch->bdev->name); 6237 /* Mark the completion status as SUCCESS and complete the reset. */ 6238 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6239 } 6240 } 6241 6242 static void 6243 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6244 struct spdk_io_channel *io_ch, void *_ctx) 6245 { 6246 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6247 int status = 0; 6248 6249 if (cur_ch->io_outstanding > 0 || 6250 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6251 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6252 /* If a channel has outstanding I/O, set the status to -EBUSY. This will stop 6253 * further iteration over the rest of the channels and pass the non-zero status 6254 * to the callback function. 
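 * bdev_reset_check_outstanding_io_done() reacts to the -EBUSY status by
 * re-arming the BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD poller while the
 * drain timeout has not yet expired, and by submitting or failing the
 * queued reset once it has.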
*/ 6255 status = -EBUSY; 6256 } 6257 spdk_bdev_for_each_channel_continue(i, status); 6258 } 6259 6260 static int 6261 bdev_reset_poll_for_outstanding_io(void *ctx) 6262 { 6263 struct spdk_bdev_channel *ch = ctx; 6264 struct spdk_bdev_io *bdev_io; 6265 6266 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6267 6268 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6269 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6270 bdev_reset_check_outstanding_io_done); 6271 6272 return SPDK_POLLER_BUSY; 6273 } 6274 6275 static void 6276 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6277 { 6278 struct spdk_bdev_channel *ch = _ctx; 6279 struct spdk_bdev_io *bdev_io; 6280 6281 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6282 6283 if (bdev->reset_io_drain_timeout == 0) { 6284 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6285 6286 bdev_io_submit_reset(bdev_io); 6287 return; 6288 } 6289 6290 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6291 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6292 6293 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6294 * submit the reset to the underlying module only if outstanding I/O 6295 * remain after reset_io_drain_timeout seconds have passed. */ 6296 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6297 bdev_reset_check_outstanding_io_done); 6298 } 6299 6300 static void 6301 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6302 struct spdk_io_channel *ch, void *_ctx) 6303 { 6304 struct spdk_bdev_channel *channel; 6305 struct spdk_bdev_mgmt_channel *mgmt_channel; 6306 struct spdk_bdev_shared_resource *shared_resource; 6307 bdev_io_tailq_t tmp_queued; 6308 6309 TAILQ_INIT(&tmp_queued); 6310 6311 channel = __io_ch_to_bdev_ch(ch); 6312 shared_resource = channel->shared_resource; 6313 mgmt_channel = shared_resource->mgmt_ch; 6314 6315 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6316 6317 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6318 /* The QoS object is always valid and readable while 6319 * the channel flag is set, so the lock here should not 6320 * be necessary. We're not in the fast path though, so 6321 * just take it anyway. */ 6322 spdk_spin_lock(&channel->bdev->internal.spinlock); 6323 if (channel->bdev->internal.qos->ch == channel) { 6324 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 6325 } 6326 spdk_spin_unlock(&channel->bdev->internal.spinlock); 6327 } 6328 6329 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6330 bdev_abort_all_buf_io(mgmt_channel, channel); 6331 bdev_abort_all_queued_io(&tmp_queued, channel); 6332 6333 spdk_bdev_for_each_channel_continue(i, 0); 6334 } 6335 6336 static void 6337 bdev_start_reset(void *ctx) 6338 { 6339 struct spdk_bdev_channel *ch = ctx; 6340 6341 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6342 bdev_reset_freeze_channel_done); 6343 } 6344 6345 static void 6346 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6347 { 6348 struct spdk_bdev *bdev = ch->bdev; 6349 6350 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6351 6352 spdk_spin_lock(&bdev->internal.spinlock); 6353 if (bdev->internal.reset_in_progress == NULL) { 6354 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6355 /* 6356 * Take a channel reference for the target bdev for the life of this 6357 * reset. 
This guards against the channel getting destroyed while 6358 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6359 * progress. We will release the reference when this reset is 6360 * completed. 6361 */ 6362 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6363 bdev_start_reset(ch); 6364 } 6365 spdk_spin_unlock(&bdev->internal.spinlock); 6366 } 6367 6368 int 6369 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6370 spdk_bdev_io_completion_cb cb, void *cb_arg) 6371 { 6372 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6373 struct spdk_bdev_io *bdev_io; 6374 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6375 6376 bdev_io = bdev_channel_get_io(channel); 6377 if (!bdev_io) { 6378 return -ENOMEM; 6379 } 6380 6381 bdev_io->internal.ch = channel; 6382 bdev_io->internal.desc = desc; 6383 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6384 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6385 bdev_io->u.reset.ch_ref = NULL; 6386 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6387 6388 spdk_spin_lock(&bdev->internal.spinlock); 6389 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6390 spdk_spin_unlock(&bdev->internal.spinlock); 6391 6392 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6393 internal.ch_link); 6394 6395 bdev_channel_start_reset(channel); 6396 6397 return 0; 6398 } 6399 6400 void 6401 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6402 struct spdk_bdev_io_stat *stat) 6403 { 6404 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6405 6406 bdev_get_io_stat(stat, channel->stat); 6407 } 6408 6409 static void 6410 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6411 { 6412 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6413 6414 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6415 bdev_iostat_ctx->cb_arg, 0); 6416 free(bdev_iostat_ctx); 6417 } 6418 6419 static void 6420 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6421 struct spdk_io_channel *ch, void *_ctx) 6422 { 6423 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6424 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6425 6426 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6427 spdk_bdev_for_each_channel_continue(i, 0); 6428 } 6429 6430 void 6431 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6432 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6433 { 6434 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6435 6436 assert(bdev != NULL); 6437 assert(stat != NULL); 6438 assert(cb != NULL); 6439 6440 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6441 if (bdev_iostat_ctx == NULL) { 6442 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6443 cb(bdev, stat, cb_arg, -ENOMEM); 6444 return; 6445 } 6446 6447 bdev_iostat_ctx->stat = stat; 6448 bdev_iostat_ctx->cb = cb; 6449 bdev_iostat_ctx->cb_arg = cb_arg; 6450 6451 /* Start with the statistics from previously deleted channels. */ 6452 spdk_spin_lock(&bdev->internal.spinlock); 6453 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6454 spdk_spin_unlock(&bdev->internal.spinlock); 6455 6456 /* Then iterate and add the statistics from each existing channel. 
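 * The per-channel accumulation runs asynchronously on each channel's
 * thread; the caller's cb is only invoked from bdev_get_device_stat_done()
 * after every existing channel has contributed its counters.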
*/ 6457 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6458 bdev_get_device_stat_done); 6459 } 6460 6461 struct bdev_iostat_reset_ctx { 6462 enum spdk_bdev_reset_stat_mode mode; 6463 bdev_reset_device_stat_cb cb; 6464 void *cb_arg; 6465 }; 6466 6467 static void 6468 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6469 { 6470 struct bdev_iostat_reset_ctx *ctx = _ctx; 6471 6472 ctx->cb(bdev, ctx->cb_arg, 0); 6473 6474 free(ctx); 6475 } 6476 6477 static void 6478 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6479 struct spdk_io_channel *ch, void *_ctx) 6480 { 6481 struct bdev_iostat_reset_ctx *ctx = _ctx; 6482 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6483 6484 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6485 6486 spdk_bdev_for_each_channel_continue(i, 0); 6487 } 6488 6489 void 6490 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6491 bdev_reset_device_stat_cb cb, void *cb_arg) 6492 { 6493 struct bdev_iostat_reset_ctx *ctx; 6494 6495 assert(bdev != NULL); 6496 assert(cb != NULL); 6497 6498 ctx = calloc(1, sizeof(*ctx)); 6499 if (ctx == NULL) { 6500 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6501 cb(bdev, cb_arg, -ENOMEM); 6502 return; 6503 } 6504 6505 ctx->mode = mode; 6506 ctx->cb = cb; 6507 ctx->cb_arg = cb_arg; 6508 6509 spdk_spin_lock(&bdev->internal.spinlock); 6510 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6511 spdk_spin_unlock(&bdev->internal.spinlock); 6512 6513 spdk_bdev_for_each_channel(bdev, 6514 bdev_reset_each_channel_stat, 6515 ctx, 6516 bdev_reset_device_stat_done); 6517 } 6518 6519 int 6520 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6521 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6522 spdk_bdev_io_completion_cb cb, void *cb_arg) 6523 { 6524 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6525 struct spdk_bdev_io *bdev_io; 6526 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6527 6528 if (!desc->write) { 6529 return -EBADF; 6530 } 6531 6532 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6533 return -ENOTSUP; 6534 } 6535 6536 bdev_io = bdev_channel_get_io(channel); 6537 if (!bdev_io) { 6538 return -ENOMEM; 6539 } 6540 6541 bdev_io->internal.ch = channel; 6542 bdev_io->internal.desc = desc; 6543 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6544 bdev_io->u.nvme_passthru.cmd = *cmd; 6545 bdev_io->u.nvme_passthru.buf = buf; 6546 bdev_io->u.nvme_passthru.nbytes = nbytes; 6547 bdev_io->u.nvme_passthru.md_buf = NULL; 6548 bdev_io->u.nvme_passthru.md_len = 0; 6549 6550 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6551 6552 bdev_io_submit(bdev_io); 6553 return 0; 6554 } 6555 6556 int 6557 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6558 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6559 spdk_bdev_io_completion_cb cb, void *cb_arg) 6560 { 6561 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6562 struct spdk_bdev_io *bdev_io; 6563 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6564 6565 if (!desc->write) { 6566 /* 6567 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6568 * to easily determine if the command is a read or write, but for now just 6569 * do not allow io_passthru with a read-only descriptor. 
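 * Callers that need NVMe passthru on such a bdev must open the
 * descriptor with write access instead (e.g. pass true for the "write"
 * argument of spdk_bdev_open_ext()).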
6570 */ 6571 return -EBADF; 6572 } 6573 6574 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6575 return -ENOTSUP; 6576 } 6577 6578 bdev_io = bdev_channel_get_io(channel); 6579 if (!bdev_io) { 6580 return -ENOMEM; 6581 } 6582 6583 bdev_io->internal.ch = channel; 6584 bdev_io->internal.desc = desc; 6585 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6586 bdev_io->u.nvme_passthru.cmd = *cmd; 6587 bdev_io->u.nvme_passthru.buf = buf; 6588 bdev_io->u.nvme_passthru.nbytes = nbytes; 6589 bdev_io->u.nvme_passthru.md_buf = NULL; 6590 bdev_io->u.nvme_passthru.md_len = 0; 6591 6592 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6593 6594 bdev_io_submit(bdev_io); 6595 return 0; 6596 } 6597 6598 int 6599 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6600 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6601 spdk_bdev_io_completion_cb cb, void *cb_arg) 6602 { 6603 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6604 struct spdk_bdev_io *bdev_io; 6605 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6606 6607 if (!desc->write) { 6608 /* 6609 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6610 * to easily determine if the command is a read or write, but for now just 6611 * do not allow io_passthru with a read-only descriptor. 6612 */ 6613 return -EBADF; 6614 } 6615 6616 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6617 return -ENOTSUP; 6618 } 6619 6620 bdev_io = bdev_channel_get_io(channel); 6621 if (!bdev_io) { 6622 return -ENOMEM; 6623 } 6624 6625 bdev_io->internal.ch = channel; 6626 bdev_io->internal.desc = desc; 6627 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6628 bdev_io->u.nvme_passthru.cmd = *cmd; 6629 bdev_io->u.nvme_passthru.buf = buf; 6630 bdev_io->u.nvme_passthru.nbytes = nbytes; 6631 bdev_io->u.nvme_passthru.md_buf = md_buf; 6632 bdev_io->u.nvme_passthru.md_len = md_len; 6633 6634 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6635 6636 bdev_io_submit(bdev_io); 6637 return 0; 6638 } 6639 6640 static void bdev_abort_retry(void *ctx); 6641 static void bdev_abort(struct spdk_bdev_io *parent_io); 6642 6643 static void 6644 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6645 { 6646 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6647 struct spdk_bdev_io *parent_io = cb_arg; 6648 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6649 6650 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6651 6652 spdk_bdev_free_io(bdev_io); 6653 6654 if (!success) { 6655 /* Check if the target I/O completed in the meantime. */ 6656 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6657 if (tmp_io == bio_to_abort) { 6658 break; 6659 } 6660 } 6661 6662 /* If the target I/O still exists, set the parent to failed. 
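 * In that case the abort itself failed while its target I/O is still
 * outstanding, so the parent spdk_bdev_abort() request must not report
 * success.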
*/ 6663 if (tmp_io != NULL) { 6664 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6665 } 6666 } 6667 6668 parent_io->u.bdev.split_outstanding--; 6669 if (parent_io->u.bdev.split_outstanding == 0) { 6670 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6671 bdev_abort_retry(parent_io); 6672 } else { 6673 bdev_io_complete(parent_io); 6674 } 6675 } 6676 } 6677 6678 static int 6679 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6680 struct spdk_bdev_io *bio_to_abort, 6681 spdk_bdev_io_completion_cb cb, void *cb_arg) 6682 { 6683 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6684 struct spdk_bdev_io *bdev_io; 6685 6686 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6687 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6688 /* TODO: Abort reset or abort request. */ 6689 return -ENOTSUP; 6690 } 6691 6692 bdev_io = bdev_channel_get_io(channel); 6693 if (bdev_io == NULL) { 6694 return -ENOMEM; 6695 } 6696 6697 bdev_io->internal.ch = channel; 6698 bdev_io->internal.desc = desc; 6699 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6700 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6701 6702 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6703 assert(bdev_io_should_split(bio_to_abort)); 6704 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6705 6706 /* Parent abort request is not submitted directly, but to manage its 6707 * execution add it to the submitted list here. 6708 */ 6709 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6710 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6711 6712 bdev_abort(bdev_io); 6713 6714 return 0; 6715 } 6716 6717 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6718 6719 /* Submit the abort request to the underlying bdev module. */ 6720 bdev_io_submit(bdev_io); 6721 6722 return 0; 6723 } 6724 6725 static bool 6726 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6727 { 6728 struct spdk_bdev_io *iter; 6729 6730 TAILQ_FOREACH(iter, tailq, internal.link) { 6731 if (iter == bdev_io) { 6732 return true; 6733 } 6734 } 6735 6736 return false; 6737 } 6738 6739 static uint32_t 6740 _bdev_abort(struct spdk_bdev_io *parent_io) 6741 { 6742 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6743 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6744 void *bio_cb_arg; 6745 struct spdk_bdev_io *bio_to_abort; 6746 uint32_t matched_ios; 6747 int rc; 6748 6749 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6750 6751 /* matched_ios is returned and will be kept by the caller. 6752 * 6753 * This function will be used for two cases, 1) the same cb_arg is used for 6754 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6755 * Incrementing split_outstanding directly here may confuse readers especially 6756 * for the 1st case. 6757 * 6758 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6759 * works as expected. 6760 */ 6761 matched_ios = 0; 6762 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6763 6764 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6765 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6766 continue; 6767 } 6768 6769 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6770 /* Any I/O which was submitted after this abort command should be excluded. 
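 * The submit_tsc comparison is what scopes the abort: only I/Os that were
 * already outstanding when the abort request itself was submitted are
 * targeted.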
*/ 6771 continue; 6772 } 6773 6774 /* We can't abort a request that's being pushed/pulled or executed by accel */ 6775 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 6776 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 6777 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6778 break; 6779 } 6780 6781 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6782 if (rc != 0) { 6783 if (rc == -ENOMEM) { 6784 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6785 } else { 6786 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6787 } 6788 break; 6789 } 6790 matched_ios++; 6791 } 6792 6793 return matched_ios; 6794 } 6795 6796 static void 6797 bdev_abort_retry(void *ctx) 6798 { 6799 struct spdk_bdev_io *parent_io = ctx; 6800 uint32_t matched_ios; 6801 6802 matched_ios = _bdev_abort(parent_io); 6803 6804 if (matched_ios == 0) { 6805 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6806 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6807 } else { 6808 /* For a retry, the case where no target I/O was found is a success 6809 * because it means the target I/Os completed in the meantime. 6810 */ 6811 bdev_io_complete(parent_io); 6812 } 6813 return; 6814 } 6815 6816 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6817 parent_io->u.bdev.split_outstanding = matched_ios; 6818 } 6819 6820 static void 6821 bdev_abort(struct spdk_bdev_io *parent_io) 6822 { 6823 uint32_t matched_ios; 6824 6825 matched_ios = _bdev_abort(parent_io); 6826 6827 if (matched_ios == 0) { 6828 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6829 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6830 } else { 6831 /* For the initial attempt, the case where no target I/O was found is a failure. */ 6832 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6833 bdev_io_complete(parent_io); 6834 } 6835 return; 6836 } 6837 6838 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6839 parent_io->u.bdev.split_outstanding = matched_ios; 6840 } 6841 6842 int 6843 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6844 void *bio_cb_arg, 6845 spdk_bdev_io_completion_cb cb, void *cb_arg) 6846 { 6847 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6848 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6849 struct spdk_bdev_io *bdev_io; 6850 6851 if (bio_cb_arg == NULL) { 6852 return -EINVAL; 6853 } 6854 6855 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6856 return -ENOTSUP; 6857 } 6858 6859 bdev_io = bdev_channel_get_io(channel); 6860 if (bdev_io == NULL) { 6861 return -ENOMEM; 6862 } 6863 6864 bdev_io->internal.ch = channel; 6865 bdev_io->internal.desc = desc; 6866 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6867 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6868 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6869 6870 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6871 6872 /* Parent abort request is not submitted directly, but to manage its execution, 6873 * add it to the submitted list here. 
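 * It is removed from that list again in bdev_io_complete() once all of the
 * matched child aborts have finished.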
6874 */ 6875 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6876 6877 bdev_abort(bdev_io); 6878 6879 return 0; 6880 } 6881 6882 int 6883 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6884 struct spdk_bdev_io_wait_entry *entry) 6885 { 6886 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6887 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6888 6889 if (bdev != entry->bdev) { 6890 SPDK_ERRLOG("bdevs do not match\n"); 6891 return -EINVAL; 6892 } 6893 6894 if (mgmt_ch->per_thread_cache_count > 0) { 6895 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6896 return -EINVAL; 6897 } 6898 6899 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6900 return 0; 6901 } 6902 6903 static inline void 6904 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6905 { 6906 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6907 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6908 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6909 uint32_t blocklen = bdev_io->bdev->blocklen; 6910 6911 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6912 switch (bdev_io->type) { 6913 case SPDK_BDEV_IO_TYPE_READ: 6914 io_stat->bytes_read += num_blocks * blocklen; 6915 io_stat->num_read_ops++; 6916 io_stat->read_latency_ticks += tsc_diff; 6917 if (io_stat->max_read_latency_ticks < tsc_diff) { 6918 io_stat->max_read_latency_ticks = tsc_diff; 6919 } 6920 if (io_stat->min_read_latency_ticks > tsc_diff) { 6921 io_stat->min_read_latency_ticks = tsc_diff; 6922 } 6923 break; 6924 case SPDK_BDEV_IO_TYPE_WRITE: 6925 io_stat->bytes_written += num_blocks * blocklen; 6926 io_stat->num_write_ops++; 6927 io_stat->write_latency_ticks += tsc_diff; 6928 if (io_stat->max_write_latency_ticks < tsc_diff) { 6929 io_stat->max_write_latency_ticks = tsc_diff; 6930 } 6931 if (io_stat->min_write_latency_ticks > tsc_diff) { 6932 io_stat->min_write_latency_ticks = tsc_diff; 6933 } 6934 break; 6935 case SPDK_BDEV_IO_TYPE_UNMAP: 6936 io_stat->bytes_unmapped += num_blocks * blocklen; 6937 io_stat->num_unmap_ops++; 6938 io_stat->unmap_latency_ticks += tsc_diff; 6939 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6940 io_stat->max_unmap_latency_ticks = tsc_diff; 6941 } 6942 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6943 io_stat->min_unmap_latency_ticks = tsc_diff; 6944 } 6945 break; 6946 case SPDK_BDEV_IO_TYPE_ZCOPY: 6947 /* Track the data in the start phase only */ 6948 if (bdev_io->u.bdev.zcopy.start) { 6949 if (bdev_io->u.bdev.zcopy.populate) { 6950 io_stat->bytes_read += num_blocks * blocklen; 6951 io_stat->num_read_ops++; 6952 io_stat->read_latency_ticks += tsc_diff; 6953 if (io_stat->max_read_latency_ticks < tsc_diff) { 6954 io_stat->max_read_latency_ticks = tsc_diff; 6955 } 6956 if (io_stat->min_read_latency_ticks > tsc_diff) { 6957 io_stat->min_read_latency_ticks = tsc_diff; 6958 } 6959 } else { 6960 io_stat->bytes_written += num_blocks * blocklen; 6961 io_stat->num_write_ops++; 6962 io_stat->write_latency_ticks += tsc_diff; 6963 if (io_stat->max_write_latency_ticks < tsc_diff) { 6964 io_stat->max_write_latency_ticks = tsc_diff; 6965 } 6966 if (io_stat->min_write_latency_ticks > tsc_diff) { 6967 io_stat->min_write_latency_ticks = tsc_diff; 6968 } 6969 } 6970 } 6971 break; 6972 case SPDK_BDEV_IO_TYPE_COPY: 6973 io_stat->bytes_copied += num_blocks * blocklen; 6974 io_stat->num_copy_ops++; 6975 bdev_io->internal.ch->stat->copy_latency_ticks += 
tsc_diff; 6976 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6977 io_stat->max_copy_latency_ticks = tsc_diff; 6978 } 6979 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6980 io_stat->min_copy_latency_ticks = tsc_diff; 6981 } 6982 break; 6983 default: 6984 break; 6985 } 6986 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6987 io_stat = bdev_io->bdev->internal.stat; 6988 assert(io_stat->io_error != NULL); 6989 6990 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6991 io_stat->io_error->error_status[-io_status - 1]++; 6992 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6993 } 6994 6995 #ifdef SPDK_CONFIG_VTUNE 6996 uint64_t now_tsc = spdk_get_ticks(); 6997 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6998 uint64_t data[5]; 6999 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7000 7001 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7002 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7003 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7004 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7005 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 7006 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7007 7008 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7009 __itt_metadata_u64, 5, data); 7010 7011 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7012 bdev_io->internal.ch->start_tsc = now_tsc; 7013 } 7014 #endif 7015 } 7016 7017 static inline void 7018 _bdev_io_complete(void *ctx) 7019 { 7020 struct spdk_bdev_io *bdev_io = ctx; 7021 7022 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 7023 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7024 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7025 } 7026 7027 assert(bdev_io->internal.cb != NULL); 7028 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7029 7030 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7031 bdev_io->internal.caller_ctx); 7032 } 7033 7034 static inline void 7035 bdev_io_complete(void *ctx) 7036 { 7037 struct spdk_bdev_io *bdev_io = ctx; 7038 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7039 uint64_t tsc, tsc_diff; 7040 7041 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 7042 /* 7043 * Defer completion to avoid potential infinite recursion if the 7044 * user's completion callback issues a new I/O. 7045 */ 7046 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7047 bdev_io_complete, bdev_io); 7048 return; 7049 } 7050 7051 tsc = spdk_get_ticks(); 7052 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7053 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 7054 bdev_io->internal.caller_ctx); 7055 7056 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 7057 7058 if (bdev_io->internal.ch->histogram) { 7059 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 7060 } 7061 7062 bdev_io_update_io_stat(bdev_io, tsc_diff); 7063 _bdev_io_complete(bdev_io); 7064 } 7065 7066 /* The difference between this function and bdev_io_complete() is that this should be called to 7067 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7068 * io_submitted list and don't have submit_tsc updated. 
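 * As a consequence it also skips the trace record, the histogram tally and
 * the per-channel I/O statistics update that bdev_io_complete() performs.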
7069 */ 7070 static inline void 7071 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7072 { 7073 /* Since the IO hasn't been submitted it's bound to be failed */ 7074 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7075 7076 /* At this point we don't know if the IO is completed from submission context or not, but, 7077 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7078 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7079 _bdev_io_complete, bdev_io); 7080 } 7081 7082 static void bdev_destroy_cb(void *io_device); 7083 7084 static void 7085 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7086 { 7087 struct spdk_bdev_io *bdev_io = _ctx; 7088 7089 if (bdev_io->u.reset.ch_ref != NULL) { 7090 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7091 bdev_io->u.reset.ch_ref = NULL; 7092 } 7093 7094 bdev_io_complete(bdev_io); 7095 7096 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7097 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7098 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7099 } 7100 } 7101 7102 static void 7103 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7104 struct spdk_io_channel *_ch, void *_ctx) 7105 { 7106 struct spdk_bdev_io *bdev_io = _ctx; 7107 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7108 struct spdk_bdev_io *queued_reset; 7109 7110 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7111 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7112 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7113 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7114 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7115 } 7116 7117 spdk_bdev_for_each_channel_continue(i, 0); 7118 } 7119 7120 static void 7121 bdev_io_complete_sequence_cb(void *ctx, int status) 7122 { 7123 struct spdk_bdev_io *bdev_io = ctx; 7124 7125 /* u.bdev.accel_sequence should have already been cleared at this point */ 7126 assert(bdev_io->u.bdev.accel_sequence == NULL); 7127 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7128 bdev_io->internal.accel_sequence = NULL; 7129 7130 if (spdk_unlikely(status != 0)) { 7131 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7132 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7133 } 7134 7135 bdev_io_complete(bdev_io); 7136 } 7137 7138 void 7139 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7140 { 7141 struct spdk_bdev *bdev = bdev_io->bdev; 7142 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7143 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7144 7145 if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) { 7146 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7147 spdk_bdev_get_module_name(bdev), 7148 bdev_io_status_get_string(bdev_io->internal.status)); 7149 assert(false); 7150 } 7151 bdev_io->internal.status = status; 7152 7153 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7154 bool unlock_channels = false; 7155 7156 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7157 SPDK_ERRLOG("NOMEM returned for reset\n"); 7158 } 7159 spdk_spin_lock(&bdev->internal.spinlock); 7160 if (bdev_io == bdev->internal.reset_in_progress) { 7161 bdev->internal.reset_in_progress = NULL; 7162 unlock_channels = true; 7163 } 7164 spdk_spin_unlock(&bdev->internal.spinlock); 7165 7166 if (unlock_channels) { 7167 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 
7168 bdev_reset_complete); 7169 return; 7170 } 7171 } else { 7172 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7173 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7174 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7175 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7176 return; 7177 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0 && 7178 !bdev_io_use_accel_sequence(bdev_io))) { 7179 _bdev_io_push_bounce_data_buffer(bdev_io, 7180 _bdev_io_complete_push_bounce_done); 7181 /* bdev IO will be completed in the callback */ 7182 return; 7183 } 7184 } 7185 7186 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7187 return; 7188 } 7189 } 7190 7191 bdev_io_complete(bdev_io); 7192 } 7193 7194 void 7195 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7196 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7197 { 7198 enum spdk_bdev_io_status status; 7199 7200 if (sc == SPDK_SCSI_STATUS_GOOD) { 7201 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7202 } else { 7203 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7204 bdev_io->internal.error.scsi.sc = sc; 7205 bdev_io->internal.error.scsi.sk = sk; 7206 bdev_io->internal.error.scsi.asc = asc; 7207 bdev_io->internal.error.scsi.ascq = ascq; 7208 } 7209 7210 spdk_bdev_io_complete(bdev_io, status); 7211 } 7212 7213 void 7214 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7215 int *sc, int *sk, int *asc, int *ascq) 7216 { 7217 assert(sc != NULL); 7218 assert(sk != NULL); 7219 assert(asc != NULL); 7220 assert(ascq != NULL); 7221 7222 switch (bdev_io->internal.status) { 7223 case SPDK_BDEV_IO_STATUS_SUCCESS: 7224 *sc = SPDK_SCSI_STATUS_GOOD; 7225 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7226 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7227 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7228 break; 7229 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7230 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7231 break; 7232 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7233 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7234 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7235 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7236 *ascq = bdev_io->internal.error.scsi.ascq; 7237 break; 7238 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7239 *sc = bdev_io->internal.error.scsi.sc; 7240 *sk = bdev_io->internal.error.scsi.sk; 7241 *asc = bdev_io->internal.error.scsi.asc; 7242 *ascq = bdev_io->internal.error.scsi.ascq; 7243 break; 7244 default: 7245 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7246 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7247 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7248 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7249 break; 7250 } 7251 } 7252 7253 void 7254 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7255 { 7256 enum spdk_bdev_io_status status; 7257 7258 if (aio_result == 0) { 7259 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7260 } else { 7261 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7262 } 7263 7264 bdev_io->internal.error.aio_result = aio_result; 7265 7266 spdk_bdev_io_complete(bdev_io, status); 7267 } 7268 7269 void 7270 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7271 { 7272 assert(aio_result != NULL); 7273 7274 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7275 *aio_result = bdev_io->internal.error.aio_result; 7276 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7277 *aio_result = 0; 7278 } else { 7279 *aio_result = -EIO; 7280 } 7281 } 7282 7283 void 
7284 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7285 { 7286 enum spdk_bdev_io_status status; 7287 7288 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 7289 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7290 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7291 status = SPDK_BDEV_IO_STATUS_ABORTED; 7292 } else { 7293 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7294 } 7295 7296 bdev_io->internal.error.nvme.cdw0 = cdw0; 7297 bdev_io->internal.error.nvme.sct = sct; 7298 bdev_io->internal.error.nvme.sc = sc; 7299 7300 spdk_bdev_io_complete(bdev_io, status); 7301 } 7302 7303 void 7304 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7305 { 7306 assert(sct != NULL); 7307 assert(sc != NULL); 7308 assert(cdw0 != NULL); 7309 7310 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7311 *sct = SPDK_NVME_SCT_GENERIC; 7312 *sc = SPDK_NVME_SC_SUCCESS; 7313 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7314 *cdw0 = 0; 7315 } else { 7316 *cdw0 = 1U; 7317 } 7318 return; 7319 } 7320 7321 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7322 *sct = bdev_io->internal.error.nvme.sct; 7323 *sc = bdev_io->internal.error.nvme.sc; 7324 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7325 *sct = SPDK_NVME_SCT_GENERIC; 7326 *sc = SPDK_NVME_SC_SUCCESS; 7327 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7328 *sct = SPDK_NVME_SCT_GENERIC; 7329 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7330 } else { 7331 *sct = SPDK_NVME_SCT_GENERIC; 7332 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7333 } 7334 7335 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7336 } 7337 7338 void 7339 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7340 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7341 { 7342 assert(first_sct != NULL); 7343 assert(first_sc != NULL); 7344 assert(second_sct != NULL); 7345 assert(second_sc != NULL); 7346 assert(cdw0 != NULL); 7347 7348 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7349 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7350 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7351 *first_sct = bdev_io->internal.error.nvme.sct; 7352 *first_sc = bdev_io->internal.error.nvme.sc; 7353 *second_sct = SPDK_NVME_SCT_GENERIC; 7354 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7355 } else { 7356 *first_sct = SPDK_NVME_SCT_GENERIC; 7357 *first_sc = SPDK_NVME_SC_SUCCESS; 7358 *second_sct = bdev_io->internal.error.nvme.sct; 7359 *second_sc = bdev_io->internal.error.nvme.sc; 7360 } 7361 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7362 *first_sct = SPDK_NVME_SCT_GENERIC; 7363 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7364 *second_sct = SPDK_NVME_SCT_GENERIC; 7365 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7366 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7367 *first_sct = SPDK_NVME_SCT_GENERIC; 7368 *first_sc = SPDK_NVME_SC_SUCCESS; 7369 *second_sct = SPDK_NVME_SCT_GENERIC; 7370 *second_sc = SPDK_NVME_SC_SUCCESS; 7371 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7372 *first_sct = SPDK_NVME_SCT_GENERIC; 7373 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7374 *second_sct = SPDK_NVME_SCT_GENERIC; 7375 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7376 } else if (bdev_io->internal.status == 
SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7377 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7378 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7379 *second_sct = SPDK_NVME_SCT_GENERIC; 7380 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7381 } else { 7382 *first_sct = SPDK_NVME_SCT_GENERIC; 7383 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7384 *second_sct = SPDK_NVME_SCT_GENERIC; 7385 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7386 } 7387 7388 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7389 } 7390 7391 struct spdk_thread * 7392 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7393 { 7394 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7395 } 7396 7397 struct spdk_io_channel * 7398 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7399 { 7400 return bdev_io->internal.ch->channel; 7401 } 7402 7403 static int 7404 bdev_register(struct spdk_bdev *bdev) 7405 { 7406 char *bdev_name; 7407 char uuid[SPDK_UUID_STRING_LEN]; 7408 struct spdk_iobuf_opts iobuf_opts; 7409 int ret, i; 7410 7411 assert(bdev->module != NULL); 7412 7413 if (!bdev->name) { 7414 SPDK_ERRLOG("Bdev name is NULL\n"); 7415 return -EINVAL; 7416 } 7417 7418 if (!strlen(bdev->name)) { 7419 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7420 return -EINVAL; 7421 } 7422 7423 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7424 if (bdev->fn_table->accel_sequence_supported == NULL) { 7425 continue; 7426 } 7427 if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7428 (enum spdk_bdev_io_type)i)) { 7429 continue; 7430 } 7431 7432 if (spdk_bdev_is_md_separate(bdev)) { 7433 SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with " 7434 "accel sequence support\n"); 7435 return -EINVAL; 7436 } 7437 } 7438 7439 /* Users often register their own I/O devices using the bdev name. In 7440 * order to avoid conflicts, prepend bdev_. */ 7441 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7442 if (!bdev_name) { 7443 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7444 return -ENOMEM; 7445 } 7446 7447 bdev->internal.stat = bdev_alloc_io_stat(true); 7448 if (!bdev->internal.stat) { 7449 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7450 free(bdev_name); 7451 return -ENOMEM; 7452 } 7453 7454 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7455 bdev->internal.measured_queue_depth = UINT64_MAX; 7456 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7457 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7458 bdev->internal.qd_poller = NULL; 7459 bdev->internal.qos = NULL; 7460 7461 TAILQ_INIT(&bdev->internal.open_descs); 7462 TAILQ_INIT(&bdev->internal.locked_ranges); 7463 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7464 TAILQ_INIT(&bdev->aliases); 7465 7466 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7467 if (ret != 0) { 7468 bdev_free_io_stat(bdev->internal.stat); 7469 free(bdev_name); 7470 return ret; 7471 } 7472 7473 /* UUID may be specified by the user or defined by bdev itself. 7474 * Otherwise it will be generated here, so this field will never be empty. 
*/ 7475 if (spdk_uuid_is_null(&bdev->uuid)) { 7476 spdk_uuid_generate(&bdev->uuid); 7477 } 7478 7479 /* Add the UUID alias only if it's different than the name */ 7480 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7481 if (strcmp(bdev->name, uuid) != 0) { 7482 ret = spdk_bdev_alias_add(bdev, uuid); 7483 if (ret != 0) { 7484 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7485 bdev_name_del(&bdev->internal.bdev_name); 7486 bdev_free_io_stat(bdev->internal.stat); 7487 free(bdev_name); 7488 return ret; 7489 } 7490 } 7491 7492 if (spdk_bdev_get_buf_align(bdev) > 1) { 7493 if (bdev->split_on_optimal_io_boundary) { 7494 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 7495 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 7496 } else { 7497 bdev->split_on_optimal_io_boundary = true; 7498 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 7499 } 7500 } 7501 7502 /* If the user didn't specify a write unit size, set it to one. */ 7503 if (bdev->write_unit_size == 0) { 7504 bdev->write_unit_size = 1; 7505 } 7506 7507 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7508 if (bdev->acwu == 0) { 7509 bdev->acwu = bdev->write_unit_size; 7510 } 7511 7512 if (bdev->phys_blocklen == 0) { 7513 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7514 } 7515 7516 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7517 spdk_iobuf_get_opts(&iobuf_opts); 7518 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7519 } 7520 7521 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7522 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7523 } 7524 7525 bdev->internal.reset_in_progress = NULL; 7526 bdev->internal.qd_poll_in_progress = false; 7527 bdev->internal.period = 0; 7528 bdev->internal.new_period = 0; 7529 7530 spdk_io_device_register(__bdev_to_io_dev(bdev), 7531 bdev_channel_create, bdev_channel_destroy, 7532 sizeof(struct spdk_bdev_channel), 7533 bdev_name); 7534 7535 free(bdev_name); 7536 7537 spdk_spin_init(&bdev->internal.spinlock); 7538 7539 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7540 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7541 7542 return 0; 7543 } 7544 7545 static void 7546 bdev_destroy_cb(void *io_device) 7547 { 7548 int rc; 7549 struct spdk_bdev *bdev; 7550 spdk_bdev_unregister_cb cb_fn; 7551 void *cb_arg; 7552 7553 bdev = __bdev_from_io_dev(io_device); 7554 7555 if (bdev->internal.unregister_td != spdk_get_thread()) { 7556 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7557 return; 7558 } 7559 7560 cb_fn = bdev->internal.unregister_cb; 7561 cb_arg = bdev->internal.unregister_ctx; 7562 7563 spdk_spin_destroy(&bdev->internal.spinlock); 7564 free(bdev->internal.qos); 7565 bdev_free_io_stat(bdev->internal.stat); 7566 7567 rc = bdev->fn_table->destruct(bdev->ctxt); 7568 if (rc < 0) { 7569 SPDK_ERRLOG("destruct failed\n"); 7570 } 7571 if (rc <= 0 && cb_fn != NULL) { 7572 cb_fn(cb_arg, rc); 7573 } 7574 } 7575 7576 void 7577 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7578 { 7579 if (bdev->internal.unregister_cb != NULL) { 7580 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7581 } 7582 } 7583 7584 static void 7585 _remove_notify(void *arg) 7586 { 7587 struct spdk_bdev_desc *desc = arg; 7588 7589 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7590 } 7591 7592 /* returns: 0 - bdev removed and 
ready to be destructed. 7593 * -EBUSY - bdev can't be destructed yet. */ 7594 static int 7595 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7596 { 7597 struct spdk_bdev_desc *desc, *tmp; 7598 int rc = 0; 7599 char uuid[SPDK_UUID_STRING_LEN]; 7600 7601 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7602 assert(spdk_spin_held(&bdev->internal.spinlock)); 7603 7604 /* Notify each descriptor about hotremoval */ 7605 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7606 rc = -EBUSY; 7607 /* 7608 * Defer invocation of the event_cb to a separate message that will 7609 * run later on its thread. This ensures this context unwinds and 7610 * we don't recursively unregister this bdev again if the event_cb 7611 * immediately closes its descriptor. 7612 */ 7613 event_notify(desc, _remove_notify); 7614 } 7615 7616 /* If there are no descriptors, proceed removing the bdev */ 7617 if (rc == 0) { 7618 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7619 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7620 7621 /* Delete the name and the UUID alias */ 7622 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7623 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7624 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7625 7626 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7627 7628 if (bdev->internal.reset_in_progress != NULL) { 7629 /* If reset is in progress, let the completion callback for reset 7630 * unregister the bdev. 7631 */ 7632 rc = -EBUSY; 7633 } 7634 } 7635 7636 return rc; 7637 } 7638 7639 static void 7640 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7641 struct spdk_io_channel *io_ch, void *_ctx) 7642 { 7643 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7644 7645 bdev_channel_abort_queued_ios(bdev_ch); 7646 spdk_bdev_for_each_channel_continue(i, 0); 7647 } 7648 7649 static void 7650 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7651 { 7652 int rc; 7653 7654 spdk_spin_lock(&g_bdev_mgr.spinlock); 7655 spdk_spin_lock(&bdev->internal.spinlock); 7656 /* 7657 * Set the status to REMOVING after completing to abort channels. Otherwise, 7658 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7659 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7660 * may fail. 7661 */ 7662 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7663 rc = bdev_unregister_unsafe(bdev); 7664 spdk_spin_unlock(&bdev->internal.spinlock); 7665 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7666 7667 if (rc == 0) { 7668 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7669 } 7670 } 7671 7672 void 7673 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7674 { 7675 struct spdk_thread *thread; 7676 7677 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7678 7679 thread = spdk_get_thread(); 7680 if (!thread) { 7681 /* The user called this from a non-SPDK thread. 
*/ 7682 if (cb_fn != NULL) { 7683 cb_fn(cb_arg, -ENOTSUP); 7684 } 7685 return; 7686 } 7687 7688 spdk_spin_lock(&g_bdev_mgr.spinlock); 7689 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7690 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7691 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7692 if (cb_fn) { 7693 cb_fn(cb_arg, -EBUSY); 7694 } 7695 return; 7696 } 7697 7698 spdk_spin_lock(&bdev->internal.spinlock); 7699 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7700 bdev->internal.unregister_cb = cb_fn; 7701 bdev->internal.unregister_ctx = cb_arg; 7702 bdev->internal.unregister_td = thread; 7703 spdk_spin_unlock(&bdev->internal.spinlock); 7704 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7705 7706 spdk_bdev_set_qd_sampling_period(bdev, 0); 7707 7708 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7709 bdev_unregister); 7710 } 7711 7712 int 7713 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7714 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7715 { 7716 struct spdk_bdev_desc *desc; 7717 struct spdk_bdev *bdev; 7718 int rc; 7719 7720 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7721 if (rc != 0) { 7722 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7723 return rc; 7724 } 7725 7726 bdev = spdk_bdev_desc_get_bdev(desc); 7727 7728 if (bdev->module != module) { 7729 spdk_bdev_close(desc); 7730 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7731 bdev_name); 7732 return -ENODEV; 7733 } 7734 7735 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7736 7737 spdk_bdev_close(desc); 7738 7739 return 0; 7740 } 7741 7742 static int 7743 bdev_start_qos(struct spdk_bdev *bdev) 7744 { 7745 struct set_qos_limit_ctx *ctx; 7746 7747 /* Enable QoS */ 7748 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7749 ctx = calloc(1, sizeof(*ctx)); 7750 if (ctx == NULL) { 7751 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7752 return -ENOMEM; 7753 } 7754 ctx->bdev = bdev; 7755 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7756 } 7757 7758 return 0; 7759 } 7760 7761 static void 7762 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7763 struct spdk_bdev *bdev) 7764 { 7765 enum spdk_bdev_claim_type type; 7766 const char *typename, *modname; 7767 extern struct spdk_log_flag SPDK_LOG_bdev; 7768 7769 assert(spdk_spin_held(&bdev->internal.spinlock)); 7770 7771 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7772 return; 7773 } 7774 7775 type = bdev->internal.claim_type; 7776 typename = spdk_bdev_claim_get_name(type); 7777 7778 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7779 modname = bdev->internal.claim.v1.module->name; 7780 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7781 bdev->name, detail, typename, modname); 7782 return; 7783 } 7784 7785 if (claim_type_is_v2(type)) { 7786 struct spdk_bdev_module_claim *claim; 7787 7788 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7789 modname = claim->module->name; 7790 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7791 bdev->name, detail, typename, modname); 7792 } 7793 return; 7794 } 7795 7796 assert(false); 7797 } 7798 7799 static int 7800 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7801 { 7802 struct spdk_thread *thread; 7803 int rc = 0; 7804 7805 thread = spdk_get_thread(); 7806 if (!thread) { 7807 
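	/*
	 * Descriptors are bound to the SPDK thread that opens them (desc->thread is
	 * recorded below) and spdk_bdev_close() must be called from that same thread,
	 * so an open attempted from a non-SPDK thread is rejected up front.
	 */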
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7808 return -ENOTSUP; 7809 } 7810 7811 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7812 spdk_get_thread()); 7813 7814 desc->bdev = bdev; 7815 desc->thread = thread; 7816 desc->write = write; 7817 7818 spdk_spin_lock(&bdev->internal.spinlock); 7819 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7820 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7821 spdk_spin_unlock(&bdev->internal.spinlock); 7822 return -ENODEV; 7823 } 7824 7825 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7826 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7827 spdk_spin_unlock(&bdev->internal.spinlock); 7828 return -EPERM; 7829 } 7830 7831 rc = bdev_start_qos(bdev); 7832 if (rc != 0) { 7833 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7834 spdk_spin_unlock(&bdev->internal.spinlock); 7835 return rc; 7836 } 7837 7838 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7839 7840 spdk_spin_unlock(&bdev->internal.spinlock); 7841 7842 return 0; 7843 } 7844 7845 static int 7846 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7847 struct spdk_bdev_desc **_desc) 7848 { 7849 struct spdk_bdev_desc *desc; 7850 unsigned int i; 7851 7852 desc = calloc(1, sizeof(*desc)); 7853 if (desc == NULL) { 7854 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7855 return -ENOMEM; 7856 } 7857 7858 TAILQ_INIT(&desc->pending_media_events); 7859 TAILQ_INIT(&desc->free_media_events); 7860 7861 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7862 desc->callback.event_fn = event_cb; 7863 desc->callback.ctx = event_ctx; 7864 spdk_spin_init(&desc->spinlock); 7865 7866 if (bdev->media_events) { 7867 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7868 sizeof(*desc->media_events_buffer)); 7869 if (desc->media_events_buffer == NULL) { 7870 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7871 bdev_desc_free(desc); 7872 return -ENOMEM; 7873 } 7874 7875 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 7876 TAILQ_INSERT_TAIL(&desc->free_media_events, 7877 &desc->media_events_buffer[i], tailq); 7878 } 7879 } 7880 7881 if (bdev->fn_table->accel_sequence_supported != NULL) { 7882 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7883 desc->accel_sequence_supported[i] = 7884 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7885 (enum spdk_bdev_io_type)i); 7886 } 7887 } 7888 7889 *_desc = desc; 7890 7891 return 0; 7892 } 7893 7894 static int 7895 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7896 void *event_ctx, struct spdk_bdev_desc **_desc) 7897 { 7898 struct spdk_bdev_desc *desc; 7899 struct spdk_bdev *bdev; 7900 int rc; 7901 7902 bdev = bdev_get_by_name(bdev_name); 7903 7904 if (bdev == NULL) { 7905 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7906 return -ENODEV; 7907 } 7908 7909 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7910 if (rc != 0) { 7911 return rc; 7912 } 7913 7914 rc = bdev_open(bdev, write, desc); 7915 if (rc != 0) { 7916 bdev_desc_free(desc); 7917 desc = NULL; 7918 } 7919 7920 *_desc = desc; 7921 7922 return rc; 7923 } 7924 7925 int 7926 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7927 void *event_ctx, struct spdk_bdev_desc **_desc) 7928 { 7929 int rc; 7930 7931 if (event_cb == NULL) { 7932 SPDK_ERRLOG("Missing event callback function\n"); 7933 return 
-EINVAL; 7934 } 7935 7936 spdk_spin_lock(&g_bdev_mgr.spinlock); 7937 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 7938 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7939 7940 return rc; 7941 } 7942 7943 struct spdk_bdev_open_async_ctx { 7944 char *bdev_name; 7945 spdk_bdev_event_cb_t event_cb; 7946 void *event_ctx; 7947 bool write; 7948 int rc; 7949 spdk_bdev_open_async_cb_t cb_fn; 7950 void *cb_arg; 7951 struct spdk_bdev_desc *desc; 7952 struct spdk_bdev_open_async_opts opts; 7953 uint64_t start_ticks; 7954 struct spdk_thread *orig_thread; 7955 struct spdk_poller *poller; 7956 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 7957 }; 7958 7959 static void 7960 bdev_open_async_done(void *arg) 7961 { 7962 struct spdk_bdev_open_async_ctx *ctx = arg; 7963 7964 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 7965 7966 free(ctx->bdev_name); 7967 free(ctx); 7968 } 7969 7970 static void 7971 bdev_open_async_cancel(void *arg) 7972 { 7973 struct spdk_bdev_open_async_ctx *ctx = arg; 7974 7975 assert(ctx->rc == -ESHUTDOWN); 7976 7977 spdk_poller_unregister(&ctx->poller); 7978 7979 bdev_open_async_done(ctx); 7980 } 7981 7982 /* This is called when the bdev library finishes at shutdown. */ 7983 static void 7984 bdev_open_async_fini(void) 7985 { 7986 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 7987 7988 spdk_spin_lock(&g_bdev_mgr.spinlock); 7989 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 7990 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 7991 /* 7992 * We have to move to ctx->orig_thread to unregister ctx->poller. 7993 * However, there is a chance that ctx->poller is executed before 7994 * message is executed, which could result in bdev_open_async_done() 7995 * being called twice. To avoid such race condition, set ctx->rc to 7996 * -ESHUTDOWN. 7997 */ 7998 ctx->rc = -ESHUTDOWN; 7999 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8000 } 8001 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8002 } 8003 8004 static int bdev_open_async(void *arg); 8005 8006 static void 8007 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8008 { 8009 uint64_t timeout_ticks; 8010 8011 if (ctx->rc == -ESHUTDOWN) { 8012 /* This context is being canceled. Do nothing. */ 8013 return; 8014 } 8015 8016 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8017 &ctx->desc); 8018 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8019 goto exit; 8020 } 8021 8022 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8023 if (spdk_get_ticks() >= timeout_ticks) { 8024 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8025 ctx->rc = -ETIMEDOUT; 8026 goto exit; 8027 } 8028 8029 return; 8030 8031 exit: 8032 spdk_poller_unregister(&ctx->poller); 8033 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8034 8035 /* Completion callback is processed after stack unwinding. 
*/ 8036 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8037 } 8038 8039 static int 8040 bdev_open_async(void *arg) 8041 { 8042 struct spdk_bdev_open_async_ctx *ctx = arg; 8043 8044 spdk_spin_lock(&g_bdev_mgr.spinlock); 8045 8046 _bdev_open_async(ctx); 8047 8048 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8049 8050 return SPDK_POLLER_BUSY; 8051 } 8052 8053 static void 8054 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8055 struct spdk_bdev_open_async_opts *opts_src, 8056 size_t size) 8057 { 8058 assert(opts); 8059 assert(opts_src); 8060 8061 opts->size = size; 8062 8063 #define SET_FIELD(field) \ 8064 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8065 opts->field = opts_src->field; \ 8066 } \ 8067 8068 SET_FIELD(timeout_ms); 8069 8070 /* Do not remove this statement, you should always update this statement when you adding a new field, 8071 * and do not forget to add the SET_FIELD statement for your added field. */ 8072 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8073 8074 #undef SET_FIELD 8075 } 8076 8077 static void 8078 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8079 { 8080 assert(opts); 8081 8082 opts->size = size; 8083 8084 #define SET_FIELD(field, value) \ 8085 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8086 opts->field = value; \ 8087 } \ 8088 8089 SET_FIELD(timeout_ms, 0); 8090 8091 #undef SET_FIELD 8092 } 8093 8094 int 8095 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8096 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8097 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8098 { 8099 struct spdk_bdev_open_async_ctx *ctx; 8100 8101 if (event_cb == NULL) { 8102 SPDK_ERRLOG("Missing event callback function\n"); 8103 return -EINVAL; 8104 } 8105 8106 if (open_cb == NULL) { 8107 SPDK_ERRLOG("Missing open callback function\n"); 8108 return -EINVAL; 8109 } 8110 8111 if (opts != NULL && opts->size == 0) { 8112 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8113 return -EINVAL; 8114 } 8115 8116 ctx = calloc(1, sizeof(*ctx)); 8117 if (ctx == NULL) { 8118 SPDK_ERRLOG("Failed to allocate open context\n"); 8119 return -ENOMEM; 8120 } 8121 8122 ctx->bdev_name = strdup(bdev_name); 8123 if (ctx->bdev_name == NULL) { 8124 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8125 free(ctx); 8126 return -ENOMEM; 8127 } 8128 8129 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8130 if (ctx->poller == NULL) { 8131 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8132 free(ctx->bdev_name); 8133 free(ctx); 8134 return -ENOMEM; 8135 } 8136 8137 ctx->cb_fn = open_cb; 8138 ctx->cb_arg = open_cb_arg; 8139 ctx->write = write; 8140 ctx->event_cb = event_cb; 8141 ctx->event_ctx = event_ctx; 8142 ctx->orig_thread = spdk_get_thread(); 8143 ctx->start_ticks = spdk_get_ticks(); 8144 8145 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8146 if (opts != NULL) { 8147 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8148 } 8149 8150 spdk_spin_lock(&g_bdev_mgr.spinlock); 8151 8152 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8153 _bdev_open_async(ctx); 8154 8155 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8156 8157 return 0; 8158 } 8159 8160 static void 8161 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8162 { 8163 int rc; 8164 8165 
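	/*
	 * bdev_close() is always invoked with g_bdev_mgr.spinlock held by the caller.
	 * It removes the descriptor from the bdev's open_descs list, tears down the
	 * QoS channel when the last descriptor is closed, and completes a deferred
	 * unregister if the bdev is already in the REMOVING state.
	 *
	 * Illustrative caller-side sketch (not part of this file; the bdev name and
	 * my_event_cb callback are hypothetical):
	 *
	 *	struct spdk_bdev_desc *desc;
	 *
	 *	if (spdk_bdev_open_ext("Malloc0", false, my_event_cb, NULL, &desc) == 0) {
	 *		struct spdk_io_channel *ch = spdk_bdev_get_io_channel(desc);
	 *		... submit I/O on ch, then spdk_put_io_channel(ch) ...
	 *		spdk_bdev_close(desc);
	 *	}
	 */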
spdk_spin_lock(&bdev->internal.spinlock); 8166 spdk_spin_lock(&desc->spinlock); 8167 8168 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8169 8170 desc->closed = true; 8171 8172 if (desc->claim != NULL) { 8173 bdev_desc_release_claims(desc); 8174 } 8175 8176 if (0 == desc->refs) { 8177 spdk_spin_unlock(&desc->spinlock); 8178 bdev_desc_free(desc); 8179 } else { 8180 spdk_spin_unlock(&desc->spinlock); 8181 } 8182 8183 /* If no more descriptors, kill QoS channel */ 8184 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8185 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8186 bdev->name, spdk_get_thread()); 8187 8188 if (bdev_qos_destroy(bdev)) { 8189 /* There isn't anything we can do to recover here. Just let the 8190 * old QoS poller keep running. The QoS handling won't change 8191 * cores when the user allocates a new channel, but it won't break. */ 8192 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8193 } 8194 } 8195 8196 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8197 rc = bdev_unregister_unsafe(bdev); 8198 spdk_spin_unlock(&bdev->internal.spinlock); 8199 8200 if (rc == 0) { 8201 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8202 } 8203 } else { 8204 spdk_spin_unlock(&bdev->internal.spinlock); 8205 } 8206 } 8207 8208 void 8209 spdk_bdev_close(struct spdk_bdev_desc *desc) 8210 { 8211 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8212 8213 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8214 spdk_get_thread()); 8215 8216 assert(desc->thread == spdk_get_thread()); 8217 8218 spdk_poller_unregister(&desc->io_timeout_poller); 8219 8220 spdk_spin_lock(&g_bdev_mgr.spinlock); 8221 8222 bdev_close(bdev, desc); 8223 8224 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8225 } 8226 8227 static void 8228 bdev_register_finished(void *arg) 8229 { 8230 struct spdk_bdev_desc *desc = arg; 8231 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8232 8233 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8234 8235 spdk_spin_lock(&g_bdev_mgr.spinlock); 8236 8237 bdev_close(bdev, desc); 8238 8239 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8240 } 8241 8242 int 8243 spdk_bdev_register(struct spdk_bdev *bdev) 8244 { 8245 struct spdk_bdev_desc *desc; 8246 struct spdk_thread *thread = spdk_get_thread(); 8247 int rc; 8248 8249 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8250 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 8251 thread ? 
spdk_thread_get_name(thread) : "null"); 8252 return -EINVAL; 8253 } 8254 8255 rc = bdev_register(bdev); 8256 if (rc != 0) { 8257 return rc; 8258 } 8259 8260 /* A descriptor is opened to prevent bdev deletion during examination */ 8261 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8262 if (rc != 0) { 8263 spdk_bdev_unregister(bdev, NULL, NULL); 8264 return rc; 8265 } 8266 8267 rc = bdev_open(bdev, false, desc); 8268 if (rc != 0) { 8269 bdev_desc_free(desc); 8270 spdk_bdev_unregister(bdev, NULL, NULL); 8271 return rc; 8272 } 8273 8274 /* Examine configuration before initializing I/O */ 8275 bdev_examine(bdev); 8276 8277 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8278 if (rc != 0) { 8279 bdev_close(bdev, desc); 8280 spdk_bdev_unregister(bdev, NULL, NULL); 8281 } 8282 8283 return rc; 8284 } 8285 8286 int 8287 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8288 struct spdk_bdev_module *module) 8289 { 8290 spdk_spin_lock(&bdev->internal.spinlock); 8291 8292 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8293 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8294 spdk_spin_unlock(&bdev->internal.spinlock); 8295 return -EPERM; 8296 } 8297 8298 if (desc && !desc->write) { 8299 desc->write = true; 8300 } 8301 8302 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8303 bdev->internal.claim.v1.module = module; 8304 8305 spdk_spin_unlock(&bdev->internal.spinlock); 8306 return 0; 8307 } 8308 8309 void 8310 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8311 { 8312 spdk_spin_lock(&bdev->internal.spinlock); 8313 8314 assert(bdev->internal.claim.v1.module != NULL); 8315 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8316 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8317 bdev->internal.claim.v1.module = NULL; 8318 8319 spdk_spin_unlock(&bdev->internal.spinlock); 8320 } 8321 8322 /* 8323 * Start claims v2 8324 */ 8325 8326 const char * 8327 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8328 { 8329 switch (type) { 8330 case SPDK_BDEV_CLAIM_NONE: 8331 return "not_claimed"; 8332 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8333 return "exclusive_write"; 8334 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8335 return "read_many_write_one"; 8336 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8337 return "read_many_write_none"; 8338 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8339 return "read_many_write_many"; 8340 default: 8341 break; 8342 } 8343 return "invalid_claim"; 8344 } 8345 8346 static bool 8347 claim_type_is_v2(enum spdk_bdev_claim_type type) 8348 { 8349 switch (type) { 8350 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8351 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8352 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8353 return true; 8354 default: 8355 break; 8356 } 8357 return false; 8358 } 8359 8360 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
*/ 8361 static bool 8362 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8363 { 8364 switch (type) { 8365 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8366 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8367 return true; 8368 default: 8369 break; 8370 } 8371 return false; 8372 } 8373 8374 void 8375 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8376 { 8377 if (opts == NULL) { 8378 SPDK_ERRLOG("opts should not be NULL\n"); 8379 assert(opts != NULL); 8380 return; 8381 } 8382 if (size == 0) { 8383 SPDK_ERRLOG("size should not be zero\n"); 8384 assert(size != 0); 8385 return; 8386 } 8387 8388 memset(opts, 0, size); 8389 opts->opts_size = size; 8390 8391 #define FIELD_OK(field) \ 8392 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8393 8394 #define SET_FIELD(field, value) \ 8395 if (FIELD_OK(field)) { \ 8396 opts->field = value; \ 8397 } \ 8398 8399 SET_FIELD(shared_claim_key, 0); 8400 8401 #undef FIELD_OK 8402 #undef SET_FIELD 8403 } 8404 8405 static int 8406 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8407 { 8408 if (src->opts_size == 0) { 8409 SPDK_ERRLOG("size should not be zero\n"); 8410 return -1; 8411 } 8412 8413 memset(dst, 0, sizeof(*dst)); 8414 dst->opts_size = src->opts_size; 8415 8416 #define FIELD_OK(field) \ 8417 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8418 8419 #define SET_FIELD(field) \ 8420 if (FIELD_OK(field)) { \ 8421 dst->field = src->field; \ 8422 } \ 8423 8424 if (FIELD_OK(name)) { 8425 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8426 } 8427 8428 SET_FIELD(shared_claim_key); 8429 8430 /* You should not remove this statement, but need to update the assert statement 8431 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8432 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8433 8434 #undef FIELD_OK 8435 #undef SET_FIELD 8436 return 0; 8437 } 8438 8439 /* Returns 0 if a read-write-once claim can be taken. */ 8440 static int 8441 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8442 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8443 { 8444 struct spdk_bdev *bdev = desc->bdev; 8445 struct spdk_bdev_desc *open_desc; 8446 8447 assert(spdk_spin_held(&bdev->internal.spinlock)); 8448 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8449 8450 if (opts->shared_claim_key != 0) { 8451 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8452 bdev->name); 8453 return -EINVAL; 8454 } 8455 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8456 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8457 return -EPERM; 8458 } 8459 if (desc->claim != NULL) { 8460 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8461 bdev->name, desc->claim->module->name); 8462 return -EPERM; 8463 } 8464 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8465 if (desc != open_desc && open_desc->write) { 8466 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8467 "another descriptor is open for writing\n", 8468 bdev->name); 8469 return -EPERM; 8470 } 8471 } 8472 8473 return 0; 8474 } 8475 8476 /* Returns 0 if a read-only-many claim can be taken. 
 */
static int
claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
	assert(desc->claim == NULL);

	if (desc->write) {
		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
			    bdev->name);
		return -EINVAL;
	}
	if (opts->shared_claim_key != 0) {
		SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
		return -EINVAL;
	}
	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
					       "another descriptor is open for writing\n",
					       bdev->name);
				return -EPERM;
			}
		}
	}

	return 0;
}

/* Returns 0 if a read-write-many claim can be taken. */
static int
claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
	assert(desc->claim == NULL);

	if (opts->shared_claim_key == 0) {
		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
			    bdev->name);
		return -EINVAL;
	}
	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc == desc) {
				continue;
			}
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
					       "another descriptor is open for writing without a "
					       "claim\n", bdev->name);
				return -EPERM;
			}
		}
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
			return -EPERM;
		}
		break;
	default:
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		return -EBUSY;
	}

	return 0;
}

/* Updates desc and its bdev with a v2 claim.
*/ 8557 static int 8558 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8559 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8560 { 8561 struct spdk_bdev *bdev = desc->bdev; 8562 struct spdk_bdev_module_claim *claim; 8563 8564 assert(spdk_spin_held(&bdev->internal.spinlock)); 8565 assert(claim_type_is_v2(type)); 8566 assert(desc->claim == NULL); 8567 8568 claim = calloc(1, sizeof(*desc->claim)); 8569 if (claim == NULL) { 8570 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8571 return -ENOMEM; 8572 } 8573 claim->module = module; 8574 claim->desc = desc; 8575 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8576 memcpy(claim->name, opts->name, sizeof(claim->name)); 8577 desc->claim = claim; 8578 8579 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8580 bdev->internal.claim_type = type; 8581 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8582 bdev->internal.claim.v2.key = opts->shared_claim_key; 8583 } 8584 assert(type == bdev->internal.claim_type); 8585 8586 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8587 8588 if (!desc->write && claim_type_promotes_to_write(type)) { 8589 desc->write = true; 8590 } 8591 8592 return 0; 8593 } 8594 8595 int 8596 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8597 struct spdk_bdev_claim_opts *_opts, 8598 struct spdk_bdev_module *module) 8599 { 8600 struct spdk_bdev *bdev; 8601 struct spdk_bdev_claim_opts opts; 8602 int rc = 0; 8603 8604 if (desc == NULL) { 8605 SPDK_ERRLOG("descriptor must not be NULL\n"); 8606 return -EINVAL; 8607 } 8608 8609 bdev = desc->bdev; 8610 8611 if (_opts == NULL) { 8612 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8613 } else if (claim_opts_copy(_opts, &opts) != 0) { 8614 return -EINVAL; 8615 } 8616 8617 spdk_spin_lock(&bdev->internal.spinlock); 8618 8619 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8620 bdev->internal.claim_type != type) { 8621 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8622 spdk_spin_unlock(&bdev->internal.spinlock); 8623 return -EPERM; 8624 } 8625 8626 if (claim_type_is_v2(type) && desc->claim != NULL) { 8627 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8628 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8629 spdk_spin_unlock(&bdev->internal.spinlock); 8630 return -EPERM; 8631 } 8632 8633 switch (type) { 8634 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8635 spdk_spin_unlock(&bdev->internal.spinlock); 8636 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8637 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8638 rc = claim_verify_rwo(desc, type, &opts, module); 8639 break; 8640 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8641 rc = claim_verify_rom(desc, type, &opts, module); 8642 break; 8643 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8644 rc = claim_verify_rwm(desc, type, &opts, module); 8645 break; 8646 default: 8647 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8648 rc = -ENOTSUP; 8649 } 8650 8651 if (rc == 0) { 8652 rc = claim_bdev(desc, type, &opts, module); 8653 } 8654 8655 spdk_spin_unlock(&bdev->internal.spinlock); 8656 return rc; 8657 } 8658 8659 static void 8660 claim_reset(struct spdk_bdev *bdev) 8661 { 8662 assert(spdk_spin_held(&bdev->internal.spinlock)); 8663 assert(claim_type_is_v2(bdev->internal.claim_type)); 8664 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8665 8666 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8667 
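	/*
	 * The v2 claims list is empty at this point (asserted above), so zeroing the
	 * claim union and dropping claim_type back to SPDK_BDEV_CLAIM_NONE below leaves
	 * the bdev fully unclaimed and available for a new claim of any type.
	 */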
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8668 } 8669 8670 static void 8671 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8672 { 8673 struct spdk_bdev *bdev = desc->bdev; 8674 8675 assert(spdk_spin_held(&bdev->internal.spinlock)); 8676 assert(claim_type_is_v2(bdev->internal.claim_type)); 8677 8678 if (bdev->internal.examine_in_progress == 0) { 8679 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8680 free(desc->claim); 8681 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8682 claim_reset(bdev); 8683 } 8684 } else { 8685 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8686 desc->claim->module = NULL; 8687 desc->claim->desc = NULL; 8688 } 8689 desc->claim = NULL; 8690 } 8691 8692 /* 8693 * End claims v2 8694 */ 8695 8696 struct spdk_bdev * 8697 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8698 { 8699 assert(desc != NULL); 8700 return desc->bdev; 8701 } 8702 8703 int 8704 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8705 { 8706 struct spdk_bdev *bdev, *tmp; 8707 struct spdk_bdev_desc *desc; 8708 int rc = 0; 8709 8710 assert(fn != NULL); 8711 8712 spdk_spin_lock(&g_bdev_mgr.spinlock); 8713 bdev = spdk_bdev_first(); 8714 while (bdev != NULL) { 8715 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8716 if (rc != 0) { 8717 break; 8718 } 8719 rc = bdev_open(bdev, false, desc); 8720 if (rc != 0) { 8721 bdev_desc_free(desc); 8722 if (rc == -ENODEV) { 8723 /* Ignore the error and move to the next bdev. */ 8724 rc = 0; 8725 bdev = spdk_bdev_next(bdev); 8726 continue; 8727 } 8728 break; 8729 } 8730 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8731 8732 rc = fn(ctx, bdev); 8733 8734 spdk_spin_lock(&g_bdev_mgr.spinlock); 8735 tmp = spdk_bdev_next(bdev); 8736 bdev_close(bdev, desc); 8737 if (rc != 0) { 8738 break; 8739 } 8740 bdev = tmp; 8741 } 8742 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8743 8744 return rc; 8745 } 8746 8747 int 8748 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8749 { 8750 struct spdk_bdev *bdev, *tmp; 8751 struct spdk_bdev_desc *desc; 8752 int rc = 0; 8753 8754 assert(fn != NULL); 8755 8756 spdk_spin_lock(&g_bdev_mgr.spinlock); 8757 bdev = spdk_bdev_first_leaf(); 8758 while (bdev != NULL) { 8759 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8760 if (rc != 0) { 8761 break; 8762 } 8763 rc = bdev_open(bdev, false, desc); 8764 if (rc != 0) { 8765 bdev_desc_free(desc); 8766 if (rc == -ENODEV) { 8767 /* Ignore the error and move to the next bdev. 
*/ 8768 rc = 0; 8769 bdev = spdk_bdev_next_leaf(bdev); 8770 continue; 8771 } 8772 break; 8773 } 8774 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8775 8776 rc = fn(ctx, bdev); 8777 8778 spdk_spin_lock(&g_bdev_mgr.spinlock); 8779 tmp = spdk_bdev_next_leaf(bdev); 8780 bdev_close(bdev, desc); 8781 if (rc != 0) { 8782 break; 8783 } 8784 bdev = tmp; 8785 } 8786 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8787 8788 return rc; 8789 } 8790 8791 void 8792 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8793 { 8794 struct iovec *iovs; 8795 int iovcnt; 8796 8797 if (bdev_io == NULL) { 8798 return; 8799 } 8800 8801 switch (bdev_io->type) { 8802 case SPDK_BDEV_IO_TYPE_READ: 8803 case SPDK_BDEV_IO_TYPE_WRITE: 8804 case SPDK_BDEV_IO_TYPE_ZCOPY: 8805 iovs = bdev_io->u.bdev.iovs; 8806 iovcnt = bdev_io->u.bdev.iovcnt; 8807 break; 8808 default: 8809 iovs = NULL; 8810 iovcnt = 0; 8811 break; 8812 } 8813 8814 if (iovp) { 8815 *iovp = iovs; 8816 } 8817 if (iovcntp) { 8818 *iovcntp = iovcnt; 8819 } 8820 } 8821 8822 void * 8823 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8824 { 8825 if (bdev_io == NULL) { 8826 return NULL; 8827 } 8828 8829 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8830 return NULL; 8831 } 8832 8833 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8834 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8835 return bdev_io->u.bdev.md_buf; 8836 } 8837 8838 return NULL; 8839 } 8840 8841 void * 8842 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 8843 { 8844 if (bdev_io == NULL) { 8845 assert(false); 8846 return NULL; 8847 } 8848 8849 return bdev_io->internal.caller_ctx; 8850 } 8851 8852 void 8853 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 8854 { 8855 8856 if (spdk_bdev_module_list_find(bdev_module->name)) { 8857 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 8858 assert(false); 8859 } 8860 8861 spdk_spin_init(&bdev_module->internal.spinlock); 8862 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 8863 8864 /* 8865 * Modules with examine callbacks must be initialized first, so they are 8866 * ready to handle examine callbacks from later modules that will 8867 * register physical bdevs. 
8868 */ 8869 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8870 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8871 } else { 8872 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8873 } 8874 } 8875 8876 struct spdk_bdev_module * 8877 spdk_bdev_module_list_find(const char *name) 8878 { 8879 struct spdk_bdev_module *bdev_module; 8880 8881 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8882 if (strcmp(name, bdev_module->name) == 0) { 8883 break; 8884 } 8885 } 8886 8887 return bdev_module; 8888 } 8889 8890 static int 8891 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 8892 { 8893 uint64_t num_blocks; 8894 void *md_buf = NULL; 8895 8896 num_blocks = bdev_io->u.bdev.num_blocks; 8897 8898 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8899 md_buf = (char *)g_bdev_mgr.zero_buffer + 8900 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8901 } 8902 8903 return bdev_write_blocks_with_md(bdev_io->internal.desc, 8904 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8905 g_bdev_mgr.zero_buffer, md_buf, 8906 bdev_io->u.bdev.offset_blocks, num_blocks, 8907 bdev_write_zero_buffer_done, bdev_io); 8908 } 8909 8910 static void 8911 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8912 { 8913 struct spdk_bdev_io *parent_io = cb_arg; 8914 8915 spdk_bdev_free_io(bdev_io); 8916 8917 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 8918 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 8919 } 8920 8921 static void 8922 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8923 { 8924 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8925 ctx->bdev->internal.qos_mod_in_progress = false; 8926 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8927 8928 if (ctx->cb_fn) { 8929 ctx->cb_fn(ctx->cb_arg, status); 8930 } 8931 free(ctx); 8932 } 8933 8934 static void 8935 bdev_disable_qos_done(void *cb_arg) 8936 { 8937 struct set_qos_limit_ctx *ctx = cb_arg; 8938 struct spdk_bdev *bdev = ctx->bdev; 8939 struct spdk_bdev_io *bdev_io; 8940 struct spdk_bdev_qos *qos; 8941 8942 spdk_spin_lock(&bdev->internal.spinlock); 8943 qos = bdev->internal.qos; 8944 bdev->internal.qos = NULL; 8945 spdk_spin_unlock(&bdev->internal.spinlock); 8946 8947 while (!TAILQ_EMPTY(&qos->queued)) { 8948 /* Send queued I/O back to their original thread for resubmission. */ 8949 bdev_io = TAILQ_FIRST(&qos->queued); 8950 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8951 8952 if (bdev_io->internal.io_submit_ch) { 8953 /* 8954 * Channel was changed when sending it to the QoS thread - change it back 8955 * before sending it back to the original thread. 
8956 */ 8957 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8958 bdev_io->internal.io_submit_ch = NULL; 8959 } 8960 8961 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8962 _bdev_io_submit, bdev_io); 8963 } 8964 8965 if (qos->thread != NULL) { 8966 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8967 spdk_poller_unregister(&qos->poller); 8968 } 8969 8970 free(qos); 8971 8972 bdev_set_qos_limit_done(ctx, 0); 8973 } 8974 8975 static void 8976 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8977 { 8978 struct set_qos_limit_ctx *ctx = _ctx; 8979 struct spdk_thread *thread; 8980 8981 spdk_spin_lock(&bdev->internal.spinlock); 8982 thread = bdev->internal.qos->thread; 8983 spdk_spin_unlock(&bdev->internal.spinlock); 8984 8985 if (thread != NULL) { 8986 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8987 } else { 8988 bdev_disable_qos_done(ctx); 8989 } 8990 } 8991 8992 static void 8993 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8994 struct spdk_io_channel *ch, void *_ctx) 8995 { 8996 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8997 8998 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8999 9000 spdk_bdev_for_each_channel_continue(i, 0); 9001 } 9002 9003 static void 9004 bdev_update_qos_rate_limit_msg(void *cb_arg) 9005 { 9006 struct set_qos_limit_ctx *ctx = cb_arg; 9007 struct spdk_bdev *bdev = ctx->bdev; 9008 9009 spdk_spin_lock(&bdev->internal.spinlock); 9010 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9011 spdk_spin_unlock(&bdev->internal.spinlock); 9012 9013 bdev_set_qos_limit_done(ctx, 0); 9014 } 9015 9016 static void 9017 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9018 struct spdk_io_channel *ch, void *_ctx) 9019 { 9020 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9021 9022 spdk_spin_lock(&bdev->internal.spinlock); 9023 bdev_enable_qos(bdev, bdev_ch); 9024 spdk_spin_unlock(&bdev->internal.spinlock); 9025 spdk_bdev_for_each_channel_continue(i, 0); 9026 } 9027 9028 static void 9029 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9030 { 9031 struct set_qos_limit_ctx *ctx = _ctx; 9032 9033 bdev_set_qos_limit_done(ctx, status); 9034 } 9035 9036 static void 9037 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9038 { 9039 int i; 9040 9041 assert(bdev->internal.qos != NULL); 9042 9043 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9044 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9045 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9046 9047 if (limits[i] == 0) { 9048 bdev->internal.qos->rate_limits[i].limit = 9049 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9050 } 9051 } 9052 } 9053 } 9054 9055 void 9056 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9057 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9058 { 9059 struct set_qos_limit_ctx *ctx; 9060 uint32_t limit_set_complement; 9061 uint64_t min_limit_per_sec; 9062 int i; 9063 bool disable_rate_limit = true; 9064 9065 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9066 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9067 continue; 9068 } 9069 9070 if (limits[i] > 0) { 9071 disable_rate_limit = false; 9072 } 9073 9074 if (bdev_qos_is_iops_rate_limit(i) == true) { 9075 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9076 } else { 9077 /* Change from megabyte to byte rate limit */ 9078 limits[i] = limits[i] * 1024 * 1024; 9079 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 
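		/*
		 * Bandwidth limits arrive in MB/s and are converted to bytes/s here;
		 * immediately after this branch the value is rounded up, if needed, to a
		 * multiple of min_limit_per_sec. For example, a requested 10 MB/s becomes
		 * 10 * 1024 * 1024 = 10485760 bytes/s, already a multiple of the minimum.
		 */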
9080 } 9081 9082 limit_set_complement = limits[i] % min_limit_per_sec; 9083 if (limit_set_complement) { 9084 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9085 limits[i], min_limit_per_sec); 9086 limits[i] += min_limit_per_sec - limit_set_complement; 9087 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9088 } 9089 } 9090 9091 ctx = calloc(1, sizeof(*ctx)); 9092 if (ctx == NULL) { 9093 cb_fn(cb_arg, -ENOMEM); 9094 return; 9095 } 9096 9097 ctx->cb_fn = cb_fn; 9098 ctx->cb_arg = cb_arg; 9099 ctx->bdev = bdev; 9100 9101 spdk_spin_lock(&bdev->internal.spinlock); 9102 if (bdev->internal.qos_mod_in_progress) { 9103 spdk_spin_unlock(&bdev->internal.spinlock); 9104 free(ctx); 9105 cb_fn(cb_arg, -EAGAIN); 9106 return; 9107 } 9108 bdev->internal.qos_mod_in_progress = true; 9109 9110 if (disable_rate_limit == true && bdev->internal.qos) { 9111 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9112 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9113 (bdev->internal.qos->rate_limits[i].limit > 0 && 9114 bdev->internal.qos->rate_limits[i].limit != 9115 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9116 disable_rate_limit = false; 9117 break; 9118 } 9119 } 9120 } 9121 9122 if (disable_rate_limit == false) { 9123 if (bdev->internal.qos == NULL) { 9124 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9125 if (!bdev->internal.qos) { 9126 spdk_spin_unlock(&bdev->internal.spinlock); 9127 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9128 bdev_set_qos_limit_done(ctx, -ENOMEM); 9129 return; 9130 } 9131 } 9132 9133 if (bdev->internal.qos->thread == NULL) { 9134 /* Enabling */ 9135 bdev_set_qos_rate_limits(bdev, limits); 9136 9137 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9138 bdev_enable_qos_done); 9139 } else { 9140 /* Updating */ 9141 bdev_set_qos_rate_limits(bdev, limits); 9142 9143 spdk_thread_send_msg(bdev->internal.qos->thread, 9144 bdev_update_qos_rate_limit_msg, ctx); 9145 } 9146 } else { 9147 if (bdev->internal.qos != NULL) { 9148 bdev_set_qos_rate_limits(bdev, limits); 9149 9150 /* Disabling */ 9151 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9152 bdev_disable_qos_msg_done); 9153 } else { 9154 spdk_spin_unlock(&bdev->internal.spinlock); 9155 bdev_set_qos_limit_done(ctx, 0); 9156 return; 9157 } 9158 } 9159 9160 spdk_spin_unlock(&bdev->internal.spinlock); 9161 } 9162 9163 struct spdk_bdev_histogram_ctx { 9164 spdk_bdev_histogram_status_cb cb_fn; 9165 void *cb_arg; 9166 struct spdk_bdev *bdev; 9167 int status; 9168 }; 9169 9170 static void 9171 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9172 { 9173 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9174 9175 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9176 ctx->bdev->internal.histogram_in_progress = false; 9177 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9178 ctx->cb_fn(ctx->cb_arg, ctx->status); 9179 free(ctx); 9180 } 9181 9182 static void 9183 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9184 struct spdk_io_channel *_ch, void *_ctx) 9185 { 9186 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9187 9188 if (ch->histogram != NULL) { 9189 spdk_histogram_data_free(ch->histogram); 9190 ch->histogram = NULL; 9191 } 9192 spdk_bdev_for_each_channel_continue(i, 0); 9193 } 9194 9195 static void 9196 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9197 { 9198 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9199 9200 if (status 
!= 0) { 9201 ctx->status = status; 9202 ctx->bdev->internal.histogram_enabled = false; 9203 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9204 bdev_histogram_disable_channel_cb); 9205 } else { 9206 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9207 ctx->bdev->internal.histogram_in_progress = false; 9208 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9209 ctx->cb_fn(ctx->cb_arg, ctx->status); 9210 free(ctx); 9211 } 9212 } 9213 9214 static void 9215 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9216 struct spdk_io_channel *_ch, void *_ctx) 9217 { 9218 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9219 int status = 0; 9220 9221 if (ch->histogram == NULL) { 9222 ch->histogram = spdk_histogram_data_alloc(); 9223 if (ch->histogram == NULL) { 9224 status = -ENOMEM; 9225 } 9226 } 9227 9228 spdk_bdev_for_each_channel_continue(i, status); 9229 } 9230 9231 void 9232 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9233 void *cb_arg, bool enable) 9234 { 9235 struct spdk_bdev_histogram_ctx *ctx; 9236 9237 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9238 if (ctx == NULL) { 9239 cb_fn(cb_arg, -ENOMEM); 9240 return; 9241 } 9242 9243 ctx->bdev = bdev; 9244 ctx->status = 0; 9245 ctx->cb_fn = cb_fn; 9246 ctx->cb_arg = cb_arg; 9247 9248 spdk_spin_lock(&bdev->internal.spinlock); 9249 if (bdev->internal.histogram_in_progress) { 9250 spdk_spin_unlock(&bdev->internal.spinlock); 9251 free(ctx); 9252 cb_fn(cb_arg, -EAGAIN); 9253 return; 9254 } 9255 9256 bdev->internal.histogram_in_progress = true; 9257 spdk_spin_unlock(&bdev->internal.spinlock); 9258 9259 bdev->internal.histogram_enabled = enable; 9260 9261 if (enable) { 9262 /* Allocate histogram for each channel */ 9263 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9264 bdev_histogram_enable_channel_cb); 9265 } else { 9266 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9267 bdev_histogram_disable_channel_cb); 9268 } 9269 } 9270 9271 struct spdk_bdev_histogram_data_ctx { 9272 spdk_bdev_histogram_data_cb cb_fn; 9273 void *cb_arg; 9274 struct spdk_bdev *bdev; 9275 /** merged histogram data from all channels */ 9276 struct spdk_histogram_data *histogram; 9277 }; 9278 9279 static void 9280 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9281 { 9282 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9283 9284 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9285 free(ctx); 9286 } 9287 9288 static void 9289 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9290 struct spdk_io_channel *_ch, void *_ctx) 9291 { 9292 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9293 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9294 int status = 0; 9295 9296 if (ch->histogram == NULL) { 9297 status = -EFAULT; 9298 } else { 9299 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9300 } 9301 9302 spdk_bdev_for_each_channel_continue(i, status); 9303 } 9304 9305 void 9306 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9307 spdk_bdev_histogram_data_cb cb_fn, 9308 void *cb_arg) 9309 { 9310 struct spdk_bdev_histogram_data_ctx *ctx; 9311 9312 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9313 if (ctx == NULL) { 9314 cb_fn(cb_arg, -ENOMEM, NULL); 9315 return; 9316 } 9317 9318 ctx->bdev = bdev; 9319 ctx->cb_fn = cb_fn; 9320 ctx->cb_arg = cb_arg; 9321 9322 ctx->histogram = histogram; 
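	/*
	 * The caller-supplied histogram is the accumulator: bdev_histogram_get_channel()
	 * folds each channel's private histogram into it with spdk_histogram_data_merge(),
	 * and the merged result is returned to cb_fn by bdev_histogram_get_channel_cb().
	 *
	 * Minimal caller-side sketch (illustrative; assumes histograms were previously
	 * enabled with spdk_bdev_histogram_enable() and my_histogram_done_cb is a
	 * hypothetical spdk_bdev_histogram_data_cb):
	 *
	 *	struct spdk_histogram_data *h = spdk_histogram_data_alloc();
	 *
	 *	spdk_bdev_histogram_get(bdev, h, my_histogram_done_cb, NULL);
	 */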
9323 9324 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9325 bdev_histogram_get_channel_cb); 9326 } 9327 9328 void 9329 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9330 void *cb_arg) 9331 { 9332 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9333 int status = 0; 9334 9335 assert(cb_fn != NULL); 9336 9337 if (bdev_ch->histogram == NULL) { 9338 status = -EFAULT; 9339 } 9340 cb_fn(cb_arg, status, bdev_ch->histogram); 9341 } 9342 9343 size_t 9344 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9345 size_t max_events) 9346 { 9347 struct media_event_entry *entry; 9348 size_t num_events = 0; 9349 9350 for (; num_events < max_events; ++num_events) { 9351 entry = TAILQ_FIRST(&desc->pending_media_events); 9352 if (entry == NULL) { 9353 break; 9354 } 9355 9356 events[num_events] = entry->event; 9357 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9358 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9359 } 9360 9361 return num_events; 9362 } 9363 9364 int 9365 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9366 size_t num_events) 9367 { 9368 struct spdk_bdev_desc *desc; 9369 struct media_event_entry *entry; 9370 size_t event_id; 9371 int rc = 0; 9372 9373 assert(bdev->media_events); 9374 9375 spdk_spin_lock(&bdev->internal.spinlock); 9376 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9377 if (desc->write) { 9378 break; 9379 } 9380 } 9381 9382 if (desc == NULL || desc->media_events_buffer == NULL) { 9383 rc = -ENODEV; 9384 goto out; 9385 } 9386 9387 for (event_id = 0; event_id < num_events; ++event_id) { 9388 entry = TAILQ_FIRST(&desc->free_media_events); 9389 if (entry == NULL) { 9390 break; 9391 } 9392 9393 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9394 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9395 entry->event = events[event_id]; 9396 } 9397 9398 rc = event_id; 9399 out: 9400 spdk_spin_unlock(&bdev->internal.spinlock); 9401 return rc; 9402 } 9403 9404 static void 9405 _media_management_notify(void *arg) 9406 { 9407 struct spdk_bdev_desc *desc = arg; 9408 9409 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9410 } 9411 9412 void 9413 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9414 { 9415 struct spdk_bdev_desc *desc; 9416 9417 spdk_spin_lock(&bdev->internal.spinlock); 9418 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9419 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9420 event_notify(desc, _media_management_notify); 9421 } 9422 } 9423 spdk_spin_unlock(&bdev->internal.spinlock); 9424 } 9425 9426 struct locked_lba_range_ctx { 9427 struct lba_range range; 9428 struct lba_range *current_range; 9429 struct lba_range *owner_range; 9430 struct spdk_poller *poller; 9431 lock_range_cb cb_fn; 9432 void *cb_arg; 9433 }; 9434 9435 static void 9436 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9437 { 9438 struct locked_lba_range_ctx *ctx = _ctx; 9439 9440 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9441 free(ctx); 9442 } 9443 9444 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9445 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9446 9447 static void 9448 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9449 { 9450 struct locked_lba_range_ctx *ctx = _ctx; 9451 9452 if (status == -ENOMEM) { 9453 /* One of the channels could not allocate a 
range object. 9454 * So we have to go back and clean up any ranges that were 9455 * allocated successfully before we return error status to 9456 * the caller. We can reuse the unlock function to do that 9457 * clean up. 9458 */ 9459 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9460 bdev_lock_error_cleanup_cb); 9461 return; 9462 } 9463 9464 /* All channels have locked this range and no I/O overlapping the range 9465 * are outstanding! Set the owner_ch for the range object for the 9466 * locking channel, so that this channel will know that it is allowed 9467 * to write to this range. 9468 */ 9469 if (ctx->owner_range != NULL) { 9470 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9471 } 9472 9473 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9474 9475 /* Don't free the ctx here. Its range is in the bdev's global list of 9476 * locked ranges still, and will be removed and freed when this range 9477 * is later unlocked. 9478 */ 9479 } 9480 9481 static int 9482 bdev_lock_lba_range_check_io(void *_i) 9483 { 9484 struct spdk_bdev_channel_iter *i = _i; 9485 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9486 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9487 struct locked_lba_range_ctx *ctx = i->ctx; 9488 struct lba_range *range = ctx->current_range; 9489 struct spdk_bdev_io *bdev_io; 9490 9491 spdk_poller_unregister(&ctx->poller); 9492 9493 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9494 * range. But we need to wait until any outstanding IO overlapping with this range 9495 * are completed. 9496 */ 9497 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9498 if (bdev_io_range_is_locked(bdev_io, range)) { 9499 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9500 return SPDK_POLLER_BUSY; 9501 } 9502 } 9503 9504 spdk_bdev_for_each_channel_continue(i, 0); 9505 return SPDK_POLLER_BUSY; 9506 } 9507 9508 static void 9509 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9510 struct spdk_io_channel *_ch, void *_ctx) 9511 { 9512 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9513 struct locked_lba_range_ctx *ctx = _ctx; 9514 struct lba_range *range; 9515 9516 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9517 if (range->length == ctx->range.length && 9518 range->offset == ctx->range.offset && 9519 range->locked_ctx == ctx->range.locked_ctx) { 9520 /* This range already exists on this channel, so don't add 9521 * it again. This can happen when a new channel is created 9522 * while the for_each_channel operation is in progress. 9523 * Do not check for outstanding I/O in that case, since the 9524 * range was locked before any I/O could be submitted to the 9525 * new channel. 9526 */ 9527 spdk_bdev_for_each_channel_continue(i, 0); 9528 return; 9529 } 9530 } 9531 9532 range = calloc(1, sizeof(*range)); 9533 if (range == NULL) { 9534 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9535 return; 9536 } 9537 9538 range->length = ctx->range.length; 9539 range->offset = ctx->range.offset; 9540 range->locked_ctx = ctx->range.locked_ctx; 9541 ctx->current_range = range; 9542 if (ctx->range.owner_ch == ch) { 9543 /* This is the range object for the channel that will hold 9544 * the lock. Store it in the ctx object so that we can easily 9545 * set its owner_ch after the lock is finally acquired. 
9546 */ 9547 ctx->owner_range = range; 9548 } 9549 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9550 bdev_lock_lba_range_check_io(i); 9551 } 9552 9553 static void 9554 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9555 { 9556 assert(spdk_get_thread() == ctx->range.owner_thread); 9557 assert(ctx->range.owner_ch == NULL || 9558 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 9559 9560 /* We will add a copy of this range to each channel now. */ 9561 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9562 bdev_lock_lba_range_cb); 9563 } 9564 9565 static bool 9566 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9567 { 9568 struct lba_range *r; 9569 9570 TAILQ_FOREACH(r, tailq, tailq) { 9571 if (bdev_lba_range_overlapped(range, r)) { 9572 return true; 9573 } 9574 } 9575 return false; 9576 } 9577 9578 static int 9579 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 9580 uint64_t offset, uint64_t length, 9581 lock_range_cb cb_fn, void *cb_arg) 9582 { 9583 struct locked_lba_range_ctx *ctx; 9584 9585 ctx = calloc(1, sizeof(*ctx)); 9586 if (ctx == NULL) { 9587 return -ENOMEM; 9588 } 9589 9590 ctx->range.offset = offset; 9591 ctx->range.length = length; 9592 ctx->range.owner_thread = spdk_get_thread(); 9593 ctx->range.owner_ch = ch; 9594 ctx->range.locked_ctx = cb_arg; 9595 ctx->range.bdev = bdev; 9596 ctx->cb_fn = cb_fn; 9597 ctx->cb_arg = cb_arg; 9598 9599 spdk_spin_lock(&bdev->internal.spinlock); 9600 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 9601 /* There is an active lock overlapping with this range. 9602 * Put it on the pending list until this range no 9603 * longer overlaps with another. 9604 */ 9605 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 9606 } else { 9607 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 9608 bdev_lock_lba_range_ctx(bdev, ctx); 9609 } 9610 spdk_spin_unlock(&bdev->internal.spinlock); 9611 return 0; 9612 } 9613 9614 static int 9615 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9616 uint64_t offset, uint64_t length, 9617 lock_range_cb cb_fn, void *cb_arg) 9618 { 9619 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9620 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9621 9622 if (cb_arg == NULL) { 9623 SPDK_ERRLOG("cb_arg must not be NULL\n"); 9624 return -EINVAL; 9625 } 9626 9627 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 9628 } 9629 9630 static void 9631 bdev_lock_lba_range_ctx_msg(void *_ctx) 9632 { 9633 struct locked_lba_range_ctx *ctx = _ctx; 9634 9635 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 9636 } 9637 9638 static void 9639 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9640 { 9641 struct locked_lba_range_ctx *ctx = _ctx; 9642 struct locked_lba_range_ctx *pending_ctx; 9643 struct lba_range *range, *tmp; 9644 9645 spdk_spin_lock(&bdev->internal.spinlock); 9646 /* Check if there are any pending locked ranges that overlap with this range 9647 * that was just unlocked. If there are, check that it doesn't overlap with any 9648 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 9649 * the lock process. 
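* A pending range that becomes unblocked is moved onto locked_ranges here and
* its lock attempt is restarted on the thread that originally requested it,
* via spdk_thread_send_msg().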
9650 */ 9651 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 9652 if (bdev_lba_range_overlapped(range, &ctx->range) && 9653 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 9654 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 9655 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9656 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 9657 spdk_thread_send_msg(pending_ctx->range.owner_thread, 9658 bdev_lock_lba_range_ctx_msg, pending_ctx); 9659 } 9660 } 9661 spdk_spin_unlock(&bdev->internal.spinlock); 9662 9663 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9664 free(ctx); 9665 } 9666 9667 static void 9668 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9669 struct spdk_io_channel *_ch, void *_ctx) 9670 { 9671 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9672 struct locked_lba_range_ctx *ctx = _ctx; 9673 TAILQ_HEAD(, spdk_bdev_io) io_locked; 9674 struct spdk_bdev_io *bdev_io; 9675 struct lba_range *range; 9676 9677 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9678 if (ctx->range.offset == range->offset && 9679 ctx->range.length == range->length && 9680 ctx->range.locked_ctx == range->locked_ctx) { 9681 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 9682 free(range); 9683 break; 9684 } 9685 } 9686 9687 /* Note: we should almost always be able to assert that the range specified 9688 * was found. But there are some very rare corner cases where a new channel 9689 * gets created simultaneously with a range unlock, where this function 9690 * would execute on that new channel and wouldn't have the range. 9691 * We also use this to clean up range allocations when a later allocation 9692 * fails in the locking path. 9693 * So we can't actually assert() here. 9694 */ 9695 9696 /* Swap the locked IO into a temporary list, and then try to submit them again. 9697 * We could hyper-optimize this to only resubmit locked I/O that overlap 9698 * with the range that was just unlocked, but this isn't a performance path so 9699 * we go for simplicity here. 9700 */ 9701 TAILQ_INIT(&io_locked); 9702 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 9703 while (!TAILQ_EMPTY(&io_locked)) { 9704 bdev_io = TAILQ_FIRST(&io_locked); 9705 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 9706 bdev_io_submit(bdev_io); 9707 } 9708 9709 spdk_bdev_for_each_channel_continue(i, 0); 9710 } 9711 9712 static int 9713 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 9714 lock_range_cb cb_fn, void *cb_arg) 9715 { 9716 struct locked_lba_range_ctx *ctx; 9717 struct lba_range *range; 9718 9719 spdk_spin_lock(&bdev->internal.spinlock); 9720 /* To start the unlock process, we find the range in the bdev's locked_ranges 9721 * and remove it. This ensures new channels don't inherit the locked range. 9722 * Then we will send a message to each channel to remove the range from its 9723 * per-channel list.
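* While dropping its copy of the range, each channel also resubmits every I/O
* that was parked on its io_locked list (all of it, not just I/O overlapping
* this range; see bdev_unlock_lba_range_get_channel above).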
9724 */ 9725 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 9726 if (range->offset == offset && range->length == length && 9727 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 9728 break; 9729 } 9730 } 9731 if (range == NULL) { 9732 assert(false); 9733 spdk_spin_unlock(&bdev->internal.spinlock); 9734 return -EINVAL; 9735 } 9736 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 9737 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9738 spdk_spin_unlock(&bdev->internal.spinlock); 9739 9740 ctx->cb_fn = cb_fn; 9741 ctx->cb_arg = cb_arg; 9742 9743 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9744 bdev_unlock_lba_range_cb); 9745 return 0; 9746 } 9747 9748 static int 9749 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9750 uint64_t offset, uint64_t length, 9751 lock_range_cb cb_fn, void *cb_arg) 9752 { 9753 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9754 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9755 struct lba_range *range; 9756 bool range_found = false; 9757 9758 /* Let's make sure the specified channel actually has a lock on 9759 * the specified range. Note that the range must match exactly. 9760 */ 9761 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9762 if (range->offset == offset && range->length == length && 9763 range->owner_ch == ch && range->locked_ctx == cb_arg) { 9764 range_found = true; 9765 break; 9766 } 9767 } 9768 9769 if (!range_found) { 9770 return -EINVAL; 9771 } 9772 9773 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 9774 } 9775 9776 struct bdev_quiesce_ctx { 9777 spdk_bdev_quiesce_cb cb_fn; 9778 void *cb_arg; 9779 }; 9780 9781 static void 9782 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 9783 { 9784 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9785 9786 if (quiesce_ctx->cb_fn != NULL) { 9787 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9788 } 9789 9790 free(quiesce_ctx); 9791 } 9792 9793 static void 9794 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 9795 { 9796 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9797 struct spdk_bdev_module *module = range->bdev->module; 9798 9799 if (status != 0) { 9800 if (quiesce_ctx->cb_fn != NULL) { 9801 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9802 } 9803 free(quiesce_ctx); 9804 return; 9805 } 9806 9807 spdk_spin_lock(&module->internal.spinlock); 9808 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 9809 spdk_spin_unlock(&module->internal.spinlock); 9810 9811 if (quiesce_ctx->cb_fn != NULL) { 9812 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9813 quiesce_ctx->cb_fn = NULL; 9814 quiesce_ctx->cb_arg = NULL; 9815 } 9816 /* quiesce_ctx will be freed on unquiesce */ 9817 } 9818 9819 static int 9820 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9821 uint64_t offset, uint64_t length, 9822 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 9823 bool unquiesce) 9824 { 9825 struct bdev_quiesce_ctx *quiesce_ctx; 9826 int rc; 9827 9828 if (module != bdev->module) { 9829 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 9830 return -EINVAL; 9831 } 9832 9833 if (!bdev_io_valid_blocks(bdev, offset, length)) { 9834 return -EINVAL; 9835 } 9836 9837 if (unquiesce) { 9838 struct lba_range *range; 9839 9840 /* Make sure the specified range is actually quiesced in the specified module and 9841 * then remove it from the list. Note that the range must match exactly. 
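* The bdev_quiesce_ctx allocated when the range was quiesced is recovered from
* range->locked_ctx below; it is reused to carry the unquiesce callback and is
* freed in bdev_unquiesce_range_unlocked().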
9842 */ 9843 spdk_spin_lock(&module->internal.spinlock); 9844 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 9845 if (range->bdev == bdev && range->offset == offset && range->length == length) { 9846 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 9847 break; 9848 } 9849 } 9850 spdk_spin_unlock(&module->internal.spinlock); 9851 9852 if (range == NULL) { 9853 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 9854 return -EINVAL; 9855 } 9856 9857 quiesce_ctx = range->locked_ctx; 9858 quiesce_ctx->cb_fn = cb_fn; 9859 quiesce_ctx->cb_arg = cb_arg; 9860 9861 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 9862 } else { 9863 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 9864 if (quiesce_ctx == NULL) { 9865 return -ENOMEM; 9866 } 9867 9868 quiesce_ctx->cb_fn = cb_fn; 9869 quiesce_ctx->cb_arg = cb_arg; 9870 9871 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 9872 if (rc != 0) { 9873 free(quiesce_ctx); 9874 } 9875 } 9876 9877 return rc; 9878 } 9879 9880 int 9881 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9882 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9883 { 9884 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 9885 } 9886 9887 int 9888 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9889 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9890 { 9891 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 9892 } 9893 9894 int 9895 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9896 uint64_t offset, uint64_t length, 9897 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9898 { 9899 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 9900 } 9901 9902 int 9903 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9904 uint64_t offset, uint64_t length, 9905 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9906 { 9907 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 9908 } 9909 9910 int 9911 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 9912 int array_size) 9913 { 9914 if (!bdev) { 9915 return -EINVAL; 9916 } 9917 9918 if (bdev->fn_table->get_memory_domains) { 9919 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 9920 } 9921 9922 return 0; 9923 } 9924 9925 struct spdk_bdev_for_each_io_ctx { 9926 void *ctx; 9927 spdk_bdev_io_fn fn; 9928 spdk_bdev_for_each_io_cb cb; 9929 }; 9930 9931 static void 9932 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9933 struct spdk_io_channel *io_ch, void *_ctx) 9934 { 9935 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 9936 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 9937 struct spdk_bdev_io *bdev_io; 9938 int rc = 0; 9939 9940 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 9941 rc = ctx->fn(ctx->ctx, bdev_io); 9942 if (rc != 0) { 9943 break; 9944 } 9945 } 9946 9947 spdk_bdev_for_each_channel_continue(i, rc); 9948 } 9949 9950 static void 9951 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 9952 { 9953 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 9954 9955 ctx->cb(ctx->ctx, status); 9956 9957 free(ctx); 9958 } 9959 9960 void 9961 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 9962 spdk_bdev_for_each_io_cb cb) 9963 { 9964 
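/* Walk every channel (each visited on its own thread) and call fn for every
 * bdev_io currently on that channel's io_submitted list. Iteration over a
 * channel stops as soon as fn returns non-zero; cb is invoked once the walk
 * finishes, or immediately with -ENOMEM if this context cannot be allocated.
 */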
struct spdk_bdev_for_each_io_ctx *ctx; 9965 9966 assert(fn != NULL && cb != NULL); 9967 9968 ctx = calloc(1, sizeof(*ctx)); 9969 if (ctx == NULL) { 9970 SPDK_ERRLOG("Failed to allocate context.\n"); 9971 cb(_ctx, -ENOMEM); 9972 return; 9973 } 9974 9975 ctx->ctx = _ctx; 9976 ctx->fn = fn; 9977 ctx->cb = cb; 9978 9979 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 9980 bdev_for_each_io_done); 9981 } 9982 9983 void 9984 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 9985 { 9986 spdk_for_each_channel_continue(iter->i, status); 9987 } 9988 9989 static struct spdk_bdev * 9990 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 9991 { 9992 void *io_device = spdk_io_channel_iter_get_io_device(i); 9993 9994 return __bdev_from_io_dev(io_device); 9995 } 9996 9997 static void 9998 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 9999 { 10000 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10001 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10002 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10003 10004 iter->i = i; 10005 iter->fn(iter, bdev, ch, iter->ctx); 10006 } 10007 10008 static void 10009 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10010 { 10011 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10012 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10013 10014 iter->i = i; 10015 iter->cpl(bdev, iter->ctx, status); 10016 10017 free(iter); 10018 } 10019 10020 void 10021 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10022 void *ctx, spdk_bdev_for_each_channel_done cpl) 10023 { 10024 struct spdk_bdev_channel_iter *iter; 10025 10026 assert(bdev != NULL && fn != NULL && ctx != NULL); 10027 10028 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10029 if (iter == NULL) { 10030 SPDK_ERRLOG("Unable to allocate iterator\n"); 10031 assert(false); 10032 return; 10033 } 10034 10035 iter->fn = fn; 10036 iter->cpl = cpl; 10037 iter->ctx = ctx; 10038 10039 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10040 iter, bdev_each_channel_cpl); 10041 } 10042 10043 static void 10044 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10045 { 10046 struct spdk_bdev_io *parent_io = cb_arg; 10047 10048 spdk_bdev_free_io(bdev_io); 10049 10050 /* Check return status of write */ 10051 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10052 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10053 } 10054 10055 static void 10056 bdev_copy_do_write(void *_bdev_io) 10057 { 10058 struct spdk_bdev_io *bdev_io = _bdev_io; 10059 int rc; 10060 10061 /* Write blocks */ 10062 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10063 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10064 bdev_io->u.bdev.iovs[0].iov_base, 10065 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10066 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10067 10068 if (rc == -ENOMEM) { 10069 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10070 } else if (rc != 0) { 10071 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10072 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10073 } 10074 } 10075 10076 static void 10077 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10078 { 10079 struct spdk_bdev_io *parent_io = cb_arg; 10080 10081 spdk_bdev_free_io(bdev_io); 10082 10083 /* Check return status of read */ 10084 if (!success) { 10085 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10086 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10087 return; 10088 } 10089 10090 /* Do write */ 10091 bdev_copy_do_write(parent_io); 10092 } 10093 10094 static void 10095 bdev_copy_do_read(void *_bdev_io) 10096 { 10097 struct spdk_bdev_io *bdev_io = _bdev_io; 10098 int rc; 10099 10100 /* Read blocks */ 10101 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10102 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10103 bdev_io->u.bdev.iovs[0].iov_base, 10104 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10105 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10106 10107 if (rc == -ENOMEM) { 10108 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10109 } else if (rc != 0) { 10110 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10111 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10112 } 10113 } 10114 10115 static void 10116 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10117 { 10118 if (!success) { 10119 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10120 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10121 return; 10122 } 10123 10124 bdev_copy_do_read(bdev_io); 10125 } 10126 10127 int 10128 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10129 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10130 spdk_bdev_io_completion_cb cb, void *cb_arg) 10131 { 10132 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10133 struct spdk_bdev_io *bdev_io; 10134 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10135 10136 if (!desc->write) { 10137 return -EBADF; 10138 } 10139 10140 if (num_blocks == 0) { 10141 SPDK_ERRLOG("Can't copy 0 blocks\n"); 10142 return -EINVAL; 10143 } 10144 10145 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10146 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10147 SPDK_DEBUGLOG(bdev, 10148 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10149 dst_offset_blocks, src_offset_blocks, num_blocks); 10150 return -EINVAL; 10151 } 10152 10153 bdev_io = bdev_channel_get_io(channel); 10154 if (!bdev_io) { 10155 return -ENOMEM; 10156 } 10157 10158 bdev_io->internal.ch = channel; 10159 
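/* The destination LBA goes in the generic offset_blocks field; the source LBA
 * is carried separately in u.bdev.copy.src_offset_blocks. No buffer is
 * attached at this point - iovs/md_buf stay NULL and are only populated via
 * spdk_bdev_io_get_buf() when the copy has to be emulated with read + write.
 */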
bdev_io->internal.desc = desc; 10160 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10161 10162 bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10163 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10164 bdev_io->u.bdev.num_blocks = num_blocks; 10165 bdev_io->u.bdev.memory_domain = NULL; 10166 bdev_io->u.bdev.memory_domain_ctx = NULL; 10167 bdev_io->u.bdev.iovs = NULL; 10168 bdev_io->u.bdev.iovcnt = 0; 10169 bdev_io->u.bdev.md_buf = NULL; 10170 bdev_io->u.bdev.accel_sequence = NULL; 10171 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10172 10173 if (dst_offset_blocks == src_offset_blocks) { 10174 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 10175 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 10176 10177 return 0; 10178 } 10179 10180 10181 /* If the copy size is large and should be split, use the generic split logic 10182 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10183 * 10184 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10185 * emulate it using regular read and write requests otherwise. 10186 */ 10187 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10188 bdev_io->internal.split) { 10189 bdev_io_submit(bdev_io); 10190 return 0; 10191 } 10192 10193 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10194 10195 return 0; 10196 } 10197 10198 SPDK_LOG_REGISTER_COMPONENT(bdev) 10199 10200 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10201 { 10202 struct spdk_trace_tpoint_opts opts[] = { 10203 { 10204 "BDEV_IO_START", TRACE_BDEV_IO_START, 10205 OWNER_BDEV, OBJECT_BDEV_IO, 1, 10206 { 10207 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10208 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10209 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10210 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10211 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 10212 } 10213 }, 10214 { 10215 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10216 OWNER_BDEV, OBJECT_BDEV_IO, 0, 10217 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 10218 }, 10219 { 10220 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10221 OWNER_BDEV, OBJECT_NONE, 1, 10222 { 10223 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 10224 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 10225 } 10226 }, 10227 { 10228 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10229 OWNER_BDEV, OBJECT_NONE, 0, 10230 { 10231 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 10232 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 10233 } 10234 }, 10235 }; 10236 10237 10238 spdk_trace_register_owner(OWNER_BDEV, 'b'); 10239 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10240 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10241 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10242 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10243 } 10244
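/*
 * Illustrative caller-side sketch (comment only, not compiled): one plausible
 * way to drive spdk_bdev_copy_blocks() from an opened descriptor and channel.
 * The names my_copy_done, my_desc, my_ch and copy_rc are hypothetical and do
 * not exist in SPDK; error handling is reduced to the minimum.
 *
 *	static void
 *	my_copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		int *copy_rc = cb_arg;
 *
 *		*copy_rc = success ? 0 : -EIO;
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	...
 *	// Copy 16 blocks starting at LBA 0 to LBA 1024 on the same bdev.
 *	// -ENOMEM from the submit path means "retry later", e.g. via
 *	// spdk_bdev_queue_io_wait(); other non-zero values are hard errors.
 *	rc = spdk_bdev_copy_blocks(my_desc, my_ch, 1024, 0, 16,
 *				   my_copy_done, &copy_rc);
 */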