/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)
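
/*
 * Illustrative note (not from the original source): the QoS constants above
 * convert a per-second limit into a per-timeslice quota.  With the default
 * SPDK_BDEV_QOS_TIMESLICE_IN_USEC of 1000 us there are 1000 timeslices per
 * second, so, for example, a limit of 10000 rw_ios_per_sec works out to
 *
 *	max_per_timeslice = 10000 / 1000 = 10 I/Os per timeslice,
 *
 * clamped to at least SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE.  Byte limits are
 * divided the same way and clamped to at least
 * SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE (512).
 */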

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
	"rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
};

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	struct spdk_bdev	*bdev;
	uint64_t		offset;
	uint64_t		length;
	void			*locked_ctx;
	struct spdk_thread	*owner_thread;
	struct spdk_bdev_channel *owner_ch;
	TAILQ_ENTRY(lba_range)	tailq;
	TAILQ_ENTRY(lba_range)	tailq_module;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE,
	.iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Types of structure of rate limits. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;
	uint32_t	bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * will queue here their IO that awaits retry. It makes it possible to retry sending
 * IO to one bdev after IO from other bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t	io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t	nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t	nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel	*shared_ch;

	struct spdk_poller	*nomem_poller;

	/* Refcount of bdev channels using this resource */
	uint32_t	ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Accel channel */
	struct spdk_io_channel	*accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t		io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t		io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t		io_locked;

	/* List of I/Os with accel sequence being currently executed */
	bdev_io_tailq_t		io_accel_exec;

	/* List of I/Os doing memory domain pull/push */
	bdev_io_tailq_t		io_memory_domain;

	uint32_t		flags;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
	struct spdk_bdev_io_stat *prev_stat;
#endif

	bdev_io_tailq_t		queued_resets;

	lba_range_tailq_t	locked_ranges;
};

struct media_event_entry {
	struct spdk_bdev_media_event	event;
	TAILQ_ENTRY(media_event_entry)	tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev		*bdev;
	struct spdk_thread		*thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	} callback;
	bool				closed;
	bool				write;
	bool				memory_domains_supported;
	bool				accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
	struct spdk_spinlock		spinlock;
	uint32_t			refs;
	TAILQ_HEAD(, media_event_entry)	pending_media_events;
	TAILQ_HEAD(, media_event_entry)	free_media_events;
	struct media_event_entry	*media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc)	link;

	uint64_t		timeout_in_sec;
	spdk_bdev_io_timeout_cb	cb_fn;
	void			*cb_arg;
	struct spdk_poller	*io_timeout_poller;
	struct spdk_bdev_module_claim	*claim;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

struct spdk_bdev_channel_iter {
	spdk_bdev_for_each_channel_msg fn;
	spdk_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

struct spdk_bdev_io_error_stat {
	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
};

enum bdev_io_retry_state {
	BDEV_IO_RETRY_STATE_INVALID,
	BDEV_IO_RETRY_STATE_PULL,
	BDEV_IO_RETRY_STATE_PULL_MD,
	BDEV_IO_RETRY_STATE_SUBMIT,
	BDEV_IO_RETRY_STATE_PUSH,
	BDEV_IO_RETRY_STATE_PUSH_MD,
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
#define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
#define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))
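
/*
 * Illustrative note (not from the original source): __bdev_to_io_dev() offsets
 * the bdev pointer by one byte, presumably so that the io_device the bdev layer
 * registers is distinct from the bdev pointer itself, which may be used as an
 * io_device key elsewhere.  The two macros are inverses of each other:
 *
 *	void *io_dev = __bdev_to_io_dev(bdev);
 *	assert(__bdev_from_io_dev(io_dev) == bdev);
 */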

static inline void bdev_io_complete(void *ctx);
static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);

static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

#define bdev_get_ext_io_opt(opts, field, defval) \
	(((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \
	  sizeof((opts)->field) <= (opts)->size) ? (opts)->field : (defval))

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	/* Do not remove this statement; always update it when you add a new field,
	 * and do not forget to add the SET_FIELD statement for your added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}
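
/*
 * Illustrative usage sketch (not part of the original source): a caller that
 * wants to change the global bdev options before the bdev layer is initialized
 * could do something like the following.  Field and function names are taken
 * from the definitions in this file and in spdk/bdev.h.
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_pool_size = 8192;
 *	opts.bdev_auto_examine = false;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		// opts_size or pool-size validation failed
 *	}
 */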

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}

static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = bdev_get_by_name(bdev_name);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return bdev;
}

struct bdev_io_status_string {
	enum spdk_bdev_io_status status;
	const char *str;
};

static const struct bdev_io_status_string bdev_io_status_strings[] = {
	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
};

static const char *
bdev_io_status_get_string(enum spdk_bdev_io_status status)
{
	uint32_t i;

	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
		if (bdev_io_status_strings[i].status == status) {
			return bdev_io_status_strings[i].str;
		}
	}

	return "reserved";
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller		*poller;
	spdk_bdev_wait_for_examine_cb	cb_fn;
	void				*cb_arg;
};

static bool bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}
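
/*
 * Illustrative usage sketch (not part of the original source): callers that
 * need all registered bdevs to finish examination before continuing typically
 * pass a completion callback, e.g.:
 *
 *	static void
 *	examine_done(void *cb_arg)
 *	{
 *		// safe to start using the bdevs here
 *	}
 *
 *	rc = spdk_bdev_wait_for_examine(examine_done, NULL);
 *	if (rc != 0) {
 *		// -ENOMEM is the only failure mode above
 *	}
 *
 * "examine_done" is a hypothetical name used only for illustration.
 */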

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	struct spdk_bdev_module_claim *claim, *tmpclaim;
	uint32_t action;

	if (!bdev_ok_to_examine(bdev)) {
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config) {
			spdk_spin_lock(&module->internal.spinlock);
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call "
					    "spdk_bdev_module_examine_done()\n", module->name);
			}
		}
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* Examine by all bdev modules */
		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (module->examine_disk) {
				spdk_spin_lock(&module->internal.spinlock);
				module->internal.action_in_progress++;
				spdk_spin_unlock(&module->internal.spinlock);
				spdk_spin_unlock(&bdev->internal.spinlock);
				module->examine_disk(bdev);
				spdk_spin_lock(&bdev->internal.spinlock);
			}
		}
		break;
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Examine by the one bdev module with a v1 claim */
		module = bdev->internal.claim.v1.module;
		if (module->examine_disk) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			return;
		}
		break;
	default:
		/* Examine by all bdev modules with a v2 claim */
		assert(claim_type_is_v2(bdev->internal.claim_type));
		/*
		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
		 * list, perhaps accessing freed memory.  Without protection, this could happen
		 * while the lock is dropped during the examine callback.
		 */
		bdev->internal.examine_in_progress++;

		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
			module = claim->module;

			if (module == NULL) {
				/* This is a vestigial claim, held by examine_count */
				continue;
			}

			if (module->examine_disk == NULL) {
				continue;
			}

			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);

			/* Call examine_disk without holding internal.spinlock. */
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			spdk_spin_lock(&bdev->internal.spinlock);
		}

		assert(bdev->internal.examine_in_progress > 0);
		bdev->internal.examine_in_progress--;
		if (bdev->internal.examine_in_progress == 0) {
			/* Remove any claims that were released during examine_disk */
			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
				if (claim->desc != NULL) {
					continue;
				}

				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
				free(claim);
			}
			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
				claim_reset(bdev);
			}
		}
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;
	struct spdk_thread *thread = spdk_get_thread();

	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
			    thread ? spdk_thread_get_name(thread) : "null");
		return -EINVAL;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}
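
/*
 * Illustrative usage sketch (not part of the original source): enumerating all
 * registered bdevs with the iterators above.  spdk_bdev_first()/spdk_bdev_next()
 * also return bdevs that are claimed by virtual bdev modules; the *_leaf
 * variants below skip claimed bdevs.
 *
 *	struct spdk_bdev *bdev;
 *
 *	for (bdev = spdk_bdev_first(); bdev != NULL; bdev = spdk_bdev_next(bdev)) {
 *		printf("found bdev %s\n", spdk_bdev_get_name(bdev));
 *	}
 */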

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static inline bool
bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.memory_domain;
}

static inline bool
bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.has_accel_sequence;
}

static inline void
bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
	 * channels we will instead wait for half to complete.
	 */
	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
}

static inline void
bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}
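
/*
 * Illustrative note (not from the original source): spdk_bdev_get_buf_align()
 * returns a power of two, so the check above reduces to a mask test.  For
 * example, with a required alignment of 4096 (0x1000):
 *
 *	iov_base = 0x200200;  (iov_base & (4096 - 1)) == 0x200 -> misaligned, bounce buffer needed
 *	iov_base = 0x201000;  (iov_base & (4096 - 1)) == 0     -> aligned, buffer can be used as-is
 */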

static inline bool
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io->internal.accel_sequence) {
		return false;
	}

	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
	 * bdev module didn't support accel sequences */
	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split;
}

static inline void
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
}

static inline void
bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static void
bdev_io_submit_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;

	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io->internal.accel_sequence = NULL;

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	bdev_io_submit(bdev_io);
}

static void
bdev_io_exec_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);

	/* Since the operations are appended during submission, they're in the opposite order than
	 * how we want to execute them for reads (i.e. we need to execute the most recently added
	 * operation first), so reverse the sequence before executing it.
	 */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
	}

	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_increment_outstanding(ch, ch->shared_resource);
	bdev_io->internal.data_transfer_cpl = cb_fn;

	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
				   bdev_io_exec_sequence_cb, bdev_io);
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf;
		bdev_io->internal.buf = NULL;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_pull_buffer_cpl(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	bdev_io_get_buf_complete(bdev_io, !rc);
}

static void
bdev_io_pull_md_buf_done(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  &bdev_io->internal.orig_md_iov, 1,
							  &bdev_io->internal.bounce_md_iov, 1,
							  bdev_io_pull_md_buf_done, bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain), rc);
			}
		} else {
			memcpy(bdev_io->internal.bounce_md_iov.iov_base,
			       bdev_io->internal.orig_md_iov.iov_base,
			       bdev_io->internal.orig_md_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}
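
/*
 * Summary added for clarity (not from the original source): the bounce-buffer
 * helpers below all follow one pattern.  On the write path the original
 * payload is "pulled" into the internal bounce buffer before the request is
 * handed to the bdev module; on the read path the data lands in the bounce
 * buffer first and is "pushed" back into the caller's buffers after
 * completion.  When the payload lives in a memory domain, the copy is done
 * asynchronously via spdk_memory_domain_pull_data()/spdk_memory_domain_push_data();
 * otherwise a plain memcpy()/spdk_copy_*() is used.  A -ENOMEM result parks the
 * I/O on the shared resource's nomem_io queue for a later retry.
 */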

static void
_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	/* save original md_buf */
	bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.orig_md_iov.iov_len = len;
	bdev_io->internal.bounce_md_iov.iov_base = md_buf;
	bdev_io->internal.bounce_md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	bdev_io_pull_md_buf(bdev_io);
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		assert(!bdev_io_use_accel_sequence(bdev_io));

		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static inline void
bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	if (rc) {
		SPDK_ERRLOG("Failed to get data buffer\n");
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	_bdev_io_set_md_buf(bdev_io);
}

static void
bdev_io_pull_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_pull_data_done(bdev_io, status);
}

static void
bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	/* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
	 * sequence, append a copy operation making accel change the src/dst buffers of the previous
	 * operation */
	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) ||
	    (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io->internal.memory_domain,
						    bdev_io->internal.memory_domain_ctx,
						    0, NULL, NULL);
		} else {
			/* We need to reverse the src/dst for reads */
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io->internal.memory_domain,
						    bdev_io->internal.memory_domain_ctx,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL, 0, NULL, NULL);
		}

		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		/* if this is write path, copy data from original buffer to bounce buffer */
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t) bdev_io->internal.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  bdev_io_pull_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			assert(bdev_io->u.bdev.iovcnt == 1);
			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
					      bdev_io->u.bdev.iovs[0].iov_len,
					      bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data_done(bdev_io, rc);
	}
}

static void
_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
			      bdev_copy_bounce_buffer_cpl cpl_cb)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;

	bdev_io->internal.data_transfer_cpl = cpl_cb;
	/* save original iovec */
	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data(bdev_io);
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf = buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
		/* Continue in completion callback */
		return;
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}
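
/*
 * Worked example added for clarity (not from the original source): the helper
 * below sizes the iobuf request as data length + (alignment - 1) + metadata.
 * For a 4 KiB read on a bdev with 512-byte buffer alignment and 8 bytes of
 * separate metadata per 512-byte block (8 blocks):
 *
 *	max_len = 4096 + (512 - 1) + 8 * 8 = 4671 bytes
 *
 * The extra (alignment - 1) bytes guarantee that an aligned pointer can be
 * carved out of the buffer in _bdev_io_set_buf() above.
 */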

static inline uint64_t
bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	/* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */
	alignment = spdk_bdev_get_buf_align(bdev) - 1;

	return len + alignment + md_len;
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.buf != NULL);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
	bdev_io->internal.buf = NULL;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
		    struct spdk_bdev_io *bdev_io)
{
	/* After a request is submitted to a bdev module, the ownership of an accel sequence
	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
	 * sequence pointer to make sure we won't touch it anymore. */
	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
		bdev_io->internal.accel_sequence = NULL;
	}

	bdev->fn_table->submit_request(ioch, bdev_io);
}

static inline void
bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource);
	bdev_io->internal.error.nvme.cdw0 = 0;
	bdev_io->num_retries++;
	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
}
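
/*
 * Worked example added for clarity (not from the original source): the retry
 * threshold set in bdev_queue_nomem_io_head() is
 *
 *	nomem_threshold = max(io_outstanding / 2, io_outstanding - NOMEM_THRESHOLD_COUNT)
 *
 * With 100 I/Os outstanding when NOMEM is hit, the threshold becomes
 * max(50, 92) = 92, i.e. the retry below starts once 8 I/Os have completed.
 * With only 4 outstanding it becomes max(2, -4) = 2, so half of them must
 * complete first.
 */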

static void
bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource)
{
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller.  Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);

		switch (bdev_io->internal.retry_state) {
		case BDEV_IO_RETRY_STATE_SUBMIT:
			bdev_ch_resubmit_io(shared_resource, bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL:
			bdev_io_pull_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL_MD:
			bdev_io_pull_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH:
			bdev_io_push_bounce_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH_MD:
			bdev_io_push_bounce_md_buf(bdev_io);
			break;
		default:
			assert(0 && "invalid retry state");
			break;
		}

		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore.  Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
}

static int
bdev_no_mem_poller(void *ctx)
{
	struct spdk_bdev_shared_resource *shared_resource = ctx;

	spdk_poller_unregister(&shared_resource->nomem_poller);

	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_shared_ch_retry_io(shared_resource);
	}
	if (!TAILQ_EMPTY(&shared_resource->nomem_io) && shared_resource->io_outstanding == 0) {
		/* No IOs were submitted, try again */
		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
						SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
	}

	return SPDK_POLLER_BUSY;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);

		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
			/* Special case: there are nomem IOs queued but no outstanding IOs whose
			 * completions could trigger a retry of the queued IOs.  Completion of any
			 * submitted IO may trigger a retry; this poller handles the case when no
			 * new IOs are submitted, e.g. qd==1 */
			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
							SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
		}
		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted. */
		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
			assert(bdev_io->internal.accel_sequence == NULL);
			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
		}

		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	/* Continue with IO completion flow */
	bdev_io_complete(bdev_io);
}

static void
bdev_io_push_bounce_md_buf_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
}

static inline void
bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
			if (bdev_io_use_memory_domain(bdev_io)) {
				TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_increment_outstanding(ch, ch->shared_resource);
				/* If memory domain is used then we need to call async push function */
				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
								  bdev_io->internal.memory_domain_ctx,
								  &bdev_io->internal.orig_md_iov,
								  (uint32_t)bdev_io->internal.orig_iovcnt,
								  &bdev_io->internal.bounce_md_iov, 1,
								  bdev_io_push_bounce_md_buf_done,
								  bdev_io);
				if (rc == 0) {
					/* Continue IO completion in async callback */
					return;
				}
				TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_decrement_outstanding(ch, ch->shared_resource);
				if (rc != -ENOMEM) {
					SPDK_ERRLOG("Failed to push md to memory domain %s\n",
						    spdk_memory_domain_get_dma_device_id(
							    bdev_io->internal.memory_domain));
				}
			} else {
				memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
				       bdev_io->internal.orig_md_iov.iov_len);
			}
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}

static inline void
bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	assert(bdev_io->internal.data_transfer_cpl);
	if (rc) {
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
	/* disable bouncing buffer for this io */
	bdev_io->internal.orig_iovcnt = 0;
	bdev_io->internal.orig_iovs = NULL;

	bdev_io_push_bounce_md_buf(bdev_io);
}

static void
bdev_io_push_bounce_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_push_bounce_data_done(bdev_io, status);
}

static inline void
bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	assert(!bdev_io_use_accel_sequence(bdev_io));

	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			/* If memory domain is used then we need to call async push function */
			rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t)bdev_io->internal.orig_iovcnt,
							  &bdev_io->internal.bounce_iov, 1,
							  bdev_io_push_bounce_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue IO completion in async callback */
				return;
			}

			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to push data to memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt,
					      bdev_io->internal.bounce_iov.iov_base,
					      bdev_io->internal.bounce_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH);
	} else {
		bdev_io_push_bounce_data_done(bdev_io, rc);
	}
}

static inline void
_bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
{
	bdev_io->internal.data_transfer_cpl = cpl_cb;
	bdev_io_push_bounce_data(bdev_io);
}

static void
bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
{
	struct spdk_bdev_io *bdev_io;

	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len);
}

static void
bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	uint64_t max_len;
	void *buf;

	assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	max_len = bdev_io_get_max_buf_len(bdev_io, len);

	if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) {
		SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
		bdev_io_get_buf_complete(bdev_io, false);
		return;
	}

	bdev_io->internal.buf_len = len;
	buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
			     bdev_io_get_iobuf_cb);
	if (buf != NULL) {
		_bdev_io_set_buf(bdev_io, buf, len);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t alignment;

	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	alignment = spdk_bdev_get_buf_align(bdev);

	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
		/* Buffer already present and aligned */
		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
		return;
	}

	bdev_io_get_buf(bdev_io, len);
}

static void
_bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
			      bool success)
{
	if (!success) {
		SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
			return;
		}
		/* For reads we'll execute the sequence after the data is read, so, for now, only
		 * clear out accel_sequence pointer and submit the IO */
		assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
		bdev_io->u.bdev.accel_sequence = NULL;
	}

	bdev_io_submit(bdev_io);
}

static void
_bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
			       uint64_t len)
{
	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	bdev_io_get_buf(bdev_io, len);
}

void
spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(cb != NULL);
	assert(bdev_io->internal.get_aux_buf_cb == NULL);
	bdev_io->internal.get_aux_buf_cb = cb;
	bdev_io_get_buf(bdev_io, len);
}

static int
bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}
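
/*
 * Illustrative usage sketch (not part of the original source): a bdev module's
 * submit_request() path typically defers READ handling until a data buffer is
 * available by calling spdk_bdev_io_get_buf() with a callback, e.g.:
 *
 *	static void
 *	my_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
 *			   bool success)
 *	{
 *		if (!success) {
 *			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 *			return;
 *		}
 *		// issue the actual read using bdev_io->u.bdev.iovs
 *	}
 *
 *	spdk_bdev_io_get_buf(bdev_io, my_read_get_buf_cb,
 *			     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 *
 * "my_read_get_buf_cb" is a hypothetical name used only for illustration.
 */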
spdk_json_write_named_string(w, "name", bdev->name); 1862 1863 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1864 spdk_json_write_object_end(w); 1865 1866 spdk_json_write_object_end(w); 1867 } 1868 1869 static void 1870 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1871 { 1872 int i; 1873 struct spdk_bdev_qos *qos = bdev->internal.qos; 1874 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1875 1876 if (!qos) { 1877 return; 1878 } 1879 1880 spdk_bdev_get_qos_rate_limits(bdev, limits); 1881 1882 spdk_json_write_object_begin(w); 1883 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1884 1885 spdk_json_write_named_object_begin(w, "params"); 1886 spdk_json_write_named_string(w, "name", bdev->name); 1887 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1888 if (limits[i] > 0) { 1889 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1890 } 1891 } 1892 spdk_json_write_object_end(w); 1893 1894 spdk_json_write_object_end(w); 1895 } 1896 1897 void 1898 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1899 { 1900 struct spdk_bdev_module *bdev_module; 1901 struct spdk_bdev *bdev; 1902 1903 assert(w != NULL); 1904 1905 spdk_json_write_array_begin(w); 1906 1907 spdk_json_write_object_begin(w); 1908 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1909 spdk_json_write_named_object_begin(w, "params"); 1910 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1911 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1912 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1913 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 1914 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 1915 spdk_json_write_object_end(w); 1916 spdk_json_write_object_end(w); 1917 1918 bdev_examine_allowlist_config_json(w); 1919 1920 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1921 if (bdev_module->config_json) { 1922 bdev_module->config_json(w); 1923 } 1924 } 1925 1926 spdk_spin_lock(&g_bdev_mgr.spinlock); 1927 1928 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1929 if (bdev->fn_table->write_config_json) { 1930 bdev->fn_table->write_config_json(bdev, w); 1931 } 1932 1933 bdev_qos_config_json(bdev, w); 1934 bdev_enable_histogram_config_json(bdev, w); 1935 } 1936 1937 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1938 1939 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1940 spdk_json_write_object_begin(w); 1941 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1942 spdk_json_write_object_end(w); 1943 1944 spdk_json_write_array_end(w); 1945 } 1946 1947 static void 1948 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1949 { 1950 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1951 struct spdk_bdev_io *bdev_io; 1952 1953 spdk_iobuf_channel_fini(&ch->iobuf); 1954 1955 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1956 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1957 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1958 ch->per_thread_cache_count--; 1959 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1960 } 1961 1962 assert(ch->per_thread_cache_count == 0); 1963 } 1964 1965 static int 1966 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1967 { 1968 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1969 struct 
spdk_bdev_io *bdev_io; 1970 uint32_t i; 1971 int rc; 1972 1973 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 1974 g_bdev_opts.iobuf_small_cache_size, 1975 g_bdev_opts.iobuf_large_cache_size); 1976 if (rc != 0) { 1977 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1978 return -1; 1979 } 1980 1981 STAILQ_INIT(&ch->per_thread_cache); 1982 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1983 1984 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1985 ch->per_thread_cache_count = 0; 1986 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1987 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1988 if (bdev_io == NULL) { 1989 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1990 assert(false); 1991 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1992 return -1; 1993 } 1994 ch->per_thread_cache_count++; 1995 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1996 } 1997 1998 TAILQ_INIT(&ch->shared_resources); 1999 TAILQ_INIT(&ch->io_wait_queue); 2000 2001 return 0; 2002 } 2003 2004 static void 2005 bdev_init_complete(int rc) 2006 { 2007 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2008 void *cb_arg = g_init_cb_arg; 2009 struct spdk_bdev_module *m; 2010 2011 g_bdev_mgr.init_complete = true; 2012 g_init_cb_fn = NULL; 2013 g_init_cb_arg = NULL; 2014 2015 /* 2016 * For modules that need to know when subsystem init is complete, 2017 * inform them now. 2018 */ 2019 if (rc == 0) { 2020 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2021 if (m->init_complete) { 2022 m->init_complete(); 2023 } 2024 } 2025 } 2026 2027 cb_fn(cb_arg, rc); 2028 } 2029 2030 static bool 2031 bdev_module_all_actions_completed(void) 2032 { 2033 struct spdk_bdev_module *m; 2034 2035 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2036 if (m->internal.action_in_progress > 0) { 2037 return false; 2038 } 2039 } 2040 return true; 2041 } 2042 2043 static void 2044 bdev_module_action_complete(void) 2045 { 2046 /* 2047 * Don't finish bdev subsystem initialization if 2048 * module pre-initialization is still in progress, or 2049 * the subsystem has already been initialized. 2050 */ 2051 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2052 return; 2053 } 2054 2055 /* 2056 * Check all bdev modules for inits/examinations in progress. If any 2057 * exist, return immediately since we cannot finish bdev subsystem 2058 * initialization until all are completed. 2059 */ 2060 if (!bdev_module_all_actions_completed()) { 2061 return; 2062 } 2063 2064 /* 2065 * Modules already finished initialization - now that all 2066 * the bdev modules have finished their asynchronous I/O 2067 * processing, the entire bdev layer can be marked as complete.
2068 */ 2069 bdev_init_complete(0); 2070 } 2071 2072 static void 2073 bdev_module_action_done(struct spdk_bdev_module *module) 2074 { 2075 spdk_spin_lock(&module->internal.spinlock); 2076 assert(module->internal.action_in_progress > 0); 2077 module->internal.action_in_progress--; 2078 spdk_spin_unlock(&module->internal.spinlock); 2079 bdev_module_action_complete(); 2080 } 2081 2082 void 2083 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2084 { 2085 assert(module->async_init); 2086 bdev_module_action_done(module); 2087 } 2088 2089 void 2090 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2091 { 2092 bdev_module_action_done(module); 2093 } 2094 2095 /** The last initialized bdev module */ 2096 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2097 2098 static void 2099 bdev_init_failed(void *cb_arg) 2100 { 2101 struct spdk_bdev_module *module = cb_arg; 2102 2103 spdk_spin_lock(&module->internal.spinlock); 2104 assert(module->internal.action_in_progress > 0); 2105 module->internal.action_in_progress--; 2106 spdk_spin_unlock(&module->internal.spinlock); 2107 bdev_init_complete(-1); 2108 } 2109 2110 static int 2111 bdev_modules_init(void) 2112 { 2113 struct spdk_bdev_module *module; 2114 int rc = 0; 2115 2116 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2117 g_resume_bdev_module = module; 2118 if (module->async_init) { 2119 spdk_spin_lock(&module->internal.spinlock); 2120 module->internal.action_in_progress = 1; 2121 spdk_spin_unlock(&module->internal.spinlock); 2122 } 2123 rc = module->module_init(); 2124 if (rc != 0) { 2125 /* Bump action_in_progress to prevent other modules from completion of modules_init 2126 * Send message to defer application shutdown until resources are cleaned up */ 2127 spdk_spin_lock(&module->internal.spinlock); 2128 module->internal.action_in_progress = 1; 2129 spdk_spin_unlock(&module->internal.spinlock); 2130 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2131 return rc; 2132 } 2133 } 2134 2135 g_resume_bdev_module = NULL; 2136 return 0; 2137 } 2138 2139 void 2140 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2141 { 2142 int rc = 0; 2143 char mempool_name[32]; 2144 2145 assert(cb_fn != NULL); 2146 2147 g_init_cb_fn = cb_fn; 2148 g_init_cb_arg = cb_arg; 2149 2150 spdk_notify_type_register("bdev_register"); 2151 spdk_notify_type_register("bdev_unregister"); 2152 2153 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2154 2155 rc = spdk_iobuf_register_module("bdev"); 2156 if (rc != 0) { 2157 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2158 bdev_init_complete(-1); 2159 return; 2160 } 2161 2162 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2163 g_bdev_opts.bdev_io_pool_size, 2164 sizeof(struct spdk_bdev_io) + 2165 bdev_module_get_max_ctx_size(), 2166 0, 2167 SPDK_ENV_SOCKET_ID_ANY); 2168 2169 if (g_bdev_mgr.bdev_io_pool == NULL) { 2170 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2171 bdev_init_complete(-1); 2172 return; 2173 } 2174 2175 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2176 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2177 if (!g_bdev_mgr.zero_buffer) { 2178 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2179 bdev_init_complete(-1); 2180 return; 2181 } 2182 2183 #ifdef SPDK_CONFIG_VTUNE 2184 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2185 #endif 2186 2187 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2188 
bdev_mgmt_channel_destroy, 2189 sizeof(struct spdk_bdev_mgmt_channel), 2190 "bdev_mgr"); 2191 2192 rc = bdev_modules_init(); 2193 g_bdev_mgr.module_init_complete = true; 2194 if (rc != 0) { 2195 SPDK_ERRLOG("bdev modules init failed\n"); 2196 return; 2197 } 2198 2199 bdev_module_action_complete(); 2200 } 2201 2202 static void 2203 bdev_mgr_unregister_cb(void *io_device) 2204 { 2205 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2206 2207 if (g_bdev_mgr.bdev_io_pool) { 2208 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2209 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2210 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2211 g_bdev_opts.bdev_io_pool_size); 2212 } 2213 2214 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2215 } 2216 2217 spdk_free(g_bdev_mgr.zero_buffer); 2218 2219 bdev_examine_allowlist_free(); 2220 2221 cb_fn(g_fini_cb_arg); 2222 g_fini_cb_fn = NULL; 2223 g_fini_cb_arg = NULL; 2224 g_bdev_mgr.init_complete = false; 2225 g_bdev_mgr.module_init_complete = false; 2226 } 2227 2228 static void 2229 bdev_module_fini_iter(void *arg) 2230 { 2231 struct spdk_bdev_module *bdev_module; 2232 2233 /* FIXME: Handling initialization failures is broken now, 2234 * so we won't even try cleaning up after successfully 2235 * initialized modules. if module_init_complete is false, 2236 * just call spdk_bdev_mgr_unregister_cb 2237 */ 2238 if (!g_bdev_mgr.module_init_complete) { 2239 bdev_mgr_unregister_cb(NULL); 2240 return; 2241 } 2242 2243 /* Start iterating from the last touched module */ 2244 if (!g_resume_bdev_module) { 2245 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2246 } else { 2247 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2248 internal.tailq); 2249 } 2250 2251 while (bdev_module) { 2252 if (bdev_module->async_fini) { 2253 /* Save our place so we can resume later. We must 2254 * save the variable here, before calling module_fini() 2255 * below, because in some cases the module may immediately 2256 * call spdk_bdev_module_fini_done() and re-enter 2257 * this function to continue iterating. */ 2258 g_resume_bdev_module = bdev_module; 2259 } 2260 2261 if (bdev_module->module_fini) { 2262 bdev_module->module_fini(); 2263 } 2264 2265 if (bdev_module->async_fini) { 2266 return; 2267 } 2268 2269 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2270 internal.tailq); 2271 } 2272 2273 g_resume_bdev_module = NULL; 2274 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2275 } 2276 2277 void 2278 spdk_bdev_module_fini_done(void) 2279 { 2280 if (spdk_get_thread() != g_fini_thread) { 2281 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2282 } else { 2283 bdev_module_fini_iter(NULL); 2284 } 2285 } 2286 2287 static void 2288 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2289 { 2290 struct spdk_bdev *bdev = cb_arg; 2291 2292 if (bdeverrno && bdev) { 2293 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2294 bdev->name); 2295 2296 /* 2297 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2298 * bdev; try to continue by manually removing this bdev from the list and continue 2299 * with the next bdev in the list. 
2300 */ 2301 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2302 } 2303 2304 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2305 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2306 /* 2307 * Bdev module finish need to be deferred as we might be in the middle of some context 2308 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2309 * after returning. 2310 */ 2311 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2312 return; 2313 } 2314 2315 /* 2316 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2317 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2318 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2319 * base bdevs. 2320 * 2321 * Also, walk the list in the reverse order. 2322 */ 2323 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2324 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2325 spdk_spin_lock(&bdev->internal.spinlock); 2326 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2327 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2328 spdk_spin_unlock(&bdev->internal.spinlock); 2329 continue; 2330 } 2331 spdk_spin_unlock(&bdev->internal.spinlock); 2332 2333 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2334 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2335 return; 2336 } 2337 2338 /* 2339 * If any bdev fails to unclaim underlying bdev properly, we may face the 2340 * case of bdev list consisting of claimed bdevs only (if claims are managed 2341 * correctly, this would mean there's a loop in the claims graph which is 2342 * clearly impossible). Warn and unregister last bdev on the list then. 2343 */ 2344 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2345 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2346 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2347 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2348 return; 2349 } 2350 } 2351 2352 static void 2353 bdev_module_fini_start_iter(void *arg) 2354 { 2355 struct spdk_bdev_module *bdev_module; 2356 2357 if (!g_resume_bdev_module) { 2358 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2359 } else { 2360 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2361 } 2362 2363 while (bdev_module) { 2364 if (bdev_module->async_fini_start) { 2365 /* Save our place so we can resume later. We must 2366 * save the variable here, before calling fini_start() 2367 * below, because in some cases the module may immediately 2368 * call spdk_bdev_module_fini_start_done() and re-enter 2369 * this function to continue iterating. 
*/ 2370 g_resume_bdev_module = bdev_module; 2371 } 2372 2373 if (bdev_module->fini_start) { 2374 bdev_module->fini_start(); 2375 } 2376 2377 if (bdev_module->async_fini_start) { 2378 return; 2379 } 2380 2381 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2382 } 2383 2384 g_resume_bdev_module = NULL; 2385 2386 bdev_finish_unregister_bdevs_iter(NULL, 0); 2387 } 2388 2389 void 2390 spdk_bdev_module_fini_start_done(void) 2391 { 2392 if (spdk_get_thread() != g_fini_thread) { 2393 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2394 } else { 2395 bdev_module_fini_start_iter(NULL); 2396 } 2397 } 2398 2399 static void 2400 bdev_finish_wait_for_examine_done(void *cb_arg) 2401 { 2402 bdev_module_fini_start_iter(NULL); 2403 } 2404 2405 static void bdev_open_async_fini(void); 2406 2407 void 2408 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2409 { 2410 int rc; 2411 2412 assert(cb_fn != NULL); 2413 2414 g_fini_thread = spdk_get_thread(); 2415 2416 g_fini_cb_fn = cb_fn; 2417 g_fini_cb_arg = cb_arg; 2418 2419 bdev_open_async_fini(); 2420 2421 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2422 if (rc != 0) { 2423 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2424 bdev_finish_wait_for_examine_done(NULL); 2425 } 2426 } 2427 2428 struct spdk_bdev_io * 2429 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2430 { 2431 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2432 struct spdk_bdev_io *bdev_io; 2433 2434 if (ch->per_thread_cache_count > 0) { 2435 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2436 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2437 ch->per_thread_cache_count--; 2438 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2439 /* 2440 * Don't try to look for bdev_ios in the global pool if there are 2441 * waiters on bdev_ios - we don't want this caller to jump the line. 2442 */ 2443 bdev_io = NULL; 2444 } else { 2445 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2446 } 2447 2448 return bdev_io; 2449 } 2450 2451 void 2452 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2453 { 2454 struct spdk_bdev_mgmt_channel *ch; 2455 2456 assert(bdev_io != NULL); 2457 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2458 2459 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2460 2461 if (bdev_io->internal.buf != NULL) { 2462 bdev_io_put_buf(bdev_io); 2463 } 2464 2465 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2466 ch->per_thread_cache_count++; 2467 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2468 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2469 struct spdk_bdev_io_wait_entry *entry; 2470 2471 entry = TAILQ_FIRST(&ch->io_wait_queue); 2472 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2473 entry->cb_fn(entry->cb_arg); 2474 } 2475 } else { 2476 /* We should never have a full cache with entries on the io wait queue. 
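	 * (Added note: callers are queued to io_wait_queue only while no spdk_bdev_io is
	 * available, and every spdk_bdev_free_io() into a non-full cache immediately
	 * services waiters, so the cache can only fill up after the wait queue is empty.)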
*/ 2477 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2478 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2479 } 2480 } 2481 2482 static bool 2483 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2484 { 2485 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2486 2487 switch (limit) { 2488 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2489 return true; 2490 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2491 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2492 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2493 return false; 2494 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2495 default: 2496 return false; 2497 } 2498 } 2499 2500 static bool 2501 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2502 { 2503 switch (bdev_io->type) { 2504 case SPDK_BDEV_IO_TYPE_NVME_IO: 2505 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2506 case SPDK_BDEV_IO_TYPE_READ: 2507 case SPDK_BDEV_IO_TYPE_WRITE: 2508 return true; 2509 case SPDK_BDEV_IO_TYPE_ZCOPY: 2510 if (bdev_io->u.bdev.zcopy.start) { 2511 return true; 2512 } else { 2513 return false; 2514 } 2515 default: 2516 return false; 2517 } 2518 } 2519 2520 static bool 2521 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2522 { 2523 switch (bdev_io->type) { 2524 case SPDK_BDEV_IO_TYPE_NVME_IO: 2525 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2526 /* Bit 1 (0x2) set for read operation */ 2527 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2528 return true; 2529 } else { 2530 return false; 2531 } 2532 case SPDK_BDEV_IO_TYPE_READ: 2533 return true; 2534 case SPDK_BDEV_IO_TYPE_ZCOPY: 2535 /* Populate to read from disk */ 2536 if (bdev_io->u.bdev.zcopy.populate) { 2537 return true; 2538 } else { 2539 return false; 2540 } 2541 default: 2542 return false; 2543 } 2544 } 2545 2546 static uint64_t 2547 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2548 { 2549 struct spdk_bdev *bdev = bdev_io->bdev; 2550 2551 switch (bdev_io->type) { 2552 case SPDK_BDEV_IO_TYPE_NVME_IO: 2553 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2554 return bdev_io->u.nvme_passthru.nbytes; 2555 case SPDK_BDEV_IO_TYPE_READ: 2556 case SPDK_BDEV_IO_TYPE_WRITE: 2557 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2558 case SPDK_BDEV_IO_TYPE_ZCOPY: 2559 /* Track the data in the start phase only */ 2560 if (bdev_io->u.bdev.zcopy.start) { 2561 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2562 } else { 2563 return 0; 2564 } 2565 default: 2566 return 0; 2567 } 2568 } 2569 2570 static bool 2571 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2572 { 2573 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2574 return true; 2575 } else { 2576 return false; 2577 } 2578 } 2579 2580 static bool 2581 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2582 { 2583 if (bdev_is_read_io(io) == false) { 2584 return false; 2585 } 2586 2587 return bdev_qos_rw_queue_io(limit, io); 2588 } 2589 2590 static bool 2591 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2592 { 2593 if (bdev_is_read_io(io) == true) { 2594 return false; 2595 } 2596 2597 return bdev_qos_rw_queue_io(limit, io); 2598 } 2599 2600 static void 2601 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2602 { 2603 limit->remaining_this_timeslice--; 2604 } 2605 2606 static void 2607 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2608 { 2609 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2610 } 2611 2612 static void 2613 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2614 { 2615 if (bdev_is_read_io(io) == false) { 2616 return; 2617 } 2618 2619 return bdev_qos_rw_bps_update_quota(limit, io); 2620 } 2621 2622 static void 2623 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2624 { 2625 if (bdev_is_read_io(io) == true) { 2626 return; 2627 } 2628 2629 return bdev_qos_rw_bps_update_quota(limit, io); 2630 } 2631 2632 static void 2633 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2634 { 2635 int i; 2636 2637 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2638 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2639 qos->rate_limits[i].queue_io = NULL; 2640 qos->rate_limits[i].update_quota = NULL; 2641 continue; 2642 } 2643 2644 switch (i) { 2645 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2646 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2647 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2648 break; 2649 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2650 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2651 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2652 break; 2653 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2654 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2655 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2656 break; 2657 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2658 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2659 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2660 break; 2661 default: 2662 break; 2663 } 2664 } 2665 } 2666 2667 static void 2668 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2669 struct spdk_bdev_io *bdev_io, 2670 enum spdk_bdev_io_status status) 2671 { 2672 bdev_io->internal.in_submit_request = true; 2673 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2674 spdk_bdev_io_complete(bdev_io, status); 2675 bdev_io->internal.in_submit_request = false; 2676 } 2677 2678 static inline void 2679 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2680 { 2681 struct spdk_bdev *bdev = bdev_io->bdev; 2682 struct spdk_io_channel *ch = bdev_ch->channel; 2683 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2684 2685 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2686 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2687 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2688 2689 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2690 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2691 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2692 SPDK_BDEV_IO_STATUS_SUCCESS); 2693 return; 2694 } 2695 } 2696 2697 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2698 bdev_io->bdev->split_on_write_unit && 2699 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2700 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2701 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2702 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2703 return; 2704 } 2705 2706 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2707 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2708 bdev_io->internal.in_submit_request = true; 2709 bdev_submit_request(bdev, ch, bdev_io); 2710 bdev_io->internal.in_submit_request = false; 2711 } else { 2712 bdev_queue_nomem_io_tail(shared_resource, bdev_io, 
BDEV_IO_RETRY_STATE_SUBMIT); 2713 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2714 /* Special case when we have nomem IOs and no outstanding IOs which completions 2715 * could trigger retry of queued IOs */ 2716 bdev_shared_ch_retry_io(shared_resource); 2717 } 2718 } 2719 } 2720 2721 static bool 2722 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2723 { 2724 int i; 2725 2726 if (bdev_qos_io_to_limit(bdev_io) == true) { 2727 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2728 if (!qos->rate_limits[i].queue_io) { 2729 continue; 2730 } 2731 2732 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2733 bdev_io) == true) { 2734 return true; 2735 } 2736 } 2737 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2738 if (!qos->rate_limits[i].update_quota) { 2739 continue; 2740 } 2741 2742 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2743 } 2744 } 2745 2746 return false; 2747 } 2748 2749 static inline void 2750 _bdev_io_do_submit(void *ctx) 2751 { 2752 struct spdk_bdev_io *bdev_io = ctx; 2753 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2754 2755 bdev_io_do_submit(ch, bdev_io); 2756 } 2757 2758 static int 2759 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2760 { 2761 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2762 int submitted_ios = 0; 2763 2764 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2765 if (!bdev_qos_queue_io(qos, bdev_io)) { 2766 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2767 2768 if (bdev_io->internal.io_submit_ch) { 2769 /* Send back the IO to the original thread for the actual processing. */ 2770 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2771 bdev_io->internal.io_submit_ch = NULL; 2772 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2773 _bdev_io_do_submit, bdev_io); 2774 } else { 2775 bdev_io_do_submit(ch, bdev_io); 2776 } 2777 2778 submitted_ios++; 2779 } 2780 } 2781 2782 return submitted_ios; 2783 } 2784 2785 static void 2786 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2787 { 2788 int rc; 2789 2790 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2791 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2792 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2793 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2794 &bdev_io->internal.waitq_entry); 2795 if (rc != 0) { 2796 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2797 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2798 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2799 } 2800 } 2801 2802 static bool 2803 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2804 { 2805 uint32_t io_boundary; 2806 struct spdk_bdev *bdev = bdev_io->bdev; 2807 uint32_t max_segment_size = bdev->max_segment_size; 2808 uint32_t max_size = bdev->max_rw_size; 2809 int max_segs = bdev->max_num_segments; 2810 2811 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2812 io_boundary = bdev->write_unit_size; 2813 } else if (bdev->split_on_optimal_io_boundary) { 2814 io_boundary = bdev->optimal_io_boundary; 2815 } else { 2816 io_boundary = 0; 2817 } 2818 2819 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 2820 return false; 2821 } 2822 2823 if (io_boundary) { 2824 uint64_t start_stripe, end_stripe; 2825 2826 start_stripe = bdev_io->u.bdev.offset_blocks; 2827 end_stripe = start_stripe + 
bdev_io->u.bdev.num_blocks - 1; 2828 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2829 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2830 start_stripe >>= spdk_u32log2(io_boundary); 2831 end_stripe >>= spdk_u32log2(io_boundary); 2832 } else { 2833 start_stripe /= io_boundary; 2834 end_stripe /= io_boundary; 2835 } 2836 2837 if (start_stripe != end_stripe) { 2838 return true; 2839 } 2840 } 2841 2842 if (max_segs) { 2843 if (bdev_io->u.bdev.iovcnt > max_segs) { 2844 return true; 2845 } 2846 } 2847 2848 if (max_segment_size) { 2849 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2850 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 2851 return true; 2852 } 2853 } 2854 } 2855 2856 if (max_size) { 2857 if (bdev_io->u.bdev.num_blocks > max_size) { 2858 return true; 2859 } 2860 } 2861 2862 return false; 2863 } 2864 2865 static bool 2866 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2867 { 2868 uint32_t num_unmap_segments; 2869 2870 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2871 return false; 2872 } 2873 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2874 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2875 return true; 2876 } 2877 2878 return false; 2879 } 2880 2881 static bool 2882 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2883 { 2884 if (!bdev_io->bdev->max_write_zeroes) { 2885 return false; 2886 } 2887 2888 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2889 return true; 2890 } 2891 2892 return false; 2893 } 2894 2895 static bool 2896 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2897 { 2898 if (bdev_io->bdev->max_copy != 0 && 2899 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2900 return true; 2901 } 2902 2903 return false; 2904 } 2905 2906 static bool 2907 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2908 { 2909 switch (bdev_io->type) { 2910 case SPDK_BDEV_IO_TYPE_READ: 2911 case SPDK_BDEV_IO_TYPE_WRITE: 2912 return bdev_rw_should_split(bdev_io); 2913 case SPDK_BDEV_IO_TYPE_UNMAP: 2914 return bdev_unmap_should_split(bdev_io); 2915 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2916 return bdev_write_zeroes_should_split(bdev_io); 2917 case SPDK_BDEV_IO_TYPE_COPY: 2918 return bdev_copy_should_split(bdev_io); 2919 default: 2920 return false; 2921 } 2922 } 2923 2924 static uint32_t 2925 _to_next_boundary(uint64_t offset, uint32_t boundary) 2926 { 2927 return (boundary - (offset % boundary)); 2928 } 2929 2930 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2931 2932 static void _bdev_rw_split(void *_bdev_io); 2933 2934 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2935 2936 static void 2937 _bdev_unmap_split(void *_bdev_io) 2938 { 2939 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2940 } 2941 2942 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2943 2944 static void 2945 _bdev_write_zeroes_split(void *_bdev_io) 2946 { 2947 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2948 } 2949 2950 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2951 2952 static void 2953 _bdev_copy_split(void *_bdev_io) 2954 { 2955 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2956 } 2957 2958 static int 2959 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2960 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2961 { 2962 int rc; 
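	/* Added commentary (not in the upstream source): this helper submits one child I/O of
	 * num_blocks starting at *offset. On success it advances *offset and *remaining; on
	 * -ENOMEM with no children outstanding it queues io_wait_fn to retry once a bdev_io
	 * becomes available; on any other error it fails the parent I/O once no children
	 * remain outstanding. */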
2963 uint64_t current_offset, current_remaining, current_src_offset; 2964 spdk_bdev_io_wait_cb io_wait_fn; 2965 2966 current_offset = *offset; 2967 current_remaining = *remaining; 2968 2969 bdev_io->u.bdev.split_outstanding++; 2970 2971 io_wait_fn = _bdev_rw_split; 2972 switch (bdev_io->type) { 2973 case SPDK_BDEV_IO_TYPE_READ: 2974 assert(bdev_io->u.bdev.accel_sequence == NULL); 2975 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2976 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2977 iov, iovcnt, md_buf, current_offset, 2978 num_blocks, bdev_io->internal.memory_domain, 2979 bdev_io->internal.memory_domain_ctx, NULL, 2980 bdev_io_split_done, bdev_io); 2981 break; 2982 case SPDK_BDEV_IO_TYPE_WRITE: 2983 assert(bdev_io->u.bdev.accel_sequence == NULL); 2984 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2985 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2986 iov, iovcnt, md_buf, current_offset, 2987 num_blocks, bdev_io->internal.memory_domain, 2988 bdev_io->internal.memory_domain_ctx, NULL, 2989 bdev_io_split_done, bdev_io); 2990 break; 2991 case SPDK_BDEV_IO_TYPE_UNMAP: 2992 io_wait_fn = _bdev_unmap_split; 2993 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2994 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2995 current_offset, num_blocks, 2996 bdev_io_split_done, bdev_io); 2997 break; 2998 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2999 io_wait_fn = _bdev_write_zeroes_split; 3000 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3001 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3002 current_offset, num_blocks, 3003 bdev_io_split_done, bdev_io); 3004 break; 3005 case SPDK_BDEV_IO_TYPE_COPY: 3006 io_wait_fn = _bdev_copy_split; 3007 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3008 (current_offset - bdev_io->u.bdev.offset_blocks); 3009 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3010 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3011 current_offset, current_src_offset, num_blocks, 3012 bdev_io_split_done, bdev_io); 3013 break; 3014 default: 3015 assert(false); 3016 rc = -EINVAL; 3017 break; 3018 } 3019 3020 if (rc == 0) { 3021 current_offset += num_blocks; 3022 current_remaining -= num_blocks; 3023 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 3024 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 3025 *offset = current_offset; 3026 *remaining = current_remaining; 3027 } else { 3028 bdev_io->u.bdev.split_outstanding--; 3029 if (rc == -ENOMEM) { 3030 if (bdev_io->u.bdev.split_outstanding == 0) { 3031 /* No I/O is outstanding. Hence we should wait here. 
*/ 3032 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3033 } 3034 } else { 3035 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3036 if (bdev_io->u.bdev.split_outstanding == 0) { 3037 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 3038 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 3039 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3040 } 3041 } 3042 } 3043 3044 return rc; 3045 } 3046 3047 static void 3048 _bdev_rw_split(void *_bdev_io) 3049 { 3050 struct iovec *parent_iov, *iov; 3051 struct spdk_bdev_io *bdev_io = _bdev_io; 3052 struct spdk_bdev *bdev = bdev_io->bdev; 3053 uint64_t parent_offset, current_offset, remaining; 3054 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3055 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3056 uint32_t iovcnt, iov_len, child_iovsize; 3057 uint32_t blocklen = bdev->blocklen; 3058 uint32_t io_boundary; 3059 uint32_t max_segment_size = bdev->max_segment_size; 3060 uint32_t max_child_iovcnt = bdev->max_num_segments; 3061 uint32_t max_size = bdev->max_rw_size; 3062 void *md_buf = NULL; 3063 int rc; 3064 3065 max_size = max_size ? max_size : UINT32_MAX; 3066 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3067 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3068 SPDK_BDEV_IO_NUM_CHILD_IOV; 3069 3070 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3071 io_boundary = bdev->write_unit_size; 3072 } else if (bdev->split_on_optimal_io_boundary) { 3073 io_boundary = bdev->optimal_io_boundary; 3074 } else { 3075 io_boundary = UINT32_MAX; 3076 } 3077 3078 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3079 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 3080 parent_offset = bdev_io->u.bdev.offset_blocks; 3081 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3082 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3083 3084 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3085 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3086 if (parent_iov_offset < parent_iov->iov_len) { 3087 break; 3088 } 3089 parent_iov_offset -= parent_iov->iov_len; 3090 } 3091 3092 child_iovcnt = 0; 3093 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3094 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3095 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3096 to_next_boundary = spdk_min(remaining, to_next_boundary); 3097 to_next_boundary = spdk_min(max_size, to_next_boundary); 3098 to_next_boundary_bytes = to_next_boundary * blocklen; 3099 3100 iov = &bdev_io->child_iov[child_iovcnt]; 3101 iovcnt = 0; 3102 3103 if (bdev_io->u.bdev.md_buf) { 3104 md_buf = (char *)bdev_io->u.bdev.md_buf + 3105 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3106 } 3107 3108 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3109 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3110 iovcnt < child_iovsize) { 3111 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3112 iov_len = parent_iov->iov_len - parent_iov_offset; 3113 3114 iov_len = spdk_min(iov_len, max_segment_size); 3115 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3116 to_next_boundary_bytes -= iov_len; 3117 3118 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3119 bdev_io->child_iov[child_iovcnt].iov_len 
= iov_len; 3120 3121 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3122 parent_iov_offset += iov_len; 3123 } else { 3124 parent_iovpos++; 3125 parent_iov_offset = 0; 3126 } 3127 child_iovcnt++; 3128 iovcnt++; 3129 } 3130 3131 if (to_next_boundary_bytes > 0) { 3132 /* We had to stop this child I/O early because we ran out of 3133 * child_iov space or were limited by max_num_segments. 3134 * Make sure the child iov total length is a multiple of the block size, 3135 * then adjust to_next_boundary before starting the 3136 * child I/O. 3137 */ 3138 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 3139 iovcnt == child_iovsize); 3140 to_last_block_bytes = to_next_boundary_bytes % blocklen; 3141 if (to_last_block_bytes != 0) { 3142 uint32_t child_iovpos = child_iovcnt - 1; 3143 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV, 3144 * so the outer loop will naturally end 3145 */ 3146 3147 to_last_block_bytes = blocklen - to_last_block_bytes; 3148 to_next_boundary_bytes += to_last_block_bytes; 3149 while (to_last_block_bytes > 0 && iovcnt > 0) { 3150 iov_len = spdk_min(to_last_block_bytes, 3151 bdev_io->child_iov[child_iovpos].iov_len); 3152 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 3153 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 3154 child_iovpos--; 3155 if (--iovcnt == 0) { 3156 /* If the child I/O ends up smaller than a block, just return; the 3157 * outstanding children will continue the split. If it is the first 3158 * child I/O of this split round (none outstanding), fail the parent I/O. 3159 */ 3160 if (bdev_io->u.bdev.split_outstanding == 0) { 3161 SPDK_ERRLOG("The first child io was less than a block size\n"); 3162 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3163 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 3164 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 3165 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3166 } 3167 3168 return; 3169 } 3170 } 3171 3172 to_last_block_bytes -= iov_len; 3173 3174 if (parent_iov_offset == 0) { 3175 parent_iovpos--; 3176 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 3177 } 3178 parent_iov_offset -= iov_len; 3179 } 3180 3181 assert(to_last_block_bytes == 0); 3182 } 3183 to_next_boundary -= to_next_boundary_bytes / blocklen; 3184 } 3185 3186 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 3187 &current_offset, &remaining); 3188 if (spdk_unlikely(rc)) { 3189 return; 3190 } 3191 } 3192 } 3193 3194 static void 3195 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 3196 { 3197 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 3198 uint32_t num_children_reqs = 0; 3199 int rc; 3200 3201 offset = bdev_io->u.bdev.split_current_offset_blocks; 3202 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3203 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 3204 3205 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3206 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 3207 3208 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 3209 &offset, &remaining); 3210 if (spdk_likely(rc == 0)) { 3211 num_children_reqs++; 3212 } else { 3213 return; 3214 } 3215 } 3216 } 3217 3218 static void 3219 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3220 { 3221 uint64_t offset, write_zeroes_blocks, remaining; 3222 uint32_t num_children_reqs = 0; 3223 int rc; 3224 3225 offset = bdev_io->u.bdev.split_current_offset_blocks; 3226 remaining =
bdev_io->u.bdev.split_remaining_num_blocks; 3227 3228 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3229 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3230 3231 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3232 &offset, &remaining); 3233 if (spdk_likely(rc == 0)) { 3234 num_children_reqs++; 3235 } else { 3236 return; 3237 } 3238 } 3239 } 3240 3241 static void 3242 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3243 { 3244 uint64_t offset, copy_blocks, remaining; 3245 uint32_t num_children_reqs = 0; 3246 int rc; 3247 3248 offset = bdev_io->u.bdev.split_current_offset_blocks; 3249 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3250 3251 assert(bdev_io->bdev->max_copy != 0); 3252 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3253 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3254 3255 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3256 &offset, &remaining); 3257 if (spdk_likely(rc == 0)) { 3258 num_children_reqs++; 3259 } else { 3260 return; 3261 } 3262 } 3263 } 3264 3265 static void 3266 parent_bdev_io_complete(void *ctx, int rc) 3267 { 3268 struct spdk_bdev_io *parent_io = ctx; 3269 3270 if (rc) { 3271 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3272 } 3273 3274 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3275 parent_io->internal.caller_ctx); 3276 } 3277 3278 static void 3279 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3280 { 3281 struct spdk_bdev_io *bdev_io = ctx; 3282 3283 /* u.bdev.accel_sequence should have already been cleared at this point */ 3284 assert(bdev_io->u.bdev.accel_sequence == NULL); 3285 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3286 bdev_io->internal.accel_sequence = NULL; 3287 3288 if (spdk_unlikely(status != 0)) { 3289 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3290 } 3291 3292 parent_bdev_io_complete(bdev_io, status); 3293 } 3294 3295 static void 3296 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3297 { 3298 struct spdk_bdev_io *parent_io = cb_arg; 3299 3300 spdk_bdev_free_io(bdev_io); 3301 3302 if (!success) { 3303 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3304 /* If any child I/O failed, stop further splitting process. */ 3305 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3306 parent_io->u.bdev.split_remaining_num_blocks = 0; 3307 } 3308 parent_io->u.bdev.split_outstanding--; 3309 if (parent_io->u.bdev.split_outstanding != 0) { 3310 return; 3311 } 3312 3313 /* 3314 * Parent I/O finishes when all blocks are consumed. 
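	 * (Added note: on this final completion the parent either executes its deferred accel
	 * sequence, pushes bounce-buffer data back into the caller's buffers, or completes
	 * directly via parent_bdev_io_complete().)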
3315 */ 3316 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3317 assert(parent_io->internal.cb != bdev_io_split_done); 3318 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3319 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3320 3321 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3322 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3323 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3324 return; 3325 } else if (parent_io->internal.orig_iovcnt != 0 && 3326 !bdev_io_use_accel_sequence(bdev_io)) { 3327 /* bdev IO will be completed in the callback */ 3328 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3329 return; 3330 } 3331 } 3332 3333 parent_bdev_io_complete(parent_io, 0); 3334 return; 3335 } 3336 3337 /* 3338 * Continue with the splitting process. This function will complete the parent I/O if the 3339 * splitting is done. 3340 */ 3341 switch (parent_io->type) { 3342 case SPDK_BDEV_IO_TYPE_READ: 3343 case SPDK_BDEV_IO_TYPE_WRITE: 3344 _bdev_rw_split(parent_io); 3345 break; 3346 case SPDK_BDEV_IO_TYPE_UNMAP: 3347 bdev_unmap_split(parent_io); 3348 break; 3349 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3350 bdev_write_zeroes_split(parent_io); 3351 break; 3352 case SPDK_BDEV_IO_TYPE_COPY: 3353 bdev_copy_split(parent_io); 3354 break; 3355 default: 3356 assert(false); 3357 break; 3358 } 3359 } 3360 3361 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3362 bool success); 3363 3364 static void 3365 bdev_io_split(struct spdk_bdev_io *bdev_io) 3366 { 3367 assert(bdev_io_should_split(bdev_io)); 3368 3369 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3370 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3371 bdev_io->u.bdev.split_outstanding = 0; 3372 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3373 3374 switch (bdev_io->type) { 3375 case SPDK_BDEV_IO_TYPE_READ: 3376 case SPDK_BDEV_IO_TYPE_WRITE: 3377 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3378 _bdev_rw_split(bdev_io); 3379 } else { 3380 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3381 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3382 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3383 } 3384 break; 3385 case SPDK_BDEV_IO_TYPE_UNMAP: 3386 bdev_unmap_split(bdev_io); 3387 break; 3388 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3389 bdev_write_zeroes_split(bdev_io); 3390 break; 3391 case SPDK_BDEV_IO_TYPE_COPY: 3392 bdev_copy_split(bdev_io); 3393 break; 3394 default: 3395 assert(false); 3396 break; 3397 } 3398 } 3399 3400 static void 3401 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3402 { 3403 if (!success) { 3404 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3405 return; 3406 } 3407 3408 _bdev_rw_split(bdev_io); 3409 } 3410 3411 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3412 * be inlined, at least on some compilers. 
3413 */ 3414 static inline void 3415 _bdev_io_submit(void *ctx) 3416 { 3417 struct spdk_bdev_io *bdev_io = ctx; 3418 struct spdk_bdev *bdev = bdev_io->bdev; 3419 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3420 3421 if (spdk_likely(bdev_ch->flags == 0)) { 3422 bdev_io_do_submit(bdev_ch, bdev_io); 3423 return; 3424 } 3425 3426 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3427 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3428 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3429 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3430 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 3431 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3432 } else { 3433 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 3434 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3435 } 3436 } else { 3437 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3438 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3439 } 3440 } 3441 3442 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3443 3444 bool 3445 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3446 { 3447 if (range1->length == 0 || range2->length == 0) { 3448 return false; 3449 } 3450 3451 if (range1->offset + range1->length <= range2->offset) { 3452 return false; 3453 } 3454 3455 if (range2->offset + range2->length <= range1->offset) { 3456 return false; 3457 } 3458 3459 return true; 3460 } 3461 3462 static bool 3463 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3464 { 3465 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3466 struct lba_range r; 3467 3468 switch (bdev_io->type) { 3469 case SPDK_BDEV_IO_TYPE_NVME_IO: 3470 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3471 /* Don't try to decode the NVMe command - just assume worst-case and that 3472 * it overlaps a locked range. 3473 */ 3474 return true; 3475 case SPDK_BDEV_IO_TYPE_WRITE: 3476 case SPDK_BDEV_IO_TYPE_UNMAP: 3477 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3478 case SPDK_BDEV_IO_TYPE_ZCOPY: 3479 case SPDK_BDEV_IO_TYPE_COPY: 3480 r.offset = bdev_io->u.bdev.offset_blocks; 3481 r.length = bdev_io->u.bdev.num_blocks; 3482 if (!bdev_lba_range_overlapped(range, &r)) { 3483 /* This I/O doesn't overlap the specified LBA range. */ 3484 return false; 3485 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3486 /* This I/O overlaps, but the I/O is on the same channel that locked this 3487 * range, and the caller_ctx is the same as the locked_ctx. This means 3488 * that this I/O is associated with the lock, and is allowed to execute. 
3489 */ 3490 return false; 3491 } else { 3492 return true; 3493 } 3494 default: 3495 return false; 3496 } 3497 } 3498 3499 void 3500 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3501 { 3502 struct spdk_bdev *bdev = bdev_io->bdev; 3503 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3504 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3505 3506 assert(thread != NULL); 3507 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3508 3509 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3510 struct lba_range *range; 3511 3512 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3513 if (bdev_io_range_is_locked(bdev_io, range)) { 3514 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3515 return; 3516 } 3517 } 3518 } 3519 3520 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3521 3522 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3523 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3524 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3525 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3526 spdk_bdev_get_name(bdev)); 3527 3528 if (bdev_io->internal.split) { 3529 bdev_io_split(bdev_io); 3530 return; 3531 } 3532 3533 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3534 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3535 _bdev_io_submit(bdev_io); 3536 } else { 3537 bdev_io->internal.io_submit_ch = ch; 3538 bdev_io->internal.ch = bdev->internal.qos->ch; 3539 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3540 } 3541 } else { 3542 _bdev_io_submit(bdev_io); 3543 } 3544 } 3545 3546 static inline void 3547 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3548 { 3549 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3550 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3551 * For write operation we need to pull buffers from memory domain before submitting IO. 3552 * Once read operation completes, we need to use memory_domain push functionality to 3553 * update data in original memory domain IO buffer 3554 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3555 bdev_io->u.bdev.memory_domain = NULL; 3556 bdev_io->u.bdev.memory_domain_ctx = NULL; 3557 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3558 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3559 } 3560 3561 static inline void 3562 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3563 { 3564 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3565 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3566 3567 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3568 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3569 bdev_io_complete_unsubmitted(bdev_io); 3570 return; 3571 } 3572 3573 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3574 * support them, but we need to execute an accel sequence and the data buffer is from accel 3575 * memory domain (to avoid doing a push/pull from that domain). 
3576 */ 3577 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3578 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3579 _bdev_io_ext_use_bounce_buffer(bdev_io); 3580 return; 3581 } 3582 3583 if (needs_exec) { 3584 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3585 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3586 return; 3587 } 3588 /* For reads we'll execute the sequence after the data is read, so, for now, only 3589 * clear out accel_sequence pointer and submit the IO */ 3590 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3591 bdev_io->u.bdev.accel_sequence = NULL; 3592 } 3593 3594 bdev_io_submit(bdev_io); 3595 } 3596 3597 static void 3598 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3599 { 3600 struct spdk_bdev *bdev = bdev_io->bdev; 3601 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3602 struct spdk_io_channel *ch = bdev_ch->channel; 3603 3604 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3605 3606 bdev_io->internal.in_submit_request = true; 3607 bdev_submit_request(bdev, ch, bdev_io); 3608 bdev_io->internal.in_submit_request = false; 3609 } 3610 3611 void 3612 bdev_io_init(struct spdk_bdev_io *bdev_io, 3613 struct spdk_bdev *bdev, void *cb_arg, 3614 spdk_bdev_io_completion_cb cb) 3615 { 3616 bdev_io->bdev = bdev; 3617 bdev_io->internal.caller_ctx = cb_arg; 3618 bdev_io->internal.cb = cb; 3619 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3620 bdev_io->internal.in_submit_request = false; 3621 bdev_io->internal.buf = NULL; 3622 bdev_io->internal.io_submit_ch = NULL; 3623 bdev_io->internal.orig_iovs = NULL; 3624 bdev_io->internal.orig_iovcnt = 0; 3625 bdev_io->internal.orig_md_iov.iov_base = NULL; 3626 bdev_io->internal.error.nvme.cdw0 = 0; 3627 bdev_io->num_retries = 0; 3628 bdev_io->internal.get_buf_cb = NULL; 3629 bdev_io->internal.get_aux_buf_cb = NULL; 3630 bdev_io->internal.memory_domain = NULL; 3631 bdev_io->internal.memory_domain_ctx = NULL; 3632 bdev_io->internal.data_transfer_cpl = NULL; 3633 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3634 bdev_io->internal.accel_sequence = NULL; 3635 bdev_io->internal.has_accel_sequence = false; 3636 } 3637 3638 static bool 3639 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3640 { 3641 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3642 } 3643 3644 bool 3645 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3646 { 3647 bool supported; 3648 3649 supported = bdev_io_type_supported(bdev, io_type); 3650 3651 if (!supported) { 3652 switch (io_type) { 3653 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3654 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ 3655 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3656 break; 3657 default: 3658 break; 3659 } 3660 } 3661 3662 return supported; 3663 } 3664 3665 uint64_t 3666 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3667 { 3668 return bdev_io->internal.submit_tsc; 3669 } 3670 3671 int 3672 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3673 { 3674 if (bdev->fn_table->dump_info_json) { 3675 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3676 } 3677 3678 return 0; 3679 } 3680 3681 static void 3682 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3683 { 3684 uint32_t max_per_timeslice = 0; 3685 int i; 3686 3687 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3688 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3689 qos->rate_limits[i].max_per_timeslice = 0; 3690 continue; 3691 } 3692 3693 max_per_timeslice = qos->rate_limits[i].limit * 3694 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3695 3696 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3697 qos->rate_limits[i].min_per_timeslice); 3698 3699 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3700 } 3701 3702 bdev_qos_set_ops(qos); 3703 } 3704 3705 static int 3706 bdev_channel_poll_qos(void *arg) 3707 { 3708 struct spdk_bdev_qos *qos = arg; 3709 uint64_t now = spdk_get_ticks(); 3710 int i; 3711 3712 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3713 /* We received our callback earlier than expected - return 3714 * immediately and wait to do accounting until at least one 3715 * timeslice has actually expired. This should never happen 3716 * with a well-behaved timer implementation. 3717 */ 3718 return SPDK_POLLER_IDLE; 3719 } 3720 3721 /* Reset for next round of rate limiting */ 3722 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3723 /* We may have allowed the IOs or bytes to slightly overrun in the last 3724 * timeslice. remaining_this_timeslice is signed, so if it's negative 3725 * here, we'll account for the overrun so that the next timeslice will 3726 * be appropriately reduced. 
3727 */ 3728 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3729 qos->rate_limits[i].remaining_this_timeslice = 0; 3730 } 3731 } 3732 3733 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3734 qos->last_timeslice += qos->timeslice_size; 3735 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3736 qos->rate_limits[i].remaining_this_timeslice += 3737 qos->rate_limits[i].max_per_timeslice; 3738 } 3739 } 3740 3741 return bdev_qos_io_submit(qos->ch, qos); 3742 } 3743 3744 static void 3745 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3746 { 3747 struct spdk_bdev_shared_resource *shared_resource; 3748 struct lba_range *range; 3749 3750 bdev_free_io_stat(ch->stat); 3751 #ifdef SPDK_CONFIG_VTUNE 3752 bdev_free_io_stat(ch->prev_stat); 3753 #endif 3754 3755 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3756 range = TAILQ_FIRST(&ch->locked_ranges); 3757 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3758 free(range); 3759 } 3760 3761 spdk_put_io_channel(ch->channel); 3762 spdk_put_io_channel(ch->accel_channel); 3763 3764 shared_resource = ch->shared_resource; 3765 3766 assert(TAILQ_EMPTY(&ch->io_locked)); 3767 assert(TAILQ_EMPTY(&ch->io_submitted)); 3768 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3769 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3770 assert(ch->io_outstanding == 0); 3771 assert(shared_resource->ref > 0); 3772 shared_resource->ref--; 3773 if (shared_resource->ref == 0) { 3774 assert(shared_resource->io_outstanding == 0); 3775 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3776 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3777 spdk_poller_unregister(&shared_resource->nomem_poller); 3778 free(shared_resource); 3779 } 3780 } 3781 3782 static void 3783 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3784 { 3785 struct spdk_bdev_qos *qos = bdev->internal.qos; 3786 int i; 3787 3788 assert(spdk_spin_held(&bdev->internal.spinlock)); 3789 3790 /* Rate limiting on this bdev enabled */ 3791 if (qos) { 3792 if (qos->ch == NULL) { 3793 struct spdk_io_channel *io_ch; 3794 3795 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3796 bdev->name, spdk_get_thread()); 3797 3798 /* No qos channel has been selected, so set one up */ 3799 3800 /* Take another reference to ch */ 3801 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3802 assert(io_ch != NULL); 3803 qos->ch = ch; 3804 3805 qos->thread = spdk_io_channel_get_thread(io_ch); 3806 3807 TAILQ_INIT(&qos->queued); 3808 3809 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3810 if (bdev_qos_is_iops_rate_limit(i) == true) { 3811 qos->rate_limits[i].min_per_timeslice = 3812 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3813 } else { 3814 qos->rate_limits[i].min_per_timeslice = 3815 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3816 } 3817 3818 if (qos->rate_limits[i].limit == 0) { 3819 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3820 } 3821 } 3822 bdev_qos_update_max_quota_per_timeslice(qos); 3823 qos->timeslice_size = 3824 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3825 qos->last_timeslice = spdk_get_ticks(); 3826 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3827 qos, 3828 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3829 } 3830 3831 ch->flags |= BDEV_CH_QOS_ENABLED; 3832 } 3833 } 3834 3835 struct poll_timeout_ctx { 3836 struct spdk_bdev_desc *desc; 3837 uint64_t timeout_in_sec; 3838 spdk_bdev_io_timeout_cb cb_fn; 3839 void *cb_arg; 3840 }; 3841 
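/*
 * Editor's note (illustrative arithmetic, not part of the original source): with the default
 * 1 ms QoS timeslice, bdev_qos_update_max_quota_per_timeslice() above converts each per-second
 * limit into a per-timeslice budget. An IOPS limit of 10,000 yields
 * 10000 * 1000 / 1000000 = 10 I/Os per timeslice, and a bandwidth limit of 100 MiB/s yields
 * 104857600 * 1000 / 1000000 = 104857 bytes (about 102 KiB) per timeslice, each clamped to the
 * corresponding minimum (SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE / SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE).
 * bdev_channel_poll_qos() then refills remaining_this_timeslice by max_per_timeslice for every
 * elapsed timeslice, carrying any negative (overrun) balance forward.
 */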
3842 static void 3843 bdev_desc_free(struct spdk_bdev_desc *desc) 3844 { 3845 spdk_spin_destroy(&desc->spinlock); 3846 free(desc->media_events_buffer); 3847 free(desc); 3848 } 3849 3850 static void 3851 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3852 { 3853 struct poll_timeout_ctx *ctx = _ctx; 3854 struct spdk_bdev_desc *desc = ctx->desc; 3855 3856 free(ctx); 3857 3858 spdk_spin_lock(&desc->spinlock); 3859 desc->refs--; 3860 if (desc->closed == true && desc->refs == 0) { 3861 spdk_spin_unlock(&desc->spinlock); 3862 bdev_desc_free(desc); 3863 return; 3864 } 3865 spdk_spin_unlock(&desc->spinlock); 3866 } 3867 3868 static void 3869 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3870 struct spdk_io_channel *io_ch, void *_ctx) 3871 { 3872 struct poll_timeout_ctx *ctx = _ctx; 3873 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3874 struct spdk_bdev_desc *desc = ctx->desc; 3875 struct spdk_bdev_io *bdev_io; 3876 uint64_t now; 3877 3878 spdk_spin_lock(&desc->spinlock); 3879 if (desc->closed == true) { 3880 spdk_spin_unlock(&desc->spinlock); 3881 spdk_bdev_for_each_channel_continue(i, -1); 3882 return; 3883 } 3884 spdk_spin_unlock(&desc->spinlock); 3885 3886 now = spdk_get_ticks(); 3887 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3888 /* Exclude any I/O that are generated via splitting. */ 3889 if (bdev_io->internal.cb == bdev_io_split_done) { 3890 continue; 3891 } 3892 3893 /* Once we find an I/O that has not timed out, we can immediately 3894 * exit the loop. 3895 */ 3896 if (now < (bdev_io->internal.submit_tsc + 3897 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3898 goto end; 3899 } 3900 3901 if (bdev_io->internal.desc == desc) { 3902 ctx->cb_fn(ctx->cb_arg, bdev_io); 3903 } 3904 } 3905 3906 end: 3907 spdk_bdev_for_each_channel_continue(i, 0); 3908 } 3909 3910 static int 3911 bdev_poll_timeout_io(void *arg) 3912 { 3913 struct spdk_bdev_desc *desc = arg; 3914 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3915 struct poll_timeout_ctx *ctx; 3916 3917 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3918 if (!ctx) { 3919 SPDK_ERRLOG("failed to allocate memory\n"); 3920 return SPDK_POLLER_BUSY; 3921 } 3922 ctx->desc = desc; 3923 ctx->cb_arg = desc->cb_arg; 3924 ctx->cb_fn = desc->cb_fn; 3925 ctx->timeout_in_sec = desc->timeout_in_sec; 3926 3927 /* Take a ref on the descriptor in case it gets closed while we are checking 3928 * all of the channels. 
3929 */ 3930 spdk_spin_lock(&desc->spinlock); 3931 desc->refs++; 3932 spdk_spin_unlock(&desc->spinlock); 3933 3934 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3935 bdev_channel_poll_timeout_io_done); 3936 3937 return SPDK_POLLER_BUSY; 3938 } 3939 3940 int 3941 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3942 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3943 { 3944 assert(desc->thread == spdk_get_thread()); 3945 3946 spdk_poller_unregister(&desc->io_timeout_poller); 3947 3948 if (timeout_in_sec) { 3949 assert(cb_fn != NULL); 3950 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3951 desc, 3952 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3953 1000); 3954 if (desc->io_timeout_poller == NULL) { 3955 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3956 return -1; 3957 } 3958 } 3959 3960 desc->cb_fn = cb_fn; 3961 desc->cb_arg = cb_arg; 3962 desc->timeout_in_sec = timeout_in_sec; 3963 3964 return 0; 3965 } 3966 3967 static int 3968 bdev_channel_create(void *io_device, void *ctx_buf) 3969 { 3970 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3971 struct spdk_bdev_channel *ch = ctx_buf; 3972 struct spdk_io_channel *mgmt_io_ch; 3973 struct spdk_bdev_mgmt_channel *mgmt_ch; 3974 struct spdk_bdev_shared_resource *shared_resource; 3975 struct lba_range *range; 3976 3977 ch->bdev = bdev; 3978 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3979 if (!ch->channel) { 3980 return -1; 3981 } 3982 3983 ch->accel_channel = spdk_accel_get_io_channel(); 3984 if (!ch->accel_channel) { 3985 spdk_put_io_channel(ch->channel); 3986 return -1; 3987 } 3988 3989 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3990 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3991 3992 assert(ch->histogram == NULL); 3993 if (bdev->internal.histogram_enabled) { 3994 ch->histogram = spdk_histogram_data_alloc(); 3995 if (ch->histogram == NULL) { 3996 SPDK_ERRLOG("Could not allocate histogram\n"); 3997 } 3998 } 3999 4000 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4001 if (!mgmt_io_ch) { 4002 spdk_put_io_channel(ch->channel); 4003 spdk_put_io_channel(ch->accel_channel); 4004 return -1; 4005 } 4006 4007 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4008 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4009 if (shared_resource->shared_ch == ch->channel) { 4010 spdk_put_io_channel(mgmt_io_ch); 4011 shared_resource->ref++; 4012 break; 4013 } 4014 } 4015 4016 if (shared_resource == NULL) { 4017 shared_resource = calloc(1, sizeof(*shared_resource)); 4018 if (shared_resource == NULL) { 4019 spdk_put_io_channel(ch->channel); 4020 spdk_put_io_channel(ch->accel_channel); 4021 spdk_put_io_channel(mgmt_io_ch); 4022 return -1; 4023 } 4024 4025 shared_resource->mgmt_ch = mgmt_ch; 4026 shared_resource->io_outstanding = 0; 4027 TAILQ_INIT(&shared_resource->nomem_io); 4028 shared_resource->nomem_threshold = 0; 4029 shared_resource->shared_ch = ch->channel; 4030 shared_resource->ref = 1; 4031 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4032 } 4033 4034 ch->io_outstanding = 0; 4035 TAILQ_INIT(&ch->queued_resets); 4036 TAILQ_INIT(&ch->locked_ranges); 4037 ch->flags = 0; 4038 ch->shared_resource = shared_resource; 4039 4040 TAILQ_INIT(&ch->io_submitted); 4041 TAILQ_INIT(&ch->io_locked); 4042 TAILQ_INIT(&ch->io_accel_exec); 4043 TAILQ_INIT(&ch->io_memory_domain); 4044 4045 ch->stat = bdev_alloc_io_stat(false); 4046 if (ch->stat == NULL) { 4047 
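/* The per-channel I/O stat could not be allocated; release the channels and shared resource
 * acquired above before failing channel creation. */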
bdev_channel_destroy_resource(ch); 4048 return -1; 4049 } 4050 4051 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4052 4053 #ifdef SPDK_CONFIG_VTUNE 4054 { 4055 char *name; 4056 __itt_init_ittlib(NULL, 0); 4057 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4058 if (!name) { 4059 bdev_channel_destroy_resource(ch); 4060 return -1; 4061 } 4062 ch->handle = __itt_string_handle_create(name); 4063 free(name); 4064 ch->start_tsc = spdk_get_ticks(); 4065 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4066 ch->prev_stat = bdev_alloc_io_stat(false); 4067 if (ch->prev_stat == NULL) { 4068 bdev_channel_destroy_resource(ch); 4069 return -1; 4070 } 4071 } 4072 #endif 4073 4074 spdk_spin_lock(&bdev->internal.spinlock); 4075 bdev_enable_qos(bdev, ch); 4076 4077 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4078 struct lba_range *new_range; 4079 4080 new_range = calloc(1, sizeof(*new_range)); 4081 if (new_range == NULL) { 4082 spdk_spin_unlock(&bdev->internal.spinlock); 4083 bdev_channel_destroy_resource(ch); 4084 return -1; 4085 } 4086 new_range->length = range->length; 4087 new_range->offset = range->offset; 4088 new_range->locked_ctx = range->locked_ctx; 4089 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4090 } 4091 4092 spdk_spin_unlock(&bdev->internal.spinlock); 4093 4094 return 0; 4095 } 4096 4097 static int 4098 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4099 void *cb_ctx) 4100 { 4101 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4102 struct spdk_bdev_io *bdev_io; 4103 uint64_t buf_len; 4104 4105 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4106 if (bdev_io->internal.ch == bdev_ch) { 4107 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4108 spdk_iobuf_entry_abort(ch, entry, buf_len); 4109 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4110 } 4111 4112 return 0; 4113 } 4114 4115 /* 4116 * Abort I/O that are waiting on a data buffer. 4117 */ 4118 static void 4119 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4120 { 4121 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4122 bdev_abort_all_buf_io_cb, ch); 4123 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4124 bdev_abort_all_buf_io_cb, ch); 4125 } 4126 4127 /* 4128 * Abort I/O that are queued waiting for submission. These types of I/O are 4129 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4130 */ 4131 static void 4132 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4133 { 4134 struct spdk_bdev_io *bdev_io, *tmp; 4135 4136 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4137 if (bdev_io->internal.ch == ch) { 4138 TAILQ_REMOVE(queue, bdev_io, internal.link); 4139 /* 4140 * spdk_bdev_io_complete() assumes that the completed I/O had 4141 * been submitted to the bdev module. Since in this case it 4142 * hadn't, bump io_outstanding to account for the decrement 4143 * that spdk_bdev_io_complete() will do. 
4144 */ 4145 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4146 bdev_io_increment_outstanding(ch, ch->shared_resource); 4147 } 4148 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4149 } 4150 } 4151 } 4152 4153 static bool 4154 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4155 { 4156 struct spdk_bdev_io *bdev_io; 4157 4158 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4159 if (bdev_io == bio_to_abort) { 4160 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4161 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4162 return true; 4163 } 4164 } 4165 4166 return false; 4167 } 4168 4169 static int 4170 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4171 { 4172 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4173 uint64_t buf_len; 4174 4175 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4176 if (bdev_io == bio_to_abort) { 4177 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4178 spdk_iobuf_entry_abort(ch, entry, buf_len); 4179 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4180 return 1; 4181 } 4182 4183 return 0; 4184 } 4185 4186 static bool 4187 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4188 { 4189 int rc; 4190 4191 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4192 bdev_abort_buf_io_cb, bio_to_abort); 4193 if (rc == 1) { 4194 return true; 4195 } 4196 4197 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4198 bdev_abort_buf_io_cb, bio_to_abort); 4199 return rc == 1; 4200 } 4201 4202 static void 4203 bdev_qos_channel_destroy(void *cb_arg) 4204 { 4205 struct spdk_bdev_qos *qos = cb_arg; 4206 4207 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4208 spdk_poller_unregister(&qos->poller); 4209 4210 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4211 4212 free(qos); 4213 } 4214 4215 static int 4216 bdev_qos_destroy(struct spdk_bdev *bdev) 4217 { 4218 int i; 4219 4220 /* 4221 * Cleanly shutting down the QoS poller is tricky, because 4222 * during the asynchronous operation the user could open 4223 * a new descriptor and create a new channel, spawning 4224 * a new QoS poller. 4225 * 4226 * The strategy is to create a new QoS structure here and swap it 4227 * in. The shutdown path then continues to refer to the old one 4228 * until it completes and then releases it. 4229 */ 4230 struct spdk_bdev_qos *new_qos, *old_qos; 4231 4232 old_qos = bdev->internal.qos; 4233 4234 new_qos = calloc(1, sizeof(*new_qos)); 4235 if (!new_qos) { 4236 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4237 return -ENOMEM; 4238 } 4239 4240 /* Copy the old QoS data into the newly allocated structure */ 4241 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4242 4243 /* Zero out the key parts of the QoS structure */ 4244 new_qos->ch = NULL; 4245 new_qos->thread = NULL; 4246 new_qos->poller = NULL; 4247 TAILQ_INIT(&new_qos->queued); 4248 /* 4249 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4250 * It will be used later for the new QoS structure. 
4251 */ 4252 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4253 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4254 new_qos->rate_limits[i].min_per_timeslice = 0; 4255 new_qos->rate_limits[i].max_per_timeslice = 0; 4256 } 4257 4258 bdev->internal.qos = new_qos; 4259 4260 if (old_qos->thread == NULL) { 4261 free(old_qos); 4262 } else { 4263 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4264 } 4265 4266 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4267 * been destroyed yet. The destruction path will end up waiting for the final 4268 * channel to be put before it releases resources. */ 4269 4270 return 0; 4271 } 4272 4273 void 4274 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4275 { 4276 total->bytes_read += add->bytes_read; 4277 total->num_read_ops += add->num_read_ops; 4278 total->bytes_written += add->bytes_written; 4279 total->num_write_ops += add->num_write_ops; 4280 total->bytes_unmapped += add->bytes_unmapped; 4281 total->num_unmap_ops += add->num_unmap_ops; 4282 total->bytes_copied += add->bytes_copied; 4283 total->num_copy_ops += add->num_copy_ops; 4284 total->read_latency_ticks += add->read_latency_ticks; 4285 total->write_latency_ticks += add->write_latency_ticks; 4286 total->unmap_latency_ticks += add->unmap_latency_ticks; 4287 total->copy_latency_ticks += add->copy_latency_ticks; 4288 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4289 total->max_read_latency_ticks = add->max_read_latency_ticks; 4290 } 4291 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4292 total->min_read_latency_ticks = add->min_read_latency_ticks; 4293 } 4294 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4295 total->max_write_latency_ticks = add->max_write_latency_ticks; 4296 } 4297 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4298 total->min_write_latency_ticks = add->min_write_latency_ticks; 4299 } 4300 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4301 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4302 } 4303 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4304 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4305 } 4306 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4307 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4308 } 4309 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4310 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4311 } 4312 } 4313 4314 static void 4315 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4316 { 4317 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4318 4319 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4320 memcpy(to_stat->io_error, from_stat->io_error, 4321 sizeof(struct spdk_bdev_io_error_stat)); 4322 } 4323 } 4324 4325 void 4326 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4327 { 4328 stat->max_read_latency_ticks = 0; 4329 stat->min_read_latency_ticks = UINT64_MAX; 4330 stat->max_write_latency_ticks = 0; 4331 stat->min_write_latency_ticks = UINT64_MAX; 4332 stat->max_unmap_latency_ticks = 0; 4333 stat->min_unmap_latency_ticks = UINT64_MAX; 4334 stat->max_copy_latency_ticks = 0; 4335 stat->min_copy_latency_ticks = UINT64_MAX; 4336 4337 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4338 return; 4339 } 
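/* Modes other than SPDK_BDEV_RESET_STAT_ALL only reset the min/max latency fields above.
 * SPDK_BDEV_RESET_STAT_ALL additionally clears the cumulative byte/operation counters,
 * the latency sums, and the io_error table below. */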
4340 4341 stat->bytes_read = 0; 4342 stat->num_read_ops = 0; 4343 stat->bytes_written = 0; 4344 stat->num_write_ops = 0; 4345 stat->bytes_unmapped = 0; 4346 stat->num_unmap_ops = 0; 4347 stat->bytes_copied = 0; 4348 stat->num_copy_ops = 0; 4349 stat->read_latency_ticks = 0; 4350 stat->write_latency_ticks = 0; 4351 stat->unmap_latency_ticks = 0; 4352 stat->copy_latency_ticks = 0; 4353 4354 if (stat->io_error != NULL) { 4355 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4356 } 4357 } 4358 4359 struct spdk_bdev_io_stat * 4360 bdev_alloc_io_stat(bool io_error_stat) 4361 { 4362 struct spdk_bdev_io_stat *stat; 4363 4364 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4365 if (stat == NULL) { 4366 return NULL; 4367 } 4368 4369 if (io_error_stat) { 4370 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4371 if (stat->io_error == NULL) { 4372 free(stat); 4373 return NULL; 4374 } 4375 } else { 4376 stat->io_error = NULL; 4377 } 4378 4379 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4380 4381 return stat; 4382 } 4383 4384 void 4385 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4386 { 4387 if (stat != NULL) { 4388 free(stat->io_error); 4389 free(stat); 4390 } 4391 } 4392 4393 void 4394 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4395 { 4396 int i; 4397 4398 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4399 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4400 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4401 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4402 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4403 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4404 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4405 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4406 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4407 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4408 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4409 stat->min_read_latency_ticks != UINT64_MAX ? 4410 stat->min_read_latency_ticks : 0); 4411 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4412 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4413 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4414 stat->min_write_latency_ticks != UINT64_MAX ? 4415 stat->min_write_latency_ticks : 0); 4416 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4417 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4418 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4419 stat->min_unmap_latency_ticks != UINT64_MAX ? 4420 stat->min_unmap_latency_ticks : 0); 4421 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4422 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4423 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4424 stat->min_copy_latency_ticks != UINT64_MAX ? 
4425 stat->min_copy_latency_ticks : 0); 4426 4427 if (stat->io_error != NULL) { 4428 spdk_json_write_named_object_begin(w, "io_error"); 4429 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4430 if (stat->io_error->error_status[i] != 0) { 4431 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4432 stat->io_error->error_status[i]); 4433 } 4434 } 4435 spdk_json_write_object_end(w); 4436 } 4437 } 4438 4439 static void 4440 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4441 { 4442 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4443 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4444 4445 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4446 bdev_abort_all_buf_io(mgmt_ch, ch); 4447 } 4448 4449 static void 4450 bdev_channel_destroy(void *io_device, void *ctx_buf) 4451 { 4452 struct spdk_bdev_channel *ch = ctx_buf; 4453 4454 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4455 spdk_get_thread()); 4456 4457 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 4458 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4459 4460 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4461 spdk_spin_lock(&ch->bdev->internal.spinlock); 4462 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4463 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4464 4465 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4466 4467 bdev_channel_abort_queued_ios(ch); 4468 4469 if (ch->histogram) { 4470 spdk_histogram_data_free(ch->histogram); 4471 } 4472 4473 bdev_channel_destroy_resource(ch); 4474 } 4475 4476 /* 4477 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4478 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
4479 */ 4480 static int 4481 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4482 { 4483 struct spdk_bdev_name *tmp; 4484 4485 bdev_name->name = strdup(name); 4486 if (bdev_name->name == NULL) { 4487 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4488 return -ENOMEM; 4489 } 4490 4491 bdev_name->bdev = bdev; 4492 4493 spdk_spin_lock(&g_bdev_mgr.spinlock); 4494 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4495 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4496 4497 if (tmp != NULL) { 4498 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4499 free(bdev_name->name); 4500 return -EEXIST; 4501 } 4502 4503 return 0; 4504 } 4505 4506 static void 4507 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4508 { 4509 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4510 free(bdev_name->name); 4511 } 4512 4513 static void 4514 bdev_name_del(struct spdk_bdev_name *bdev_name) 4515 { 4516 spdk_spin_lock(&g_bdev_mgr.spinlock); 4517 bdev_name_del_unsafe(bdev_name); 4518 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4519 } 4520 4521 int 4522 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4523 { 4524 struct spdk_bdev_alias *tmp; 4525 int ret; 4526 4527 if (alias == NULL) { 4528 SPDK_ERRLOG("Empty alias passed\n"); 4529 return -EINVAL; 4530 } 4531 4532 tmp = calloc(1, sizeof(*tmp)); 4533 if (tmp == NULL) { 4534 SPDK_ERRLOG("Unable to allocate alias\n"); 4535 return -ENOMEM; 4536 } 4537 4538 ret = bdev_name_add(&tmp->alias, bdev, alias); 4539 if (ret != 0) { 4540 free(tmp); 4541 return ret; 4542 } 4543 4544 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4545 4546 return 0; 4547 } 4548 4549 static int 4550 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4551 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4552 { 4553 struct spdk_bdev_alias *tmp; 4554 4555 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4556 if (strcmp(alias, tmp->alias.name) == 0) { 4557 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4558 alias_del_fn(&tmp->alias); 4559 free(tmp); 4560 return 0; 4561 } 4562 } 4563 4564 return -ENOENT; 4565 } 4566 4567 int 4568 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4569 { 4570 int rc; 4571 4572 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4573 if (rc == -ENOENT) { 4574 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4575 } 4576 4577 return rc; 4578 } 4579 4580 void 4581 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4582 { 4583 struct spdk_bdev_alias *p, *tmp; 4584 4585 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4586 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4587 bdev_name_del(&p->alias); 4588 free(p); 4589 } 4590 } 4591 4592 struct spdk_io_channel * 4593 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4594 { 4595 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4596 } 4597 4598 void * 4599 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4600 { 4601 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4602 void *ctx = NULL; 4603 4604 if (bdev->fn_table->get_module_ctx) { 4605 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4606 } 4607 4608 return ctx; 4609 } 4610 4611 const char * 4612 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4613 { 4614 return bdev->module->name; 4615 } 4616 4617 const char * 4618 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4619 { 4620 return bdev->name; 4621 } 4622 4623 const char * 4624 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4625 { 4626 return bdev->product_name; 4627 } 4628 4629 
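/*
 * Editor's note (illustrative, not part of the original source): the simple accessors in this
 * section are typically combined by callers. For example, a bdev's capacity in bytes is
 * spdk_bdev_get_num_blocks(bdev) * spdk_bdev_get_block_size(bdev); a bdev reporting 2097152
 * blocks of 4096 bytes therefore exposes 8 GiB.
 */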
const struct spdk_bdev_aliases_list * 4630 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4631 { 4632 return &bdev->aliases; 4633 } 4634 4635 uint32_t 4636 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4637 { 4638 return bdev->blocklen; 4639 } 4640 4641 uint32_t 4642 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4643 { 4644 return bdev->write_unit_size; 4645 } 4646 4647 uint64_t 4648 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4649 { 4650 return bdev->blockcnt; 4651 } 4652 4653 const char * 4654 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4655 { 4656 return qos_rpc_type[type]; 4657 } 4658 4659 void 4660 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4661 { 4662 int i; 4663 4664 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4665 4666 spdk_spin_lock(&bdev->internal.spinlock); 4667 if (bdev->internal.qos) { 4668 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4669 if (bdev->internal.qos->rate_limits[i].limit != 4670 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4671 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4672 if (bdev_qos_is_iops_rate_limit(i) == false) { 4673 /* Change from Byte to Megabyte which is user visible. */ 4674 limits[i] = limits[i] / 1024 / 1024; 4675 } 4676 } 4677 } 4678 } 4679 spdk_spin_unlock(&bdev->internal.spinlock); 4680 } 4681 4682 size_t 4683 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4684 { 4685 return 1 << bdev->required_alignment; 4686 } 4687 4688 uint32_t 4689 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4690 { 4691 return bdev->optimal_io_boundary; 4692 } 4693 4694 bool 4695 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4696 { 4697 return bdev->write_cache; 4698 } 4699 4700 const struct spdk_uuid * 4701 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4702 { 4703 return &bdev->uuid; 4704 } 4705 4706 uint16_t 4707 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4708 { 4709 return bdev->acwu; 4710 } 4711 4712 uint32_t 4713 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4714 { 4715 return bdev->md_len; 4716 } 4717 4718 bool 4719 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4720 { 4721 return (bdev->md_len != 0) && bdev->md_interleave; 4722 } 4723 4724 bool 4725 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4726 { 4727 return (bdev->md_len != 0) && !bdev->md_interleave; 4728 } 4729 4730 bool 4731 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4732 { 4733 return bdev->zoned; 4734 } 4735 4736 uint32_t 4737 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4738 { 4739 if (spdk_bdev_is_md_interleaved(bdev)) { 4740 return bdev->blocklen - bdev->md_len; 4741 } else { 4742 return bdev->blocklen; 4743 } 4744 } 4745 4746 uint32_t 4747 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4748 { 4749 return bdev->phys_blocklen; 4750 } 4751 4752 static uint32_t 4753 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4754 { 4755 if (!spdk_bdev_is_md_interleaved(bdev)) { 4756 return bdev->blocklen + bdev->md_len; 4757 } else { 4758 return bdev->blocklen; 4759 } 4760 } 4761 4762 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4763 typedef enum spdk_dif_type spdk_dif_type_t; 4764 4765 spdk_dif_type_t 4766 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4767 { 4768 if (bdev->md_len != 0) { 4769 return bdev->dif_type; 4770 } else { 4771 return SPDK_DIF_DISABLE; 4772 } 4773 } 4774 4775 bool 4776 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4777 { 4778 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4779 return bdev->dif_is_head_of_md; 4780 } else { 4781 return false; 4782 } 4783 } 4784 4785 bool 4786 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4787 enum spdk_dif_check_type check_type) 4788 { 4789 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4790 return false; 4791 } 4792 4793 switch (check_type) { 4794 case SPDK_DIF_CHECK_TYPE_REFTAG: 4795 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4796 case SPDK_DIF_CHECK_TYPE_APPTAG: 4797 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4798 case SPDK_DIF_CHECK_TYPE_GUARD: 4799 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4800 default: 4801 return false; 4802 } 4803 } 4804 4805 static uint32_t 4806 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4807 { 4808 uint64_t aligned_length, max_write_blocks; 4809 4810 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4811 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4812 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4813 4814 return max_write_blocks; 4815 } 4816 4817 uint32_t 4818 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4819 { 4820 return bdev->max_copy; 4821 } 4822 4823 uint64_t 4824 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4825 { 4826 return bdev->internal.measured_queue_depth; 4827 } 4828 4829 uint64_t 4830 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4831 { 4832 return bdev->internal.period; 4833 } 4834 4835 uint64_t 4836 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4837 { 4838 return bdev->internal.weighted_io_time; 4839 } 4840 4841 uint64_t 4842 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4843 { 4844 return bdev->internal.io_time; 4845 } 4846 4847 static void bdev_update_qd_sampling_period(void *ctx); 4848 4849 static void 4850 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4851 { 4852 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4853 4854 if (bdev->internal.measured_queue_depth) { 4855 bdev->internal.io_time += bdev->internal.period; 4856 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4857 } 4858 4859 bdev->internal.qd_poll_in_progress = false; 4860 4861 bdev_update_qd_sampling_period(bdev); 4862 } 4863 4864 static void 4865 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4866 struct spdk_io_channel *io_ch, void *_ctx) 4867 { 4868 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4869 4870 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4871 spdk_bdev_for_each_channel_continue(i, 0); 4872 } 4873 4874 static int 4875 bdev_calculate_measured_queue_depth(void *ctx) 4876 { 4877 struct spdk_bdev *bdev = ctx; 4878 4879 bdev->internal.qd_poll_in_progress = true; 4880 bdev->internal.temporary_queue_depth = 0; 4881 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4882 return SPDK_POLLER_BUSY; 4883 } 4884 4885 static void 4886 bdev_update_qd_sampling_period(void *ctx) 4887 { 4888 
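/* Apply a pending change to the queue depth sampling period. This is invoked on the
 * internal.qd_desc thread (via spdk_thread_send_msg() or from the sampling completion callback)
 * and defers the update while a queue depth poll is still in progress. A new period of 0 tears
 * down the sampling poller and closes the descriptor. */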
struct spdk_bdev *bdev = ctx; 4889 4890 if (bdev->internal.period == bdev->internal.new_period) { 4891 return; 4892 } 4893 4894 if (bdev->internal.qd_poll_in_progress) { 4895 return; 4896 } 4897 4898 bdev->internal.period = bdev->internal.new_period; 4899 4900 spdk_poller_unregister(&bdev->internal.qd_poller); 4901 if (bdev->internal.period != 0) { 4902 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4903 bdev, bdev->internal.period); 4904 } else { 4905 spdk_bdev_close(bdev->internal.qd_desc); 4906 bdev->internal.qd_desc = NULL; 4907 } 4908 } 4909 4910 static void 4911 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4912 { 4913 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4914 } 4915 4916 void 4917 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4918 { 4919 int rc; 4920 4921 if (bdev->internal.new_period == period) { 4922 return; 4923 } 4924 4925 bdev->internal.new_period = period; 4926 4927 if (bdev->internal.qd_desc != NULL) { 4928 assert(bdev->internal.period != 0); 4929 4930 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4931 bdev_update_qd_sampling_period, bdev); 4932 return; 4933 } 4934 4935 assert(bdev->internal.period == 0); 4936 4937 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4938 NULL, &bdev->internal.qd_desc); 4939 if (rc != 0) { 4940 return; 4941 } 4942 4943 bdev->internal.period = period; 4944 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4945 bdev, period); 4946 } 4947 4948 struct bdev_get_current_qd_ctx { 4949 uint64_t current_qd; 4950 spdk_bdev_get_current_qd_cb cb_fn; 4951 void *cb_arg; 4952 }; 4953 4954 static void 4955 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4956 { 4957 struct bdev_get_current_qd_ctx *ctx = _ctx; 4958 4959 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4960 4961 free(ctx); 4962 } 4963 4964 static void 4965 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4966 struct spdk_io_channel *io_ch, void *_ctx) 4967 { 4968 struct bdev_get_current_qd_ctx *ctx = _ctx; 4969 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4970 4971 ctx->current_qd += bdev_ch->io_outstanding; 4972 4973 spdk_bdev_for_each_channel_continue(i, 0); 4974 } 4975 4976 void 4977 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4978 void *cb_arg) 4979 { 4980 struct bdev_get_current_qd_ctx *ctx; 4981 4982 assert(cb_fn != NULL); 4983 4984 ctx = calloc(1, sizeof(*ctx)); 4985 if (ctx == NULL) { 4986 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4987 return; 4988 } 4989 4990 ctx->cb_fn = cb_fn; 4991 ctx->cb_arg = cb_arg; 4992 4993 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4994 } 4995 4996 static void 4997 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4998 { 4999 assert(desc->thread == spdk_get_thread()); 5000 5001 spdk_spin_lock(&desc->spinlock); 5002 desc->refs--; 5003 if (!desc->closed) { 5004 spdk_spin_unlock(&desc->spinlock); 5005 desc->callback.event_fn(type, 5006 desc->bdev, 5007 desc->callback.ctx); 5008 return; 5009 } else if (desc->refs == 0) { 5010 /* This descriptor was closed after this event_notify message was sent. 5011 * spdk_bdev_close() could not free the descriptor since this message was 5012 * in flight, so we free it now using bdev_desc_free(). 
5013 */ 5014 spdk_spin_unlock(&desc->spinlock); 5015 bdev_desc_free(desc); 5016 return; 5017 } 5018 spdk_spin_unlock(&desc->spinlock); 5019 } 5020 5021 static void 5022 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5023 { 5024 spdk_spin_lock(&desc->spinlock); 5025 desc->refs++; 5026 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5027 spdk_spin_unlock(&desc->spinlock); 5028 } 5029 5030 static void 5031 _resize_notify(void *ctx) 5032 { 5033 struct spdk_bdev_desc *desc = ctx; 5034 5035 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5036 } 5037 5038 int 5039 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5040 { 5041 struct spdk_bdev_desc *desc; 5042 int ret; 5043 5044 if (size == bdev->blockcnt) { 5045 return 0; 5046 } 5047 5048 spdk_spin_lock(&bdev->internal.spinlock); 5049 5050 /* bdev has open descriptors */ 5051 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5052 bdev->blockcnt > size) { 5053 ret = -EBUSY; 5054 } else { 5055 bdev->blockcnt = size; 5056 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5057 event_notify(desc, _resize_notify); 5058 } 5059 ret = 0; 5060 } 5061 5062 spdk_spin_unlock(&bdev->internal.spinlock); 5063 5064 return ret; 5065 } 5066 5067 /* 5068 * Convert I/O offset and length from bytes to blocks. 5069 * 5070 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5071 */ 5072 static uint64_t 5073 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 5074 uint64_t num_bytes, uint64_t *num_blocks) 5075 { 5076 uint32_t block_size = bdev->blocklen; 5077 uint8_t shift_cnt; 5078 5079 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 5080 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5081 shift_cnt = spdk_u32log2(block_size); 5082 *offset_blocks = offset_bytes >> shift_cnt; 5083 *num_blocks = num_bytes >> shift_cnt; 5084 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5085 (num_bytes - (*num_blocks << shift_cnt)); 5086 } else { 5087 *offset_blocks = offset_bytes / block_size; 5088 *num_blocks = num_bytes / block_size; 5089 return (offset_bytes % block_size) | (num_bytes % block_size); 5090 } 5091 } 5092 5093 static bool 5094 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5095 { 5096 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5097 * has been an overflow and hence the offset has been wrapped around */ 5098 if (offset_blocks + num_blocks < offset_blocks) { 5099 return false; 5100 } 5101 5102 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5103 if (offset_blocks + num_blocks > bdev->blockcnt) { 5104 return false; 5105 } 5106 5107 return true; 5108 } 5109 5110 static void 5111 bdev_seek_complete_cb(void *ctx) 5112 { 5113 struct spdk_bdev_io *bdev_io = ctx; 5114 5115 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5116 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5117 } 5118 5119 static int 5120 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5121 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5122 spdk_bdev_io_completion_cb cb, void *cb_arg) 5123 { 5124 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5125 struct spdk_bdev_io *bdev_io; 5126 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5127 5128 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == 
SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5129 5130 /* Check if offset_blocks is valid looking at the validity of one block */ 5131 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5132 return -EINVAL; 5133 } 5134 5135 bdev_io = bdev_channel_get_io(channel); 5136 if (!bdev_io) { 5137 return -ENOMEM; 5138 } 5139 5140 bdev_io->internal.ch = channel; 5141 bdev_io->internal.desc = desc; 5142 bdev_io->type = io_type; 5143 bdev_io->u.bdev.offset_blocks = offset_blocks; 5144 bdev_io->u.bdev.memory_domain = NULL; 5145 bdev_io->u.bdev.memory_domain_ctx = NULL; 5146 bdev_io->u.bdev.accel_sequence = NULL; 5147 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5148 5149 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5150 /* In case bdev doesn't support seek to next data/hole offset, 5151 * it is assumed that only data and no holes are present */ 5152 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5153 bdev_io->u.bdev.seek.offset = offset_blocks; 5154 } else { 5155 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5156 } 5157 5158 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5159 return 0; 5160 } 5161 5162 bdev_io_submit(bdev_io); 5163 return 0; 5164 } 5165 5166 int 5167 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5168 uint64_t offset_blocks, 5169 spdk_bdev_io_completion_cb cb, void *cb_arg) 5170 { 5171 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5172 } 5173 5174 int 5175 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5176 uint64_t offset_blocks, 5177 spdk_bdev_io_completion_cb cb, void *cb_arg) 5178 { 5179 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5180 } 5181 5182 uint64_t 5183 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 5184 { 5185 return bdev_io->u.bdev.seek.offset; 5186 } 5187 5188 static int 5189 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5190 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5191 spdk_bdev_io_completion_cb cb, void *cb_arg) 5192 { 5193 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5194 struct spdk_bdev_io *bdev_io; 5195 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5196 5197 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5198 return -EINVAL; 5199 } 5200 5201 bdev_io = bdev_channel_get_io(channel); 5202 if (!bdev_io) { 5203 return -ENOMEM; 5204 } 5205 5206 bdev_io->internal.ch = channel; 5207 bdev_io->internal.desc = desc; 5208 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5209 bdev_io->u.bdev.iovs = &bdev_io->iov; 5210 bdev_io->u.bdev.iovs[0].iov_base = buf; 5211 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5212 bdev_io->u.bdev.iovcnt = 1; 5213 bdev_io->u.bdev.md_buf = md_buf; 5214 bdev_io->u.bdev.num_blocks = num_blocks; 5215 bdev_io->u.bdev.offset_blocks = offset_blocks; 5216 bdev_io->u.bdev.memory_domain = NULL; 5217 bdev_io->u.bdev.memory_domain_ctx = NULL; 5218 bdev_io->u.bdev.accel_sequence = NULL; 5219 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5220 5221 bdev_io_submit(bdev_io); 5222 return 0; 5223 } 5224 5225 int 5226 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5227 void *buf, uint64_t offset, uint64_t nbytes, 5228 spdk_bdev_io_completion_cb cb, void *cb_arg) 5229 { 5230 uint64_t offset_blocks, num_blocks; 5231 5232 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5233 nbytes, &num_blocks) != 0) { 5234 return -EINVAL; 5235 } 5236 5237 
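/*
 * Editor's note (illustrative, not part of the original source): bdev_bytes_to_blocks() above
 * only succeeds when both offset and nbytes are multiples of the block size. On a 4096-byte-block
 * bdev, offset = 1048576 and nbytes = 8192 convert to offset_blocks = 256 and num_blocks = 2,
 * while a misaligned value such as nbytes = 4100 makes spdk_bdev_read() return -EINVAL.
 */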
return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5238 } 5239 5240 int 5241 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5242 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5243 spdk_bdev_io_completion_cb cb, void *cb_arg) 5244 { 5245 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5246 } 5247 5248 int 5249 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5250 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5251 spdk_bdev_io_completion_cb cb, void *cb_arg) 5252 { 5253 struct iovec iov = { 5254 .iov_base = buf, 5255 }; 5256 5257 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5258 return -EINVAL; 5259 } 5260 5261 if (md_buf && !_is_buf_allocated(&iov)) { 5262 return -EINVAL; 5263 } 5264 5265 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5266 cb, cb_arg); 5267 } 5268 5269 int 5270 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5271 struct iovec *iov, int iovcnt, 5272 uint64_t offset, uint64_t nbytes, 5273 spdk_bdev_io_completion_cb cb, void *cb_arg) 5274 { 5275 uint64_t offset_blocks, num_blocks; 5276 5277 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5278 nbytes, &num_blocks) != 0) { 5279 return -EINVAL; 5280 } 5281 5282 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5283 } 5284 5285 static int 5286 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5287 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5288 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5289 struct spdk_accel_sequence *seq, 5290 spdk_bdev_io_completion_cb cb, void *cb_arg) 5291 { 5292 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5293 struct spdk_bdev_io *bdev_io; 5294 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5295 5296 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5297 return -EINVAL; 5298 } 5299 5300 bdev_io = bdev_channel_get_io(channel); 5301 if (!bdev_io) { 5302 return -ENOMEM; 5303 } 5304 5305 bdev_io->internal.ch = channel; 5306 bdev_io->internal.desc = desc; 5307 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5308 bdev_io->u.bdev.iovs = iov; 5309 bdev_io->u.bdev.iovcnt = iovcnt; 5310 bdev_io->u.bdev.md_buf = md_buf; 5311 bdev_io->u.bdev.num_blocks = num_blocks; 5312 bdev_io->u.bdev.offset_blocks = offset_blocks; 5313 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5314 bdev_io->internal.memory_domain = domain; 5315 bdev_io->internal.memory_domain_ctx = domain_ctx; 5316 bdev_io->internal.accel_sequence = seq; 5317 bdev_io->internal.has_accel_sequence = seq != NULL; 5318 bdev_io->u.bdev.memory_domain = domain; 5319 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5320 bdev_io->u.bdev.accel_sequence = seq; 5321 5322 _bdev_io_submit_ext(desc, bdev_io); 5323 5324 return 0; 5325 } 5326 5327 int 5328 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5329 struct iovec *iov, int iovcnt, 5330 uint64_t offset_blocks, uint64_t num_blocks, 5331 spdk_bdev_io_completion_cb cb, void *cb_arg) 5332 { 5333 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5334 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5335 } 5336 5337 int 5338 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5339 struct 
iovec *iov, int iovcnt, void *md_buf,
5340 			       uint64_t offset_blocks, uint64_t num_blocks,
5341 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
5342 {
5343 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5344 		return -EINVAL;
5345 	}
5346 
5347 	if (md_buf && !_is_buf_allocated(iov)) {
5348 		return -EINVAL;
5349 	}
5350 
5351 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5352 					 num_blocks, NULL, NULL, NULL, cb, cb_arg);
5353 }
5354 
5355 static inline bool
5356 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov)
5357 {
5358 	/*
5359 	 * Require opts->size to cover at least the members that existed when
5360 	 * spdk_bdev_ext_io_opts was first introduced (ac6f2bdd8d), since accesses to
5361 	 * those members are not otherwise checked internally.
5362 	 */
5363 	return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) +
5364 	       sizeof(opts->metadata) &&
5365 	       opts->size <= sizeof(*opts) &&
5366 	       /* When a memory domain is used, the user must provide data buffers */
5367 	       (!opts->memory_domain || (iov && iov[0].iov_base));
5368 }
5369 
5370 int
5371 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5372 			   struct iovec *iov, int iovcnt,
5373 			   uint64_t offset_blocks, uint64_t num_blocks,
5374 			   spdk_bdev_io_completion_cb cb, void *cb_arg,
5375 			   struct spdk_bdev_ext_io_opts *opts)
5376 {
5377 	void *md = NULL;
5378 
5379 	if (opts) {
5380 		if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
5381 			return -EINVAL;
5382 		}
5383 		md = opts->metadata;
5384 	}
5385 
5386 	if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5387 		return -EINVAL;
5388 	}
5389 
5390 	if (md && !_is_buf_allocated(iov)) {
5391 		return -EINVAL;
5392 	}
5393 
5394 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks,
5395 					 num_blocks,
5396 					 bdev_get_ext_io_opt(opts, memory_domain, NULL),
5397 					 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL),
5398 					 bdev_get_ext_io_opt(opts, accel_sequence, NULL),
5399 					 cb, cb_arg);
5400 }
5401 
5402 static int
5403 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5404 			  void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5405 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5406 {
5407 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5408 	struct spdk_bdev_io *bdev_io;
5409 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5410 
5411 	if (!desc->write) {
5412 		return -EBADF;
5413 	}
5414 
5415 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5416 		return -EINVAL;
5417 	}
5418 
5419 	bdev_io = bdev_channel_get_io(channel);
5420 	if (!bdev_io) {
5421 		return -ENOMEM;
5422 	}
5423 
5424 	bdev_io->internal.ch = channel;
5425 	bdev_io->internal.desc = desc;
5426 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
5427 	bdev_io->u.bdev.iovs = &bdev_io->iov;
5428 	bdev_io->u.bdev.iovs[0].iov_base = buf;
5429 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
5430 	bdev_io->u.bdev.iovcnt = 1;
5431 	bdev_io->u.bdev.md_buf = md_buf;
5432 	bdev_io->u.bdev.num_blocks = num_blocks;
5433 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5434 	bdev_io->u.bdev.memory_domain = NULL;
5435 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5436 	bdev_io->u.bdev.accel_sequence = NULL;
5437 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5438 
5439 	bdev_io_submit(bdev_io);
5440 	return 0;
5441 }
5442 
5443 int
5444 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5445 		void *buf, uint64_t offset, uint64_t nbytes,
5446 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5447 {
5448 	uint64_t
offset_blocks, num_blocks; 5449 5450 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5451 nbytes, &num_blocks) != 0) { 5452 return -EINVAL; 5453 } 5454 5455 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5456 } 5457 5458 int 5459 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5460 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5461 spdk_bdev_io_completion_cb cb, void *cb_arg) 5462 { 5463 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5464 cb, cb_arg); 5465 } 5466 5467 int 5468 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5469 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5470 spdk_bdev_io_completion_cb cb, void *cb_arg) 5471 { 5472 struct iovec iov = { 5473 .iov_base = buf, 5474 }; 5475 5476 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5477 return -EINVAL; 5478 } 5479 5480 if (md_buf && !_is_buf_allocated(&iov)) { 5481 return -EINVAL; 5482 } 5483 5484 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5485 cb, cb_arg); 5486 } 5487 5488 static int 5489 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5490 struct iovec *iov, int iovcnt, void *md_buf, 5491 uint64_t offset_blocks, uint64_t num_blocks, 5492 struct spdk_memory_domain *domain, void *domain_ctx, 5493 struct spdk_accel_sequence *seq, 5494 spdk_bdev_io_completion_cb cb, void *cb_arg) 5495 { 5496 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5497 struct spdk_bdev_io *bdev_io; 5498 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5499 5500 if (!desc->write) { 5501 return -EBADF; 5502 } 5503 5504 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5505 return -EINVAL; 5506 } 5507 5508 bdev_io = bdev_channel_get_io(channel); 5509 if (!bdev_io) { 5510 return -ENOMEM; 5511 } 5512 5513 bdev_io->internal.ch = channel; 5514 bdev_io->internal.desc = desc; 5515 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5516 bdev_io->u.bdev.iovs = iov; 5517 bdev_io->u.bdev.iovcnt = iovcnt; 5518 bdev_io->u.bdev.md_buf = md_buf; 5519 bdev_io->u.bdev.num_blocks = num_blocks; 5520 bdev_io->u.bdev.offset_blocks = offset_blocks; 5521 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5522 bdev_io->internal.memory_domain = domain; 5523 bdev_io->internal.memory_domain_ctx = domain_ctx; 5524 bdev_io->internal.accel_sequence = seq; 5525 bdev_io->internal.has_accel_sequence = seq != NULL; 5526 bdev_io->u.bdev.memory_domain = domain; 5527 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5528 bdev_io->u.bdev.accel_sequence = seq; 5529 5530 _bdev_io_submit_ext(desc, bdev_io); 5531 5532 return 0; 5533 } 5534 5535 int 5536 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5537 struct iovec *iov, int iovcnt, 5538 uint64_t offset, uint64_t len, 5539 spdk_bdev_io_completion_cb cb, void *cb_arg) 5540 { 5541 uint64_t offset_blocks, num_blocks; 5542 5543 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5544 len, &num_blocks) != 0) { 5545 return -EINVAL; 5546 } 5547 5548 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5549 } 5550 5551 int 5552 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5553 struct iovec *iov, int iovcnt, 5554 uint64_t offset_blocks, uint64_t num_blocks, 5555 spdk_bdev_io_completion_cb cb, void *cb_arg) 5556 { 
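/*
 * Usage sketch (illustrative only; my_desc, my_ch and write_done are hypothetical names, not
 * part of this file): the iovecs passed here must describe exactly num_blocks * blocklen bytes.
 * For a bdev with 512-byte blocks:
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr, .iov_len = 512 },
 *		{ .iov_base = payload, .iov_len = 4096 },
 *	};
 *	int rc = spdk_bdev_writev_blocks(my_desc, my_ch, iov, 2, 0, 9, write_done, NULL);
 *
 * A return of -ENOMEM means no spdk_bdev_io was available; the caller may retry later, e.g.
 * via spdk_bdev_queue_io_wait().
 */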
5557 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5558 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5559 } 5560 5561 int 5562 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5563 struct iovec *iov, int iovcnt, void *md_buf, 5564 uint64_t offset_blocks, uint64_t num_blocks, 5565 spdk_bdev_io_completion_cb cb, void *cb_arg) 5566 { 5567 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5568 return -EINVAL; 5569 } 5570 5571 if (md_buf && !_is_buf_allocated(iov)) { 5572 return -EINVAL; 5573 } 5574 5575 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5576 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5577 } 5578 5579 int 5580 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5581 struct iovec *iov, int iovcnt, 5582 uint64_t offset_blocks, uint64_t num_blocks, 5583 spdk_bdev_io_completion_cb cb, void *cb_arg, 5584 struct spdk_bdev_ext_io_opts *opts) 5585 { 5586 void *md = NULL; 5587 5588 if (opts) { 5589 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5590 return -EINVAL; 5591 } 5592 md = opts->metadata; 5593 } 5594 5595 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5596 return -EINVAL; 5597 } 5598 5599 if (md && !_is_buf_allocated(iov)) { 5600 return -EINVAL; 5601 } 5602 5603 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5604 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5605 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5606 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5607 cb, cb_arg); 5608 } 5609 5610 static void 5611 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5612 { 5613 struct spdk_bdev_io *parent_io = cb_arg; 5614 struct spdk_bdev *bdev = parent_io->bdev; 5615 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5616 int i, rc = 0; 5617 5618 if (!success) { 5619 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5620 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5621 spdk_bdev_free_io(bdev_io); 5622 return; 5623 } 5624 5625 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5626 rc = memcmp(read_buf, 5627 parent_io->u.bdev.iovs[i].iov_base, 5628 parent_io->u.bdev.iovs[i].iov_len); 5629 if (rc) { 5630 break; 5631 } 5632 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5633 } 5634 5635 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5636 rc = memcmp(bdev_io->u.bdev.md_buf, 5637 parent_io->u.bdev.md_buf, 5638 spdk_bdev_get_md_size(bdev)); 5639 } 5640 5641 spdk_bdev_free_io(bdev_io); 5642 5643 if (rc == 0) { 5644 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5645 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5646 } else { 5647 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5648 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5649 } 5650 } 5651 5652 static void 5653 bdev_compare_do_read(void *_bdev_io) 5654 { 5655 struct spdk_bdev_io *bdev_io = _bdev_io; 5656 int rc; 5657 5658 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5659 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5660 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5661 bdev_compare_do_read_done, bdev_io); 5662 5663 if (rc == -ENOMEM) { 5664 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5665 } else if (rc != 0) { 5666 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5667 
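	/* The read used to emulate the COMPARE could not be submitted, so fail the
	 * compare I/O and invoke its completion callback directly. */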
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5668 } 5669 } 5670 5671 static int 5672 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5673 struct iovec *iov, int iovcnt, void *md_buf, 5674 uint64_t offset_blocks, uint64_t num_blocks, 5675 spdk_bdev_io_completion_cb cb, void *cb_arg) 5676 { 5677 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5678 struct spdk_bdev_io *bdev_io; 5679 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5680 5681 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5682 return -EINVAL; 5683 } 5684 5685 bdev_io = bdev_channel_get_io(channel); 5686 if (!bdev_io) { 5687 return -ENOMEM; 5688 } 5689 5690 bdev_io->internal.ch = channel; 5691 bdev_io->internal.desc = desc; 5692 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5693 bdev_io->u.bdev.iovs = iov; 5694 bdev_io->u.bdev.iovcnt = iovcnt; 5695 bdev_io->u.bdev.md_buf = md_buf; 5696 bdev_io->u.bdev.num_blocks = num_blocks; 5697 bdev_io->u.bdev.offset_blocks = offset_blocks; 5698 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5699 bdev_io->u.bdev.memory_domain = NULL; 5700 bdev_io->u.bdev.memory_domain_ctx = NULL; 5701 bdev_io->u.bdev.accel_sequence = NULL; 5702 5703 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5704 bdev_io_submit(bdev_io); 5705 return 0; 5706 } 5707 5708 bdev_compare_do_read(bdev_io); 5709 5710 return 0; 5711 } 5712 5713 int 5714 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5715 struct iovec *iov, int iovcnt, 5716 uint64_t offset_blocks, uint64_t num_blocks, 5717 spdk_bdev_io_completion_cb cb, void *cb_arg) 5718 { 5719 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5720 num_blocks, cb, cb_arg); 5721 } 5722 5723 int 5724 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5725 struct iovec *iov, int iovcnt, void *md_buf, 5726 uint64_t offset_blocks, uint64_t num_blocks, 5727 spdk_bdev_io_completion_cb cb, void *cb_arg) 5728 { 5729 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5730 return -EINVAL; 5731 } 5732 5733 if (md_buf && !_is_buf_allocated(iov)) { 5734 return -EINVAL; 5735 } 5736 5737 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5738 num_blocks, cb, cb_arg); 5739 } 5740 5741 static int 5742 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5743 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5744 spdk_bdev_io_completion_cb cb, void *cb_arg) 5745 { 5746 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5747 struct spdk_bdev_io *bdev_io; 5748 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5749 5750 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5751 return -EINVAL; 5752 } 5753 5754 bdev_io = bdev_channel_get_io(channel); 5755 if (!bdev_io) { 5756 return -ENOMEM; 5757 } 5758 5759 bdev_io->internal.ch = channel; 5760 bdev_io->internal.desc = desc; 5761 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5762 bdev_io->u.bdev.iovs = &bdev_io->iov; 5763 bdev_io->u.bdev.iovs[0].iov_base = buf; 5764 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5765 bdev_io->u.bdev.iovcnt = 1; 5766 bdev_io->u.bdev.md_buf = md_buf; 5767 bdev_io->u.bdev.num_blocks = num_blocks; 5768 bdev_io->u.bdev.offset_blocks = offset_blocks; 5769 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5770 bdev_io->u.bdev.memory_domain = NULL; 5771 bdev_io->u.bdev.memory_domain_ctx = 
NULL; 5772 bdev_io->u.bdev.accel_sequence = NULL; 5773 5774 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5775 bdev_io_submit(bdev_io); 5776 return 0; 5777 } 5778 5779 bdev_compare_do_read(bdev_io); 5780 5781 return 0; 5782 } 5783 5784 int 5785 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5786 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5787 spdk_bdev_io_completion_cb cb, void *cb_arg) 5788 { 5789 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5790 cb, cb_arg); 5791 } 5792 5793 int 5794 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5795 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5796 spdk_bdev_io_completion_cb cb, void *cb_arg) 5797 { 5798 struct iovec iov = { 5799 .iov_base = buf, 5800 }; 5801 5802 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5803 return -EINVAL; 5804 } 5805 5806 if (md_buf && !_is_buf_allocated(&iov)) { 5807 return -EINVAL; 5808 } 5809 5810 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5811 cb, cb_arg); 5812 } 5813 5814 static void 5815 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 5816 { 5817 struct spdk_bdev_io *bdev_io = ctx; 5818 5819 if (unlock_status) { 5820 SPDK_ERRLOG("LBA range unlock failed\n"); 5821 } 5822 5823 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5824 false, bdev_io->internal.caller_ctx); 5825 } 5826 5827 static void 5828 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5829 { 5830 bdev_io->internal.status = status; 5831 5832 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5833 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5834 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5835 } 5836 5837 static void 5838 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5839 { 5840 struct spdk_bdev_io *parent_io = cb_arg; 5841 5842 if (!success) { 5843 SPDK_ERRLOG("Compare and write operation failed\n"); 5844 } 5845 5846 spdk_bdev_free_io(bdev_io); 5847 5848 bdev_comparev_and_writev_blocks_unlock(parent_io, 5849 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5850 } 5851 5852 static void 5853 bdev_compare_and_write_do_write(void *_bdev_io) 5854 { 5855 struct spdk_bdev_io *bdev_io = _bdev_io; 5856 int rc; 5857 5858 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5859 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5860 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5861 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5862 bdev_compare_and_write_do_write_done, bdev_io); 5863 5864 5865 if (rc == -ENOMEM) { 5866 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5867 } else if (rc != 0) { 5868 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5869 } 5870 } 5871 5872 static void 5873 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5874 { 5875 struct spdk_bdev_io *parent_io = cb_arg; 5876 5877 spdk_bdev_free_io(bdev_io); 5878 5879 if (!success) { 5880 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5881 return; 5882 } 5883 5884 bdev_compare_and_write_do_write(parent_io); 5885 } 5886 5887 static void 5888 bdev_compare_and_write_do_compare(void *_bdev_io) 5889 { 5890 struct spdk_bdev_io *bdev_io = _bdev_io; 5891 int rc; 5892 5893 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5894 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5895 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5896 bdev_compare_and_write_do_compare_done, bdev_io); 5897 5898 if (rc == -ENOMEM) { 5899 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5900 } else if (rc != 0) { 5901 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5902 } 5903 } 5904 5905 static void 5906 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 5907 { 5908 struct spdk_bdev_io *bdev_io = ctx; 5909 5910 if (status) { 5911 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5912 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5913 return; 5914 } 5915 5916 bdev_compare_and_write_do_compare(bdev_io); 5917 } 5918 5919 int 5920 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5921 struct iovec *compare_iov, int compare_iovcnt, 5922 struct iovec *write_iov, int write_iovcnt, 5923 uint64_t offset_blocks, uint64_t num_blocks, 5924 spdk_bdev_io_completion_cb cb, void *cb_arg) 5925 { 5926 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5927 struct spdk_bdev_io *bdev_io; 5928 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5929 5930 if (!desc->write) { 5931 return -EBADF; 5932 } 5933 5934 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5935 return -EINVAL; 5936 } 5937 5938 if (num_blocks > bdev->acwu) { 5939 return -EINVAL; 5940 } 5941 5942 bdev_io = bdev_channel_get_io(channel); 5943 if (!bdev_io) { 5944 return -ENOMEM; 5945 } 5946 5947 bdev_io->internal.ch = channel; 5948 bdev_io->internal.desc = desc; 5949 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5950 bdev_io->u.bdev.iovs = compare_iov; 5951 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5952 bdev_io->u.bdev.fused_iovs = write_iov; 5953 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5954 bdev_io->u.bdev.md_buf = NULL; 5955 bdev_io->u.bdev.num_blocks = num_blocks; 5956 bdev_io->u.bdev.offset_blocks = offset_blocks; 5957 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5958 
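	/* If the backing module advertises native COMPARE_AND_WRITE support, the request
	 * is submitted as-is below. Otherwise it is emulated: the LBA range is locked,
	 * the compare is issued, and only on a match is the fused write sent before the
	 * range is unlocked again. */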
bdev_io->u.bdev.memory_domain = NULL; 5959 bdev_io->u.bdev.memory_domain_ctx = NULL; 5960 bdev_io->u.bdev.accel_sequence = NULL; 5961 5962 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5963 bdev_io_submit(bdev_io); 5964 return 0; 5965 } 5966 5967 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5968 bdev_comparev_and_writev_blocks_locked, bdev_io); 5969 } 5970 5971 int 5972 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5973 struct iovec *iov, int iovcnt, 5974 uint64_t offset_blocks, uint64_t num_blocks, 5975 bool populate, 5976 spdk_bdev_io_completion_cb cb, void *cb_arg) 5977 { 5978 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5979 struct spdk_bdev_io *bdev_io; 5980 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5981 5982 if (!desc->write) { 5983 return -EBADF; 5984 } 5985 5986 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5987 return -EINVAL; 5988 } 5989 5990 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5991 return -ENOTSUP; 5992 } 5993 5994 bdev_io = bdev_channel_get_io(channel); 5995 if (!bdev_io) { 5996 return -ENOMEM; 5997 } 5998 5999 bdev_io->internal.ch = channel; 6000 bdev_io->internal.desc = desc; 6001 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6002 bdev_io->u.bdev.num_blocks = num_blocks; 6003 bdev_io->u.bdev.offset_blocks = offset_blocks; 6004 bdev_io->u.bdev.iovs = iov; 6005 bdev_io->u.bdev.iovcnt = iovcnt; 6006 bdev_io->u.bdev.md_buf = NULL; 6007 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6008 bdev_io->u.bdev.zcopy.commit = 0; 6009 bdev_io->u.bdev.zcopy.start = 1; 6010 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6011 bdev_io->u.bdev.memory_domain = NULL; 6012 bdev_io->u.bdev.memory_domain_ctx = NULL; 6013 bdev_io->u.bdev.accel_sequence = NULL; 6014 6015 bdev_io_submit(bdev_io); 6016 6017 return 0; 6018 } 6019 6020 int 6021 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6022 spdk_bdev_io_completion_cb cb, void *cb_arg) 6023 { 6024 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6025 return -EINVAL; 6026 } 6027 6028 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 6029 bdev_io->u.bdev.zcopy.start = 0; 6030 bdev_io->internal.caller_ctx = cb_arg; 6031 bdev_io->internal.cb = cb; 6032 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 6033 6034 bdev_io_submit(bdev_io); 6035 6036 return 0; 6037 } 6038 6039 int 6040 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6041 uint64_t offset, uint64_t len, 6042 spdk_bdev_io_completion_cb cb, void *cb_arg) 6043 { 6044 uint64_t offset_blocks, num_blocks; 6045 6046 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6047 len, &num_blocks) != 0) { 6048 return -EINVAL; 6049 } 6050 6051 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6052 } 6053 6054 int 6055 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6056 uint64_t offset_blocks, uint64_t num_blocks, 6057 spdk_bdev_io_completion_cb cb, void *cb_arg) 6058 { 6059 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6060 struct spdk_bdev_io *bdev_io; 6061 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6062 6063 if (!desc->write) { 6064 return -EBADF; 6065 } 6066 6067 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6068 return -EINVAL; 6069 } 6070 6071 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 6072 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 6073 return -ENOTSUP; 6074 } 6075 6076 bdev_io = bdev_channel_get_io(channel); 6077 6078 if (!bdev_io) { 6079 return -ENOMEM; 6080 } 6081 6082 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 6083 bdev_io->internal.ch = channel; 6084 bdev_io->internal.desc = desc; 6085 bdev_io->u.bdev.offset_blocks = offset_blocks; 6086 bdev_io->u.bdev.num_blocks = num_blocks; 6087 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6088 bdev_io->u.bdev.memory_domain = NULL; 6089 bdev_io->u.bdev.memory_domain_ctx = NULL; 6090 bdev_io->u.bdev.accel_sequence = NULL; 6091 6092 /* If the write_zeroes size is large and should be split, use the generic split 6093 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not. 6094 * 6095 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 6096 * or emulate it using a regular write request otherwise.
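 *
 * The write-based emulation (bdev_write_zero_buffer()) services the request out of a
 * preallocated zero buffer, which is why the assert below checks that a single block,
 * including any metadata, fits within ZERO_BUFFER_SIZE.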
6097 */ 6098 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6099 bdev_io->internal.split) { 6100 bdev_io_submit(bdev_io); 6101 return 0; 6102 } 6103 6104 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6105 6106 return bdev_write_zero_buffer(bdev_io); 6107 } 6108 6109 int 6110 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6111 uint64_t offset, uint64_t nbytes, 6112 spdk_bdev_io_completion_cb cb, void *cb_arg) 6113 { 6114 uint64_t offset_blocks, num_blocks; 6115 6116 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6117 nbytes, &num_blocks) != 0) { 6118 return -EINVAL; 6119 } 6120 6121 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6122 } 6123 6124 int 6125 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6126 uint64_t offset_blocks, uint64_t num_blocks, 6127 spdk_bdev_io_completion_cb cb, void *cb_arg) 6128 { 6129 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6130 struct spdk_bdev_io *bdev_io; 6131 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6132 6133 if (!desc->write) { 6134 return -EBADF; 6135 } 6136 6137 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6138 return -EINVAL; 6139 } 6140 6141 if (num_blocks == 0) { 6142 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 6143 return -EINVAL; 6144 } 6145 6146 bdev_io = bdev_channel_get_io(channel); 6147 if (!bdev_io) { 6148 return -ENOMEM; 6149 } 6150 6151 bdev_io->internal.ch = channel; 6152 bdev_io->internal.desc = desc; 6153 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6154 6155 bdev_io->u.bdev.iovs = &bdev_io->iov; 6156 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6157 bdev_io->u.bdev.iovs[0].iov_len = 0; 6158 bdev_io->u.bdev.iovcnt = 1; 6159 6160 bdev_io->u.bdev.offset_blocks = offset_blocks; 6161 bdev_io->u.bdev.num_blocks = num_blocks; 6162 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6163 bdev_io->u.bdev.memory_domain = NULL; 6164 bdev_io->u.bdev.memory_domain_ctx = NULL; 6165 bdev_io->u.bdev.accel_sequence = NULL; 6166 6167 bdev_io_submit(bdev_io); 6168 return 0; 6169 } 6170 6171 int 6172 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6173 uint64_t offset, uint64_t length, 6174 spdk_bdev_io_completion_cb cb, void *cb_arg) 6175 { 6176 uint64_t offset_blocks, num_blocks; 6177 6178 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6179 length, &num_blocks) != 0) { 6180 return -EINVAL; 6181 } 6182 6183 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6184 } 6185 6186 int 6187 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6188 uint64_t offset_blocks, uint64_t num_blocks, 6189 spdk_bdev_io_completion_cb cb, void *cb_arg) 6190 { 6191 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6192 struct spdk_bdev_io *bdev_io; 6193 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6194 6195 if (!desc->write) { 6196 return -EBADF; 6197 } 6198 6199 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6200 return -EINVAL; 6201 } 6202 6203 bdev_io = bdev_channel_get_io(channel); 6204 if (!bdev_io) { 6205 return -ENOMEM; 6206 } 6207 6208 bdev_io->internal.ch = channel; 6209 bdev_io->internal.desc = desc; 6210 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6211 bdev_io->u.bdev.iovs = NULL; 6212 bdev_io->u.bdev.iovcnt = 0; 6213 bdev_io->u.bdev.offset_blocks = offset_blocks; 6214 bdev_io->u.bdev.num_blocks = num_blocks; 6215 
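	/* A flush carries no data payload (the iovs above stay NULL); for bdevs that
	 * implement SPDK_BDEV_IO_TYPE_FLUSH it asks the backing module to make writes
	 * already completed in this block range durable (e.g. drain a volatile write
	 * cache). */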
bdev_io->u.bdev.memory_domain = NULL; 6216 bdev_io->u.bdev.memory_domain_ctx = NULL; 6217 bdev_io->u.bdev.accel_sequence = NULL; 6218 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6219 6220 bdev_io_submit(bdev_io); 6221 return 0; 6222 } 6223 6224 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6225 6226 static void 6227 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6228 { 6229 struct spdk_bdev_channel *ch = _ctx; 6230 struct spdk_bdev_io *bdev_io; 6231 6232 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6233 6234 if (status == -EBUSY) { 6235 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6236 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6237 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6238 } else { 6239 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6240 6241 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6242 /* If outstanding IOs are still present and reset_io_drain_timeout 6243 * seconds have passed, start the reset. */ 6244 bdev_io_submit_reset(bdev_io); 6245 } else { 6246 /* We still have an in-progress memory domain pull/push or we're 6247 * executing an accel sequence. Since we cannot abort either of those 6248 * operations, fail the reset request. */ 6249 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6250 } 6251 } 6252 } else { 6253 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6254 SPDK_DEBUGLOG(bdev, 6255 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6256 ch->bdev->name); 6257 /* Mark the completion status as a SUCCESS and complete the reset. */ 6258 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6259 } 6260 } 6261 6262 static void 6263 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6264 struct spdk_io_channel *io_ch, void *_ctx) 6265 { 6266 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6267 int status = 0; 6268 6269 if (cur_ch->io_outstanding > 0 || 6270 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6271 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6272 /* If a channel has outstanding IO, set the status to -EBUSY. This will stop 6273 * further iteration over the rest of the channels and pass non-zero status 6274 * to the callback function.
*/ 6275 status = -EBUSY; 6276 } 6277 spdk_bdev_for_each_channel_continue(i, status); 6278 } 6279 6280 static int 6281 bdev_reset_poll_for_outstanding_io(void *ctx) 6282 { 6283 struct spdk_bdev_channel *ch = ctx; 6284 struct spdk_bdev_io *bdev_io; 6285 6286 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6287 6288 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6289 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6290 bdev_reset_check_outstanding_io_done); 6291 6292 return SPDK_POLLER_BUSY; 6293 } 6294 6295 static void 6296 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6297 { 6298 struct spdk_bdev_channel *ch = _ctx; 6299 struct spdk_bdev_io *bdev_io; 6300 6301 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6302 6303 if (bdev->reset_io_drain_timeout == 0) { 6304 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6305 6306 bdev_io_submit_reset(bdev_io); 6307 return; 6308 } 6309 6310 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6311 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6312 6313 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6314 * submit the reset to the underlying module only if outstanding I/O 6315 * remain after reset_io_drain_timeout seconds have passed. */ 6316 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6317 bdev_reset_check_outstanding_io_done); 6318 } 6319 6320 static void 6321 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6322 struct spdk_io_channel *ch, void *_ctx) 6323 { 6324 struct spdk_bdev_channel *channel; 6325 struct spdk_bdev_mgmt_channel *mgmt_channel; 6326 struct spdk_bdev_shared_resource *shared_resource; 6327 bdev_io_tailq_t tmp_queued; 6328 6329 TAILQ_INIT(&tmp_queued); 6330 6331 channel = __io_ch_to_bdev_ch(ch); 6332 shared_resource = channel->shared_resource; 6333 mgmt_channel = shared_resource->mgmt_ch; 6334 6335 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6336 6337 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6338 /* The QoS object is always valid and readable while 6339 * the channel flag is set, so the lock here should not 6340 * be necessary. We're not in the fast path though, so 6341 * just take it anyway. */ 6342 spdk_spin_lock(&channel->bdev->internal.spinlock); 6343 if (channel->bdev->internal.qos->ch == channel) { 6344 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 6345 } 6346 spdk_spin_unlock(&channel->bdev->internal.spinlock); 6347 } 6348 6349 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6350 bdev_abort_all_buf_io(mgmt_channel, channel); 6351 bdev_abort_all_queued_io(&tmp_queued, channel); 6352 6353 spdk_bdev_for_each_channel_continue(i, 0); 6354 } 6355 6356 static void 6357 bdev_start_reset(void *ctx) 6358 { 6359 struct spdk_bdev_channel *ch = ctx; 6360 6361 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6362 bdev_reset_freeze_channel_done); 6363 } 6364 6365 static void 6366 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6367 { 6368 struct spdk_bdev *bdev = ch->bdev; 6369 6370 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6371 6372 spdk_spin_lock(&bdev->internal.spinlock); 6373 if (bdev->internal.reset_in_progress == NULL) { 6374 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6375 /* 6376 * Take a channel reference for the target bdev for the life of this 6377 * reset. 
This guards against the channel getting destroyed while 6378 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6379 * progress. We will release the reference when this reset is 6380 * completed. 6381 */ 6382 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6383 bdev_start_reset(ch); 6384 } 6385 spdk_spin_unlock(&bdev->internal.spinlock); 6386 } 6387 6388 int 6389 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6390 spdk_bdev_io_completion_cb cb, void *cb_arg) 6391 { 6392 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6393 struct spdk_bdev_io *bdev_io; 6394 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6395 6396 bdev_io = bdev_channel_get_io(channel); 6397 if (!bdev_io) { 6398 return -ENOMEM; 6399 } 6400 6401 bdev_io->internal.ch = channel; 6402 bdev_io->internal.desc = desc; 6403 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6404 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6405 bdev_io->u.reset.ch_ref = NULL; 6406 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6407 6408 spdk_spin_lock(&bdev->internal.spinlock); 6409 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6410 spdk_spin_unlock(&bdev->internal.spinlock); 6411 6412 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6413 internal.ch_link); 6414 6415 bdev_channel_start_reset(channel); 6416 6417 return 0; 6418 } 6419 6420 void 6421 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6422 struct spdk_bdev_io_stat *stat) 6423 { 6424 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6425 6426 bdev_get_io_stat(stat, channel->stat); 6427 } 6428 6429 static void 6430 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6431 { 6432 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6433 6434 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6435 bdev_iostat_ctx->cb_arg, 0); 6436 free(bdev_iostat_ctx); 6437 } 6438 6439 static void 6440 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6441 struct spdk_io_channel *ch, void *_ctx) 6442 { 6443 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6444 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6445 6446 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6447 spdk_bdev_for_each_channel_continue(i, 0); 6448 } 6449 6450 void 6451 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6452 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6453 { 6454 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6455 6456 assert(bdev != NULL); 6457 assert(stat != NULL); 6458 assert(cb != NULL); 6459 6460 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6461 if (bdev_iostat_ctx == NULL) { 6462 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6463 cb(bdev, stat, cb_arg, -ENOMEM); 6464 return; 6465 } 6466 6467 bdev_iostat_ctx->stat = stat; 6468 bdev_iostat_ctx->cb = cb; 6469 bdev_iostat_ctx->cb_arg = cb_arg; 6470 6471 /* Start with the statistics from previously deleted channels. */ 6472 spdk_spin_lock(&bdev->internal.spinlock); 6473 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6474 spdk_spin_unlock(&bdev->internal.spinlock); 6475 6476 /* Then iterate and add the statistics from each existing channel. 
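 * Each channel's counters are read on the thread that owns that channel via
 * spdk_bdev_for_each_channel(), and the user's callback fires once after every
 * channel has been visited.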
*/ 6477 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6478 bdev_get_device_stat_done); 6479 } 6480 6481 struct bdev_iostat_reset_ctx { 6482 enum spdk_bdev_reset_stat_mode mode; 6483 bdev_reset_device_stat_cb cb; 6484 void *cb_arg; 6485 }; 6486 6487 static void 6488 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6489 { 6490 struct bdev_iostat_reset_ctx *ctx = _ctx; 6491 6492 ctx->cb(bdev, ctx->cb_arg, 0); 6493 6494 free(ctx); 6495 } 6496 6497 static void 6498 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6499 struct spdk_io_channel *ch, void *_ctx) 6500 { 6501 struct bdev_iostat_reset_ctx *ctx = _ctx; 6502 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6503 6504 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6505 6506 spdk_bdev_for_each_channel_continue(i, 0); 6507 } 6508 6509 void 6510 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6511 bdev_reset_device_stat_cb cb, void *cb_arg) 6512 { 6513 struct bdev_iostat_reset_ctx *ctx; 6514 6515 assert(bdev != NULL); 6516 assert(cb != NULL); 6517 6518 ctx = calloc(1, sizeof(*ctx)); 6519 if (ctx == NULL) { 6520 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6521 cb(bdev, cb_arg, -ENOMEM); 6522 return; 6523 } 6524 6525 ctx->mode = mode; 6526 ctx->cb = cb; 6527 ctx->cb_arg = cb_arg; 6528 6529 spdk_spin_lock(&bdev->internal.spinlock); 6530 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6531 spdk_spin_unlock(&bdev->internal.spinlock); 6532 6533 spdk_bdev_for_each_channel(bdev, 6534 bdev_reset_each_channel_stat, 6535 ctx, 6536 bdev_reset_device_stat_done); 6537 } 6538 6539 int 6540 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6541 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6542 spdk_bdev_io_completion_cb cb, void *cb_arg) 6543 { 6544 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6545 struct spdk_bdev_io *bdev_io; 6546 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6547 6548 if (!desc->write) { 6549 return -EBADF; 6550 } 6551 6552 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6553 return -ENOTSUP; 6554 } 6555 6556 bdev_io = bdev_channel_get_io(channel); 6557 if (!bdev_io) { 6558 return -ENOMEM; 6559 } 6560 6561 bdev_io->internal.ch = channel; 6562 bdev_io->internal.desc = desc; 6563 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6564 bdev_io->u.nvme_passthru.cmd = *cmd; 6565 bdev_io->u.nvme_passthru.buf = buf; 6566 bdev_io->u.nvme_passthru.nbytes = nbytes; 6567 bdev_io->u.nvme_passthru.md_buf = NULL; 6568 bdev_io->u.nvme_passthru.md_len = 0; 6569 6570 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6571 6572 bdev_io_submit(bdev_io); 6573 return 0; 6574 } 6575 6576 int 6577 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6578 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6579 spdk_bdev_io_completion_cb cb, void *cb_arg) 6580 { 6581 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6582 struct spdk_bdev_io *bdev_io; 6583 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6584 6585 if (!desc->write) { 6586 /* 6587 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6588 * to easily determine if the command is a read or write, but for now just 6589 * do not allow io_passthru with a read-only descriptor. 
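 *
 * In practice this means the descriptor must have been opened with write access
 * (the write flag of spdk_bdev_open_ext()) even for passthru commands that only
 * read data from the device.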
6590 */ 6591 return -EBADF; 6592 } 6593 6594 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6595 return -ENOTSUP; 6596 } 6597 6598 bdev_io = bdev_channel_get_io(channel); 6599 if (!bdev_io) { 6600 return -ENOMEM; 6601 } 6602 6603 bdev_io->internal.ch = channel; 6604 bdev_io->internal.desc = desc; 6605 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6606 bdev_io->u.nvme_passthru.cmd = *cmd; 6607 bdev_io->u.nvme_passthru.buf = buf; 6608 bdev_io->u.nvme_passthru.nbytes = nbytes; 6609 bdev_io->u.nvme_passthru.md_buf = NULL; 6610 bdev_io->u.nvme_passthru.md_len = 0; 6611 6612 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6613 6614 bdev_io_submit(bdev_io); 6615 return 0; 6616 } 6617 6618 int 6619 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6620 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6621 spdk_bdev_io_completion_cb cb, void *cb_arg) 6622 { 6623 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6624 struct spdk_bdev_io *bdev_io; 6625 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6626 6627 if (!desc->write) { 6628 /* 6629 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6630 * to easily determine if the command is a read or write, but for now just 6631 * do not allow io_passthru with a read-only descriptor. 6632 */ 6633 return -EBADF; 6634 } 6635 6636 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6637 return -ENOTSUP; 6638 } 6639 6640 bdev_io = bdev_channel_get_io(channel); 6641 if (!bdev_io) { 6642 return -ENOMEM; 6643 } 6644 6645 bdev_io->internal.ch = channel; 6646 bdev_io->internal.desc = desc; 6647 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6648 bdev_io->u.nvme_passthru.cmd = *cmd; 6649 bdev_io->u.nvme_passthru.buf = buf; 6650 bdev_io->u.nvme_passthru.nbytes = nbytes; 6651 bdev_io->u.nvme_passthru.md_buf = md_buf; 6652 bdev_io->u.nvme_passthru.md_len = md_len; 6653 6654 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6655 6656 bdev_io_submit(bdev_io); 6657 return 0; 6658 } 6659 6660 static void bdev_abort_retry(void *ctx); 6661 static void bdev_abort(struct spdk_bdev_io *parent_io); 6662 6663 static void 6664 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6665 { 6666 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6667 struct spdk_bdev_io *parent_io = cb_arg; 6668 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6669 6670 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6671 6672 spdk_bdev_free_io(bdev_io); 6673 6674 if (!success) { 6675 /* Check if the target I/O completed in the meantime. */ 6676 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6677 if (tmp_io == bio_to_abort) { 6678 break; 6679 } 6680 } 6681 6682 /* If the target I/O still exists, set the parent to failed. 
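 * If the walk falls off the end of io_submitted, the target I/O completed in the
 * meantime and the failed abort attempt is simply ignored.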
*/ 6683 if (tmp_io != NULL) { 6684 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6685 } 6686 } 6687 6688 parent_io->u.bdev.split_outstanding--; 6689 if (parent_io->u.bdev.split_outstanding == 0) { 6690 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6691 bdev_abort_retry(parent_io); 6692 } else { 6693 bdev_io_complete(parent_io); 6694 } 6695 } 6696 } 6697 6698 static int 6699 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6700 struct spdk_bdev_io *bio_to_abort, 6701 spdk_bdev_io_completion_cb cb, void *cb_arg) 6702 { 6703 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6704 struct spdk_bdev_io *bdev_io; 6705 6706 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6707 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6708 /* TODO: Abort reset or abort request. */ 6709 return -ENOTSUP; 6710 } 6711 6712 bdev_io = bdev_channel_get_io(channel); 6713 if (bdev_io == NULL) { 6714 return -ENOMEM; 6715 } 6716 6717 bdev_io->internal.ch = channel; 6718 bdev_io->internal.desc = desc; 6719 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6720 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6721 6722 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6723 assert(bdev_io_should_split(bio_to_abort)); 6724 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6725 6726 /* Parent abort request is not submitted directly, but to manage its 6727 * execution add it to the submitted list here. 6728 */ 6729 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6730 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6731 6732 bdev_abort(bdev_io); 6733 6734 return 0; 6735 } 6736 6737 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6738 6739 /* Submit the abort request to the underlying bdev module. */ 6740 bdev_io_submit(bdev_io); 6741 6742 return 0; 6743 } 6744 6745 static bool 6746 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6747 { 6748 struct spdk_bdev_io *iter; 6749 6750 TAILQ_FOREACH(iter, tailq, internal.link) { 6751 if (iter == bdev_io) { 6752 return true; 6753 } 6754 } 6755 6756 return false; 6757 } 6758 6759 static uint32_t 6760 _bdev_abort(struct spdk_bdev_io *parent_io) 6761 { 6762 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6763 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6764 void *bio_cb_arg; 6765 struct spdk_bdev_io *bio_to_abort; 6766 uint32_t matched_ios; 6767 int rc; 6768 6769 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6770 6771 /* matched_ios is returned and will be kept by the caller. 6772 * 6773 * This function will be used for two cases, 1) the same cb_arg is used for 6774 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6775 * Incrementing split_outstanding directly here may confuse readers especially 6776 * for the 1st case. 6777 * 6778 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6779 * works as expected. 6780 */ 6781 matched_ios = 0; 6782 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6783 6784 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6785 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6786 continue; 6787 } 6788 6789 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6790 /* Any I/O which was submitted after this abort command should be excluded. 
*/ 6791 continue; 6792 } 6793 6794 /* We can't abort a request that's being pushed/pulled or executed by accel */ 6795 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 6796 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 6797 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6798 break; 6799 } 6800 6801 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6802 if (rc != 0) { 6803 if (rc == -ENOMEM) { 6804 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6805 } else { 6806 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6807 } 6808 break; 6809 } 6810 matched_ios++; 6811 } 6812 6813 return matched_ios; 6814 } 6815 6816 static void 6817 bdev_abort_retry(void *ctx) 6818 { 6819 struct spdk_bdev_io *parent_io = ctx; 6820 uint32_t matched_ios; 6821 6822 matched_ios = _bdev_abort(parent_io); 6823 6824 if (matched_ios == 0) { 6825 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6826 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6827 } else { 6828 /* For retry, the case that no target I/O was found is success 6829 * because it means target I/Os completed in the meantime. 6830 */ 6831 bdev_io_complete(parent_io); 6832 } 6833 return; 6834 } 6835 6836 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6837 parent_io->u.bdev.split_outstanding = matched_ios; 6838 } 6839 6840 static void 6841 bdev_abort(struct spdk_bdev_io *parent_io) 6842 { 6843 uint32_t matched_ios; 6844 6845 matched_ios = _bdev_abort(parent_io); 6846 6847 if (matched_ios == 0) { 6848 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6849 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6850 } else { 6851 /* The case the no target I/O was found is failure. */ 6852 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6853 bdev_io_complete(parent_io); 6854 } 6855 return; 6856 } 6857 6858 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6859 parent_io->u.bdev.split_outstanding = matched_ios; 6860 } 6861 6862 int 6863 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6864 void *bio_cb_arg, 6865 spdk_bdev_io_completion_cb cb, void *cb_arg) 6866 { 6867 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6868 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6869 struct spdk_bdev_io *bdev_io; 6870 6871 if (bio_cb_arg == NULL) { 6872 return -EINVAL; 6873 } 6874 6875 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6876 return -ENOTSUP; 6877 } 6878 6879 bdev_io = bdev_channel_get_io(channel); 6880 if (bdev_io == NULL) { 6881 return -ENOMEM; 6882 } 6883 6884 bdev_io->internal.ch = channel; 6885 bdev_io->internal.desc = desc; 6886 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6887 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6888 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6889 6890 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6891 6892 /* Parent abort request is not submitted directly, but to manage its execution, 6893 * add it to the submitted list here. 
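 *
 * Illustrative caller-side sketch (names are hypothetical): to cancel every I/O that
 * was submitted with a given context pointer io_ctx as its cb_arg, a caller would issue
 *
 *	rc = spdk_bdev_abort(desc, ch, io_ctx, abort_done_cb, NULL);
 *
 * and abort_done_cb then reports whether all of the matching I/O could be aborted.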
6894 */ 6895 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6896 6897 bdev_abort(bdev_io); 6898 6899 return 0; 6900 } 6901 6902 int 6903 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6904 struct spdk_bdev_io_wait_entry *entry) 6905 { 6906 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6907 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6908 6909 if (bdev != entry->bdev) { 6910 SPDK_ERRLOG("bdevs do not match\n"); 6911 return -EINVAL; 6912 } 6913 6914 if (mgmt_ch->per_thread_cache_count > 0) { 6915 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6916 return -EINVAL; 6917 } 6918 6919 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6920 return 0; 6921 } 6922 6923 static inline void 6924 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6925 { 6926 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6927 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6928 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6929 uint32_t blocklen = bdev_io->bdev->blocklen; 6930 6931 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6932 switch (bdev_io->type) { 6933 case SPDK_BDEV_IO_TYPE_READ: 6934 io_stat->bytes_read += num_blocks * blocklen; 6935 io_stat->num_read_ops++; 6936 io_stat->read_latency_ticks += tsc_diff; 6937 if (io_stat->max_read_latency_ticks < tsc_diff) { 6938 io_stat->max_read_latency_ticks = tsc_diff; 6939 } 6940 if (io_stat->min_read_latency_ticks > tsc_diff) { 6941 io_stat->min_read_latency_ticks = tsc_diff; 6942 } 6943 break; 6944 case SPDK_BDEV_IO_TYPE_WRITE: 6945 io_stat->bytes_written += num_blocks * blocklen; 6946 io_stat->num_write_ops++; 6947 io_stat->write_latency_ticks += tsc_diff; 6948 if (io_stat->max_write_latency_ticks < tsc_diff) { 6949 io_stat->max_write_latency_ticks = tsc_diff; 6950 } 6951 if (io_stat->min_write_latency_ticks > tsc_diff) { 6952 io_stat->min_write_latency_ticks = tsc_diff; 6953 } 6954 break; 6955 case SPDK_BDEV_IO_TYPE_UNMAP: 6956 io_stat->bytes_unmapped += num_blocks * blocklen; 6957 io_stat->num_unmap_ops++; 6958 io_stat->unmap_latency_ticks += tsc_diff; 6959 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6960 io_stat->max_unmap_latency_ticks = tsc_diff; 6961 } 6962 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6963 io_stat->min_unmap_latency_ticks = tsc_diff; 6964 } 6965 break; 6966 case SPDK_BDEV_IO_TYPE_ZCOPY: 6967 /* Track the data in the start phase only */ 6968 if (bdev_io->u.bdev.zcopy.start) { 6969 if (bdev_io->u.bdev.zcopy.populate) { 6970 io_stat->bytes_read += num_blocks * blocklen; 6971 io_stat->num_read_ops++; 6972 io_stat->read_latency_ticks += tsc_diff; 6973 if (io_stat->max_read_latency_ticks < tsc_diff) { 6974 io_stat->max_read_latency_ticks = tsc_diff; 6975 } 6976 if (io_stat->min_read_latency_ticks > tsc_diff) { 6977 io_stat->min_read_latency_ticks = tsc_diff; 6978 } 6979 } else { 6980 io_stat->bytes_written += num_blocks * blocklen; 6981 io_stat->num_write_ops++; 6982 io_stat->write_latency_ticks += tsc_diff; 6983 if (io_stat->max_write_latency_ticks < tsc_diff) { 6984 io_stat->max_write_latency_ticks = tsc_diff; 6985 } 6986 if (io_stat->min_write_latency_ticks > tsc_diff) { 6987 io_stat->min_write_latency_ticks = tsc_diff; 6988 } 6989 } 6990 } 6991 break; 6992 case SPDK_BDEV_IO_TYPE_COPY: 6993 io_stat->bytes_copied += num_blocks * blocklen; 6994 io_stat->num_copy_ops++; 6995 bdev_io->internal.ch->stat->copy_latency_ticks += 
tsc_diff; 6996 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6997 io_stat->max_copy_latency_ticks = tsc_diff; 6998 } 6999 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7000 io_stat->min_copy_latency_ticks = tsc_diff; 7001 } 7002 break; 7003 default: 7004 break; 7005 } 7006 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7007 io_stat = bdev_io->bdev->internal.stat; 7008 assert(io_stat->io_error != NULL); 7009 7010 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7011 io_stat->io_error->error_status[-io_status - 1]++; 7012 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7013 } 7014 7015 #ifdef SPDK_CONFIG_VTUNE 7016 uint64_t now_tsc = spdk_get_ticks(); 7017 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7018 uint64_t data[5]; 7019 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7020 7021 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7022 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7023 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7024 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7025 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 7026 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7027 7028 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7029 __itt_metadata_u64, 5, data); 7030 7031 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7032 bdev_io->internal.ch->start_tsc = now_tsc; 7033 } 7034 #endif 7035 } 7036 7037 static inline void 7038 _bdev_io_complete(void *ctx) 7039 { 7040 struct spdk_bdev_io *bdev_io = ctx; 7041 7042 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 7043 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7044 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7045 } 7046 7047 assert(bdev_io->internal.cb != NULL); 7048 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7049 7050 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7051 bdev_io->internal.caller_ctx); 7052 } 7053 7054 static inline void 7055 bdev_io_complete(void *ctx) 7056 { 7057 struct spdk_bdev_io *bdev_io = ctx; 7058 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7059 uint64_t tsc, tsc_diff; 7060 7061 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 7062 /* 7063 * Defer completion to avoid potential infinite recursion if the 7064 * user's completion callback issues a new I/O. 7065 */ 7066 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7067 bdev_io_complete, bdev_io); 7068 return; 7069 } 7070 7071 tsc = spdk_get_ticks(); 7072 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7073 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 7074 bdev_io->internal.caller_ctx); 7075 7076 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 7077 7078 if (bdev_io->internal.ch->histogram) { 7079 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 7080 } 7081 7082 bdev_io_update_io_stat(bdev_io, tsc_diff); 7083 _bdev_io_complete(bdev_io); 7084 } 7085 7086 /* The difference between this function and bdev_io_complete() is that this should be called to 7087 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7088 * io_submitted list and don't have submit_tsc updated. 
7089 */ 7090 static inline void 7091 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7092 { 7093 /* Since the IO hasn't been submitted it's bound to be failed */ 7094 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7095 7096 /* At this point we don't know if the IO is completed from submission context or not, but, 7097 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7098 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7099 _bdev_io_complete, bdev_io); 7100 } 7101 7102 static void bdev_destroy_cb(void *io_device); 7103 7104 static void 7105 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7106 { 7107 struct spdk_bdev_io *bdev_io = _ctx; 7108 7109 if (bdev_io->u.reset.ch_ref != NULL) { 7110 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7111 bdev_io->u.reset.ch_ref = NULL; 7112 } 7113 7114 bdev_io_complete(bdev_io); 7115 7116 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7117 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7118 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7119 } 7120 } 7121 7122 static void 7123 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7124 struct spdk_io_channel *_ch, void *_ctx) 7125 { 7126 struct spdk_bdev_io *bdev_io = _ctx; 7127 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7128 struct spdk_bdev_io *queued_reset; 7129 7130 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7131 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7132 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7133 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7134 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7135 } 7136 7137 spdk_bdev_for_each_channel_continue(i, 0); 7138 } 7139 7140 static void 7141 bdev_io_complete_sequence_cb(void *ctx, int status) 7142 { 7143 struct spdk_bdev_io *bdev_io = ctx; 7144 7145 /* u.bdev.accel_sequence should have already been cleared at this point */ 7146 assert(bdev_io->u.bdev.accel_sequence == NULL); 7147 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7148 bdev_io->internal.accel_sequence = NULL; 7149 7150 if (spdk_unlikely(status != 0)) { 7151 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7152 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7153 } 7154 7155 bdev_io_complete(bdev_io); 7156 } 7157 7158 void 7159 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7160 { 7161 struct spdk_bdev *bdev = bdev_io->bdev; 7162 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7163 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7164 7165 if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) { 7166 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7167 spdk_bdev_get_module_name(bdev), 7168 bdev_io_status_get_string(bdev_io->internal.status)); 7169 assert(false); 7170 } 7171 bdev_io->internal.status = status; 7172 7173 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7174 bool unlock_channels = false; 7175 7176 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7177 SPDK_ERRLOG("NOMEM returned for reset\n"); 7178 } 7179 spdk_spin_lock(&bdev->internal.spinlock); 7180 if (bdev_io == bdev->internal.reset_in_progress) { 7181 bdev->internal.reset_in_progress = NULL; 7182 unlock_channels = true; 7183 } 7184 spdk_spin_unlock(&bdev->internal.spinlock); 7185 7186 if (unlock_channels) { 7187 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 
7188 bdev_reset_complete); 7189 return; 7190 } 7191 } else { 7192 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7193 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7194 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7195 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7196 return; 7197 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0 && 7198 !bdev_io_use_accel_sequence(bdev_io))) { 7199 _bdev_io_push_bounce_data_buffer(bdev_io, 7200 _bdev_io_complete_push_bounce_done); 7201 /* bdev IO will be completed in the callback */ 7202 return; 7203 } 7204 } 7205 7206 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7207 return; 7208 } 7209 } 7210 7211 bdev_io_complete(bdev_io); 7212 } 7213 7214 void 7215 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7216 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7217 { 7218 enum spdk_bdev_io_status status; 7219 7220 if (sc == SPDK_SCSI_STATUS_GOOD) { 7221 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7222 } else { 7223 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7224 bdev_io->internal.error.scsi.sc = sc; 7225 bdev_io->internal.error.scsi.sk = sk; 7226 bdev_io->internal.error.scsi.asc = asc; 7227 bdev_io->internal.error.scsi.ascq = ascq; 7228 } 7229 7230 spdk_bdev_io_complete(bdev_io, status); 7231 } 7232 7233 void 7234 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7235 int *sc, int *sk, int *asc, int *ascq) 7236 { 7237 assert(sc != NULL); 7238 assert(sk != NULL); 7239 assert(asc != NULL); 7240 assert(ascq != NULL); 7241 7242 switch (bdev_io->internal.status) { 7243 case SPDK_BDEV_IO_STATUS_SUCCESS: 7244 *sc = SPDK_SCSI_STATUS_GOOD; 7245 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7246 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7247 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7248 break; 7249 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7250 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7251 break; 7252 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7253 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7254 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7255 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7256 *ascq = bdev_io->internal.error.scsi.ascq; 7257 break; 7258 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7259 *sc = bdev_io->internal.error.scsi.sc; 7260 *sk = bdev_io->internal.error.scsi.sk; 7261 *asc = bdev_io->internal.error.scsi.asc; 7262 *ascq = bdev_io->internal.error.scsi.ascq; 7263 break; 7264 default: 7265 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7266 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7267 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7268 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7269 break; 7270 } 7271 } 7272 7273 void 7274 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7275 { 7276 enum spdk_bdev_io_status status; 7277 7278 if (aio_result == 0) { 7279 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7280 } else { 7281 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7282 } 7283 7284 bdev_io->internal.error.aio_result = aio_result; 7285 7286 spdk_bdev_io_complete(bdev_io, status); 7287 } 7288 7289 void 7290 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7291 { 7292 assert(aio_result != NULL); 7293 7294 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7295 *aio_result = bdev_io->internal.error.aio_result; 7296 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7297 *aio_result = 0; 7298 } else { 7299 *aio_result = -EIO; 7300 } 7301 } 7302 7303 void 
7304 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7305 { 7306 enum spdk_bdev_io_status status; 7307 7308 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 7309 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7310 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7311 status = SPDK_BDEV_IO_STATUS_ABORTED; 7312 } else { 7313 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7314 } 7315 7316 bdev_io->internal.error.nvme.cdw0 = cdw0; 7317 bdev_io->internal.error.nvme.sct = sct; 7318 bdev_io->internal.error.nvme.sc = sc; 7319 7320 spdk_bdev_io_complete(bdev_io, status); 7321 } 7322 7323 void 7324 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7325 { 7326 assert(sct != NULL); 7327 assert(sc != NULL); 7328 assert(cdw0 != NULL); 7329 7330 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7331 *sct = SPDK_NVME_SCT_GENERIC; 7332 *sc = SPDK_NVME_SC_SUCCESS; 7333 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7334 *cdw0 = 0; 7335 } else { 7336 *cdw0 = 1U; 7337 } 7338 return; 7339 } 7340 7341 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7342 *sct = bdev_io->internal.error.nvme.sct; 7343 *sc = bdev_io->internal.error.nvme.sc; 7344 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7345 *sct = SPDK_NVME_SCT_GENERIC; 7346 *sc = SPDK_NVME_SC_SUCCESS; 7347 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7348 *sct = SPDK_NVME_SCT_GENERIC; 7349 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7350 } else { 7351 *sct = SPDK_NVME_SCT_GENERIC; 7352 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7353 } 7354 7355 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7356 } 7357 7358 void 7359 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7360 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7361 { 7362 assert(first_sct != NULL); 7363 assert(first_sc != NULL); 7364 assert(second_sct != NULL); 7365 assert(second_sc != NULL); 7366 assert(cdw0 != NULL); 7367 7368 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7369 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7370 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7371 *first_sct = bdev_io->internal.error.nvme.sct; 7372 *first_sc = bdev_io->internal.error.nvme.sc; 7373 *second_sct = SPDK_NVME_SCT_GENERIC; 7374 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7375 } else { 7376 *first_sct = SPDK_NVME_SCT_GENERIC; 7377 *first_sc = SPDK_NVME_SC_SUCCESS; 7378 *second_sct = bdev_io->internal.error.nvme.sct; 7379 *second_sc = bdev_io->internal.error.nvme.sc; 7380 } 7381 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7382 *first_sct = SPDK_NVME_SCT_GENERIC; 7383 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7384 *second_sct = SPDK_NVME_SCT_GENERIC; 7385 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7386 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7387 *first_sct = SPDK_NVME_SCT_GENERIC; 7388 *first_sc = SPDK_NVME_SC_SUCCESS; 7389 *second_sct = SPDK_NVME_SCT_GENERIC; 7390 *second_sc = SPDK_NVME_SC_SUCCESS; 7391 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7392 *first_sct = SPDK_NVME_SCT_GENERIC; 7393 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7394 *second_sct = SPDK_NVME_SCT_GENERIC; 7395 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7396 } else if (bdev_io->internal.status == 
SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7397 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7398 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7399 *second_sct = SPDK_NVME_SCT_GENERIC; 7400 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7401 } else { 7402 *first_sct = SPDK_NVME_SCT_GENERIC; 7403 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7404 *second_sct = SPDK_NVME_SCT_GENERIC; 7405 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7406 } 7407 7408 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7409 } 7410 7411 struct spdk_thread * 7412 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7413 { 7414 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7415 } 7416 7417 struct spdk_io_channel * 7418 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7419 { 7420 return bdev_io->internal.ch->channel; 7421 } 7422 7423 static int 7424 bdev_register(struct spdk_bdev *bdev) 7425 { 7426 char *bdev_name; 7427 char uuid[SPDK_UUID_STRING_LEN]; 7428 struct spdk_iobuf_opts iobuf_opts; 7429 int ret, i; 7430 7431 assert(bdev->module != NULL); 7432 7433 if (!bdev->name) { 7434 SPDK_ERRLOG("Bdev name is NULL\n"); 7435 return -EINVAL; 7436 } 7437 7438 if (!strlen(bdev->name)) { 7439 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7440 return -EINVAL; 7441 } 7442 7443 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7444 if (bdev->fn_table->accel_sequence_supported == NULL) { 7445 continue; 7446 } 7447 if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7448 (enum spdk_bdev_io_type)i)) { 7449 continue; 7450 } 7451 7452 if (spdk_bdev_is_md_separate(bdev)) { 7453 SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with " 7454 "accel sequence support\n"); 7455 return -EINVAL; 7456 } 7457 } 7458 7459 /* Users often register their own I/O devices using the bdev name. In 7460 * order to avoid conflicts, prepend bdev_. */ 7461 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7462 if (!bdev_name) { 7463 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7464 return -ENOMEM; 7465 } 7466 7467 bdev->internal.stat = bdev_alloc_io_stat(true); 7468 if (!bdev->internal.stat) { 7469 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7470 free(bdev_name); 7471 return -ENOMEM; 7472 } 7473 7474 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7475 bdev->internal.measured_queue_depth = UINT64_MAX; 7476 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7477 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7478 bdev->internal.qd_poller = NULL; 7479 bdev->internal.qos = NULL; 7480 7481 TAILQ_INIT(&bdev->internal.open_descs); 7482 TAILQ_INIT(&bdev->internal.locked_ranges); 7483 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7484 TAILQ_INIT(&bdev->aliases); 7485 7486 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7487 if (ret != 0) { 7488 bdev_free_io_stat(bdev->internal.stat); 7489 free(bdev_name); 7490 return ret; 7491 } 7492 7493 /* UUID may be specified by the user or defined by bdev itself. 7494 * Otherwise it will be generated here, so this field will never be empty. 
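 * The resulting UUID (whether supplied or generated) is also registered below as a
 * bdev alias whenever its string form differs from the bdev name.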
*/ 7495 if (spdk_uuid_is_null(&bdev->uuid)) { 7496 spdk_uuid_generate(&bdev->uuid); 7497 } 7498 7499 /* Add the UUID alias only if it's different than the name */ 7500 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7501 if (strcmp(bdev->name, uuid) != 0) { 7502 ret = spdk_bdev_alias_add(bdev, uuid); 7503 if (ret != 0) { 7504 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7505 bdev_name_del(&bdev->internal.bdev_name); 7506 bdev_free_io_stat(bdev->internal.stat); 7507 free(bdev_name); 7508 return ret; 7509 } 7510 } 7511 7512 spdk_iobuf_get_opts(&iobuf_opts); 7513 if (spdk_bdev_get_buf_align(bdev) > 1) { 7514 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 7515 iobuf_opts.large_bufsize / bdev->blocklen); 7516 } 7517 7518 /* If the user didn't specify a write unit size, set it to one. */ 7519 if (bdev->write_unit_size == 0) { 7520 bdev->write_unit_size = 1; 7521 } 7522 7523 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7524 if (bdev->acwu == 0) { 7525 bdev->acwu = bdev->write_unit_size; 7526 } 7527 7528 if (bdev->phys_blocklen == 0) { 7529 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7530 } 7531 7532 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7533 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7534 } 7535 7536 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7537 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7538 } 7539 7540 bdev->internal.reset_in_progress = NULL; 7541 bdev->internal.qd_poll_in_progress = false; 7542 bdev->internal.period = 0; 7543 bdev->internal.new_period = 0; 7544 7545 spdk_io_device_register(__bdev_to_io_dev(bdev), 7546 bdev_channel_create, bdev_channel_destroy, 7547 sizeof(struct spdk_bdev_channel), 7548 bdev_name); 7549 7550 free(bdev_name); 7551 7552 spdk_spin_init(&bdev->internal.spinlock); 7553 7554 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7555 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7556 7557 return 0; 7558 } 7559 7560 static void 7561 bdev_destroy_cb(void *io_device) 7562 { 7563 int rc; 7564 struct spdk_bdev *bdev; 7565 spdk_bdev_unregister_cb cb_fn; 7566 void *cb_arg; 7567 7568 bdev = __bdev_from_io_dev(io_device); 7569 7570 if (bdev->internal.unregister_td != spdk_get_thread()) { 7571 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7572 return; 7573 } 7574 7575 cb_fn = bdev->internal.unregister_cb; 7576 cb_arg = bdev->internal.unregister_ctx; 7577 7578 spdk_spin_destroy(&bdev->internal.spinlock); 7579 free(bdev->internal.qos); 7580 bdev_free_io_stat(bdev->internal.stat); 7581 7582 rc = bdev->fn_table->destruct(bdev->ctxt); 7583 if (rc < 0) { 7584 SPDK_ERRLOG("destruct failed\n"); 7585 } 7586 if (rc <= 0 && cb_fn != NULL) { 7587 cb_fn(cb_arg, rc); 7588 } 7589 } 7590 7591 void 7592 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7593 { 7594 if (bdev->internal.unregister_cb != NULL) { 7595 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7596 } 7597 } 7598 7599 static void 7600 _remove_notify(void *arg) 7601 { 7602 struct spdk_bdev_desc *desc = arg; 7603 7604 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7605 } 7606 7607 /* returns: 0 - bdev removed and ready to be destructed. 7608 * -EBUSY - bdev can't be destructed yet. 
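 * -EBUSY is returned while descriptors are still open (each of them is sent a
 * deferred SPDK_BDEV_EVENT_REMOVE notification) or while a reset is still in
 * progress; in those cases the last spdk_bdev_close() or the reset completion
 * finishes the teardown.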
*/ 7609 static int 7610 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7611 { 7612 struct spdk_bdev_desc *desc, *tmp; 7613 int rc = 0; 7614 char uuid[SPDK_UUID_STRING_LEN]; 7615 7616 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7617 assert(spdk_spin_held(&bdev->internal.spinlock)); 7618 7619 /* Notify each descriptor about hotremoval */ 7620 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7621 rc = -EBUSY; 7622 /* 7623 * Defer invocation of the event_cb to a separate message that will 7624 * run later on its thread. This ensures this context unwinds and 7625 * we don't recursively unregister this bdev again if the event_cb 7626 * immediately closes its descriptor. 7627 */ 7628 event_notify(desc, _remove_notify); 7629 } 7630 7631 /* If there are no descriptors, proceed removing the bdev */ 7632 if (rc == 0) { 7633 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7634 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7635 7636 /* Delete the name and the UUID alias */ 7637 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7638 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7639 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7640 7641 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7642 7643 if (bdev->internal.reset_in_progress != NULL) { 7644 /* If reset is in progress, let the completion callback for reset 7645 * unregister the bdev. 7646 */ 7647 rc = -EBUSY; 7648 } 7649 } 7650 7651 return rc; 7652 } 7653 7654 static void 7655 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7656 struct spdk_io_channel *io_ch, void *_ctx) 7657 { 7658 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7659 7660 bdev_channel_abort_queued_ios(bdev_ch); 7661 spdk_bdev_for_each_channel_continue(i, 0); 7662 } 7663 7664 static void 7665 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7666 { 7667 int rc; 7668 7669 spdk_spin_lock(&g_bdev_mgr.spinlock); 7670 spdk_spin_lock(&bdev->internal.spinlock); 7671 /* 7672 * Set the status to REMOVING after completing to abort channels. Otherwise, 7673 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7674 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7675 * may fail. 7676 */ 7677 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7678 rc = bdev_unregister_unsafe(bdev); 7679 spdk_spin_unlock(&bdev->internal.spinlock); 7680 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7681 7682 if (rc == 0) { 7683 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7684 } 7685 } 7686 7687 void 7688 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7689 { 7690 struct spdk_thread *thread; 7691 7692 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7693 7694 thread = spdk_get_thread(); 7695 if (!thread) { 7696 /* The user called this from a non-SPDK thread. 
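 * Unregistration is only supported from an SPDK thread. A non-SPDK caller can
 * defer the call, e.g. (illustrative sketch; "app_thread" stands for whichever
 * SPDK thread the application owns):
 *
 *   spdk_thread_send_msg(app_thread, unregister_msg, bdev);
 *
 * where unregister_msg() is a small wrapper that calls spdk_bdev_unregister()
 * with the desired callback.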
*/ 7697 if (cb_fn != NULL) { 7698 cb_fn(cb_arg, -ENOTSUP); 7699 } 7700 return; 7701 } 7702 7703 spdk_spin_lock(&g_bdev_mgr.spinlock); 7704 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7705 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7706 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7707 if (cb_fn) { 7708 cb_fn(cb_arg, -EBUSY); 7709 } 7710 return; 7711 } 7712 7713 spdk_spin_lock(&bdev->internal.spinlock); 7714 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7715 bdev->internal.unregister_cb = cb_fn; 7716 bdev->internal.unregister_ctx = cb_arg; 7717 bdev->internal.unregister_td = thread; 7718 spdk_spin_unlock(&bdev->internal.spinlock); 7719 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7720 7721 spdk_bdev_set_qd_sampling_period(bdev, 0); 7722 7723 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7724 bdev_unregister); 7725 } 7726 7727 int 7728 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7729 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7730 { 7731 struct spdk_bdev_desc *desc; 7732 struct spdk_bdev *bdev; 7733 int rc; 7734 7735 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7736 if (rc != 0) { 7737 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7738 return rc; 7739 } 7740 7741 bdev = spdk_bdev_desc_get_bdev(desc); 7742 7743 if (bdev->module != module) { 7744 spdk_bdev_close(desc); 7745 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7746 bdev_name); 7747 return -ENODEV; 7748 } 7749 7750 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7751 7752 spdk_bdev_close(desc); 7753 7754 return 0; 7755 } 7756 7757 static int 7758 bdev_start_qos(struct spdk_bdev *bdev) 7759 { 7760 struct set_qos_limit_ctx *ctx; 7761 7762 /* Enable QoS */ 7763 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7764 ctx = calloc(1, sizeof(*ctx)); 7765 if (ctx == NULL) { 7766 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7767 return -ENOMEM; 7768 } 7769 ctx->bdev = bdev; 7770 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7771 } 7772 7773 return 0; 7774 } 7775 7776 static void 7777 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7778 struct spdk_bdev *bdev) 7779 { 7780 enum spdk_bdev_claim_type type; 7781 const char *typename, *modname; 7782 extern struct spdk_log_flag SPDK_LOG_bdev; 7783 7784 assert(spdk_spin_held(&bdev->internal.spinlock)); 7785 7786 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7787 return; 7788 } 7789 7790 type = bdev->internal.claim_type; 7791 typename = spdk_bdev_claim_get_name(type); 7792 7793 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7794 modname = bdev->internal.claim.v1.module->name; 7795 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7796 bdev->name, detail, typename, modname); 7797 return; 7798 } 7799 7800 if (claim_type_is_v2(type)) { 7801 struct spdk_bdev_module_claim *claim; 7802 7803 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7804 modname = claim->module->name; 7805 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7806 bdev->name, detail, typename, modname); 7807 } 7808 return; 7809 } 7810 7811 assert(false); 7812 } 7813 7814 static int 7815 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7816 { 7817 struct spdk_thread *thread; 7818 int rc = 0; 7819 7820 thread = spdk_get_thread(); 7821 if (!thread) { 7822 
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7823 return -ENOTSUP; 7824 } 7825 7826 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7827 spdk_get_thread()); 7828 7829 desc->bdev = bdev; 7830 desc->thread = thread; 7831 desc->write = write; 7832 7833 spdk_spin_lock(&bdev->internal.spinlock); 7834 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7835 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7836 spdk_spin_unlock(&bdev->internal.spinlock); 7837 return -ENODEV; 7838 } 7839 7840 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7841 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7842 spdk_spin_unlock(&bdev->internal.spinlock); 7843 return -EPERM; 7844 } 7845 7846 rc = bdev_start_qos(bdev); 7847 if (rc != 0) { 7848 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7849 spdk_spin_unlock(&bdev->internal.spinlock); 7850 return rc; 7851 } 7852 7853 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7854 7855 spdk_spin_unlock(&bdev->internal.spinlock); 7856 7857 return 0; 7858 } 7859 7860 static int 7861 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7862 struct spdk_bdev_desc **_desc) 7863 { 7864 struct spdk_bdev_desc *desc; 7865 unsigned int i; 7866 7867 desc = calloc(1, sizeof(*desc)); 7868 if (desc == NULL) { 7869 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7870 return -ENOMEM; 7871 } 7872 7873 TAILQ_INIT(&desc->pending_media_events); 7874 TAILQ_INIT(&desc->free_media_events); 7875 7876 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7877 desc->callback.event_fn = event_cb; 7878 desc->callback.ctx = event_ctx; 7879 spdk_spin_init(&desc->spinlock); 7880 7881 if (bdev->media_events) { 7882 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7883 sizeof(*desc->media_events_buffer)); 7884 if (desc->media_events_buffer == NULL) { 7885 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7886 bdev_desc_free(desc); 7887 return -ENOMEM; 7888 } 7889 7890 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 7891 TAILQ_INSERT_TAIL(&desc->free_media_events, 7892 &desc->media_events_buffer[i], tailq); 7893 } 7894 } 7895 7896 if (bdev->fn_table->accel_sequence_supported != NULL) { 7897 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7898 desc->accel_sequence_supported[i] = 7899 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7900 (enum spdk_bdev_io_type)i); 7901 } 7902 } 7903 7904 *_desc = desc; 7905 7906 return 0; 7907 } 7908 7909 static int 7910 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7911 void *event_ctx, struct spdk_bdev_desc **_desc) 7912 { 7913 struct spdk_bdev_desc *desc; 7914 struct spdk_bdev *bdev; 7915 int rc; 7916 7917 bdev = bdev_get_by_name(bdev_name); 7918 7919 if (bdev == NULL) { 7920 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7921 return -ENODEV; 7922 } 7923 7924 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7925 if (rc != 0) { 7926 return rc; 7927 } 7928 7929 rc = bdev_open(bdev, write, desc); 7930 if (rc != 0) { 7931 bdev_desc_free(desc); 7932 desc = NULL; 7933 } 7934 7935 *_desc = desc; 7936 7937 return rc; 7938 } 7939 7940 int 7941 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7942 void *event_ctx, struct spdk_bdev_desc **_desc) 7943 { 7944 int rc; 7945 7946 if (event_cb == NULL) { 7947 SPDK_ERRLOG("Missing event callback function\n"); 7948 return 
-EINVAL; 7949 } 7950 7951 spdk_spin_lock(&g_bdev_mgr.spinlock); 7952 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 7953 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7954 7955 return rc; 7956 } 7957 7958 struct spdk_bdev_open_async_ctx { 7959 char *bdev_name; 7960 spdk_bdev_event_cb_t event_cb; 7961 void *event_ctx; 7962 bool write; 7963 int rc; 7964 spdk_bdev_open_async_cb_t cb_fn; 7965 void *cb_arg; 7966 struct spdk_bdev_desc *desc; 7967 struct spdk_bdev_open_async_opts opts; 7968 uint64_t start_ticks; 7969 struct spdk_thread *orig_thread; 7970 struct spdk_poller *poller; 7971 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 7972 }; 7973 7974 static void 7975 bdev_open_async_done(void *arg) 7976 { 7977 struct spdk_bdev_open_async_ctx *ctx = arg; 7978 7979 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 7980 7981 free(ctx->bdev_name); 7982 free(ctx); 7983 } 7984 7985 static void 7986 bdev_open_async_cancel(void *arg) 7987 { 7988 struct spdk_bdev_open_async_ctx *ctx = arg; 7989 7990 assert(ctx->rc == -ESHUTDOWN); 7991 7992 spdk_poller_unregister(&ctx->poller); 7993 7994 bdev_open_async_done(ctx); 7995 } 7996 7997 /* This is called when the bdev library is shut down. */ 7998 static void 7999 bdev_open_async_fini(void) 8000 { 8001 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8002 8003 spdk_spin_lock(&g_bdev_mgr.spinlock); 8004 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8005 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8006 /* 8007 * We have to move to ctx->orig_thread to unregister ctx->poller. 8008 * However, there is a chance that ctx->poller is executed before 8009 * the message is executed, which could result in bdev_open_async_done() 8010 * being called twice. To avoid such a race condition, set ctx->rc to 8011 * -ESHUTDOWN. 8012 */ 8013 ctx->rc = -ESHUTDOWN; 8014 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8015 } 8016 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8017 } 8018 8019 static int bdev_open_async(void *arg); 8020 8021 static void 8022 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8023 { 8024 uint64_t timeout_ticks; 8025 8026 if (ctx->rc == -ESHUTDOWN) { 8027 /* This context is being canceled. Do nothing. */ 8028 return; 8029 } 8030 8031 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8032 &ctx->desc); 8033 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8034 goto exit; 8035 } 8036 8037 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8038 if (spdk_get_ticks() >= timeout_ticks) { 8039 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8040 ctx->rc = -ETIMEDOUT; 8041 goto exit; 8042 } 8043 8044 return; 8045 8046 exit: 8047 spdk_poller_unregister(&ctx->poller); 8048 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8049 8050 /* Completion callback is processed after stack unwinding.
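 * That is, the open callback is sent as a message to ctx->orig_thread instead of
 * being invoked here, since this function runs with g_bdev_mgr.spinlock held and
 * may be executing from the poller context.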
*/ 8051 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8052 } 8053 8054 static int 8055 bdev_open_async(void *arg) 8056 { 8057 struct spdk_bdev_open_async_ctx *ctx = arg; 8058 8059 spdk_spin_lock(&g_bdev_mgr.spinlock); 8060 8061 _bdev_open_async(ctx); 8062 8063 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8064 8065 return SPDK_POLLER_BUSY; 8066 } 8067 8068 static void 8069 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8070 struct spdk_bdev_open_async_opts *opts_src, 8071 size_t size) 8072 { 8073 assert(opts); 8074 assert(opts_src); 8075 8076 opts->size = size; 8077 8078 #define SET_FIELD(field) \ 8079 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8080 opts->field = opts_src->field; \ 8081 } \ 8082 8083 SET_FIELD(timeout_ms); 8084 8085 /* Do not remove this statement, you should always update this statement when you add a new field, 8086 * and do not forget to add the SET_FIELD statement for your added field. */ 8087 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8088 8089 #undef SET_FIELD 8090 } 8091 8092 static void 8093 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8094 { 8095 assert(opts); 8096 8097 opts->size = size; 8098 8099 #define SET_FIELD(field, value) \ 8100 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8101 opts->field = value; \ 8102 } \ 8103 8104 SET_FIELD(timeout_ms, 0); 8105 8106 #undef SET_FIELD 8107 } 8108 8109 int 8110 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8111 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8112 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8113 { 8114 struct spdk_bdev_open_async_ctx *ctx; 8115 8116 if (event_cb == NULL) { 8117 SPDK_ERRLOG("Missing event callback function\n"); 8118 return -EINVAL; 8119 } 8120 8121 if (open_cb == NULL) { 8122 SPDK_ERRLOG("Missing open callback function\n"); 8123 return -EINVAL; 8124 } 8125 8126 if (opts != NULL && opts->size == 0) { 8127 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8128 return -EINVAL; 8129 } 8130 8131 ctx = calloc(1, sizeof(*ctx)); 8132 if (ctx == NULL) { 8133 SPDK_ERRLOG("Failed to allocate open context\n"); 8134 return -ENOMEM; 8135 } 8136 8137 ctx->bdev_name = strdup(bdev_name); 8138 if (ctx->bdev_name == NULL) { 8139 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8140 free(ctx); 8141 return -ENOMEM; 8142 } 8143 8144 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8145 if (ctx->poller == NULL) { 8146 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8147 free(ctx->bdev_name); 8148 free(ctx); 8149 return -ENOMEM; 8150 } 8151 8152 ctx->cb_fn = open_cb; 8153 ctx->cb_arg = open_cb_arg; 8154 ctx->write = write; 8155 ctx->event_cb = event_cb; 8156 ctx->event_ctx = event_ctx; 8157 ctx->orig_thread = spdk_get_thread(); 8158 ctx->start_ticks = spdk_get_ticks(); 8159 8160 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8161 if (opts != NULL) { 8162 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8163 } 8164 8165 spdk_spin_lock(&g_bdev_mgr.spinlock); 8166 8167 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8168 _bdev_open_async(ctx); 8169 8170 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8171 8172 return 0; 8173 } 8174 8175 static void 8176 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8177 { 8178 int rc; 8179 8180
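/* Detaches the descriptor from the bdev, frees it once its reference count drops
 * to zero, tears down QoS when the last descriptor is closed, and completes a
 * pending unregistration if the bdev is in the REMOVING state.
 */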
spdk_spin_lock(&bdev->internal.spinlock); 8181 spdk_spin_lock(&desc->spinlock); 8182 8183 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8184 8185 desc->closed = true; 8186 8187 if (desc->claim != NULL) { 8188 bdev_desc_release_claims(desc); 8189 } 8190 8191 if (0 == desc->refs) { 8192 spdk_spin_unlock(&desc->spinlock); 8193 bdev_desc_free(desc); 8194 } else { 8195 spdk_spin_unlock(&desc->spinlock); 8196 } 8197 8198 /* If no more descriptors, kill QoS channel */ 8199 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8200 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8201 bdev->name, spdk_get_thread()); 8202 8203 if (bdev_qos_destroy(bdev)) { 8204 /* There isn't anything we can do to recover here. Just let the 8205 * old QoS poller keep running. The QoS handling won't change 8206 * cores when the user allocates a new channel, but it won't break. */ 8207 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8208 } 8209 } 8210 8211 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8212 rc = bdev_unregister_unsafe(bdev); 8213 spdk_spin_unlock(&bdev->internal.spinlock); 8214 8215 if (rc == 0) { 8216 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8217 } 8218 } else { 8219 spdk_spin_unlock(&bdev->internal.spinlock); 8220 } 8221 } 8222 8223 void 8224 spdk_bdev_close(struct spdk_bdev_desc *desc) 8225 { 8226 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8227 8228 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8229 spdk_get_thread()); 8230 8231 assert(desc->thread == spdk_get_thread()); 8232 8233 spdk_poller_unregister(&desc->io_timeout_poller); 8234 8235 spdk_spin_lock(&g_bdev_mgr.spinlock); 8236 8237 bdev_close(bdev, desc); 8238 8239 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8240 } 8241 8242 static void 8243 bdev_register_finished(void *arg) 8244 { 8245 struct spdk_bdev_desc *desc = arg; 8246 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8247 8248 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8249 8250 spdk_spin_lock(&g_bdev_mgr.spinlock); 8251 8252 bdev_close(bdev, desc); 8253 8254 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8255 } 8256 8257 int 8258 spdk_bdev_register(struct spdk_bdev *bdev) 8259 { 8260 struct spdk_bdev_desc *desc; 8261 struct spdk_thread *thread = spdk_get_thread(); 8262 int rc; 8263 8264 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8265 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 8266 thread ? 
spdk_thread_get_name(thread) : "null"); 8267 return -EINVAL; 8268 } 8269 8270 rc = bdev_register(bdev); 8271 if (rc != 0) { 8272 return rc; 8273 } 8274 8275 /* A descriptor is opened to prevent bdev deletion during examination */ 8276 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8277 if (rc != 0) { 8278 spdk_bdev_unregister(bdev, NULL, NULL); 8279 return rc; 8280 } 8281 8282 rc = bdev_open(bdev, false, desc); 8283 if (rc != 0) { 8284 bdev_desc_free(desc); 8285 spdk_bdev_unregister(bdev, NULL, NULL); 8286 return rc; 8287 } 8288 8289 /* Examine configuration before initializing I/O */ 8290 bdev_examine(bdev); 8291 8292 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8293 if (rc != 0) { 8294 bdev_close(bdev, desc); 8295 spdk_bdev_unregister(bdev, NULL, NULL); 8296 } 8297 8298 return rc; 8299 } 8300 8301 int 8302 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8303 struct spdk_bdev_module *module) 8304 { 8305 spdk_spin_lock(&bdev->internal.spinlock); 8306 8307 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8308 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8309 spdk_spin_unlock(&bdev->internal.spinlock); 8310 return -EPERM; 8311 } 8312 8313 if (desc && !desc->write) { 8314 desc->write = true; 8315 } 8316 8317 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8318 bdev->internal.claim.v1.module = module; 8319 8320 spdk_spin_unlock(&bdev->internal.spinlock); 8321 return 0; 8322 } 8323 8324 void 8325 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8326 { 8327 spdk_spin_lock(&bdev->internal.spinlock); 8328 8329 assert(bdev->internal.claim.v1.module != NULL); 8330 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8331 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8332 bdev->internal.claim.v1.module = NULL; 8333 8334 spdk_spin_unlock(&bdev->internal.spinlock); 8335 } 8336 8337 /* 8338 * Start claims v2 8339 */ 8340 8341 const char * 8342 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8343 { 8344 switch (type) { 8345 case SPDK_BDEV_CLAIM_NONE: 8346 return "not_claimed"; 8347 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8348 return "exclusive_write"; 8349 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8350 return "read_many_write_one"; 8351 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8352 return "read_many_write_none"; 8353 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8354 return "read_many_write_many"; 8355 default: 8356 break; 8357 } 8358 return "invalid_claim"; 8359 } 8360 8361 static bool 8362 claim_type_is_v2(enum spdk_bdev_claim_type type) 8363 { 8364 switch (type) { 8365 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8366 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8367 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8368 return true; 8369 default: 8370 break; 8371 } 8372 return false; 8373 } 8374 8375 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
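 * That is, READ_MANY_WRITE_ONE and READ_MANY_WRITE_SHARED promote a read-only
 * descriptor to writable, while READ_MANY_WRITE_NONE leaves it read-only.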
*/ 8376 static bool 8377 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8378 { 8379 switch (type) { 8380 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8381 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8382 return true; 8383 default: 8384 break; 8385 } 8386 return false; 8387 } 8388 8389 void 8390 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8391 { 8392 if (opts == NULL) { 8393 SPDK_ERRLOG("opts should not be NULL\n"); 8394 assert(opts != NULL); 8395 return; 8396 } 8397 if (size == 0) { 8398 SPDK_ERRLOG("size should not be zero\n"); 8399 assert(size != 0); 8400 return; 8401 } 8402 8403 memset(opts, 0, size); 8404 opts->opts_size = size; 8405 8406 #define FIELD_OK(field) \ 8407 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8408 8409 #define SET_FIELD(field, value) \ 8410 if (FIELD_OK(field)) { \ 8411 opts->field = value; \ 8412 } \ 8413 8414 SET_FIELD(shared_claim_key, 0); 8415 8416 #undef FIELD_OK 8417 #undef SET_FIELD 8418 } 8419 8420 static int 8421 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8422 { 8423 if (src->opts_size == 0) { 8424 SPDK_ERRLOG("size should not be zero\n"); 8425 return -1; 8426 } 8427 8428 memset(dst, 0, sizeof(*dst)); 8429 dst->opts_size = src->opts_size; 8430 8431 #define FIELD_OK(field) \ 8432 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8433 8434 #define SET_FIELD(field) \ 8435 if (FIELD_OK(field)) { \ 8436 dst->field = src->field; \ 8437 } \ 8438 8439 if (FIELD_OK(name)) { 8440 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8441 } 8442 8443 SET_FIELD(shared_claim_key); 8444 8445 /* You should not remove this statement, but need to update the assert statement 8446 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8447 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8448 8449 #undef FIELD_OK 8450 #undef SET_FIELD 8451 return 0; 8452 } 8453 8454 /* Returns 0 if a read-write-once claim can be taken. */ 8455 static int 8456 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8457 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8458 { 8459 struct spdk_bdev *bdev = desc->bdev; 8460 struct spdk_bdev_desc *open_desc; 8461 8462 assert(spdk_spin_held(&bdev->internal.spinlock)); 8463 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8464 8465 if (opts->shared_claim_key != 0) { 8466 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8467 bdev->name); 8468 return -EINVAL; 8469 } 8470 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8471 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8472 return -EPERM; 8473 } 8474 if (desc->claim != NULL) { 8475 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8476 bdev->name, desc->claim->module->name); 8477 return -EPERM; 8478 } 8479 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8480 if (desc != open_desc && open_desc->write) { 8481 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8482 "another descriptor is open for writing\n", 8483 bdev->name); 8484 return -EPERM; 8485 } 8486 } 8487 8488 return 0; 8489 } 8490 8491 /* Returns 0 if a read-only-many claim can be taken. 
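 * The descriptor must have been opened read-only, opts->shared_claim_key must be
 * zero, and (if the bdev is not yet claimed) no other descriptor may currently be
 * open for writing.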
*/ 8492 static int 8493 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8494 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8495 { 8496 struct spdk_bdev *bdev = desc->bdev; 8497 struct spdk_bdev_desc *open_desc; 8498 8499 assert(spdk_spin_held(&bdev->internal.spinlock)); 8500 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 8501 assert(desc->claim == NULL); 8502 8503 if (desc->write) { 8504 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 8505 bdev->name); 8506 return -EINVAL; 8507 } 8508 if (opts->shared_claim_key != 0) { 8509 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 8510 return -EINVAL; 8511 } 8512 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8513 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8514 if (open_desc->write) { 8515 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 8516 "another descriptor is open for writing\n", 8517 bdev->name); 8518 return -EPERM; 8519 } 8520 } 8521 } 8522 8523 return 0; 8524 } 8525 8526 /* Returns 0 if a read-write-many claim can be taken. */ 8527 static int 8528 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8529 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8530 { 8531 struct spdk_bdev *bdev = desc->bdev; 8532 struct spdk_bdev_desc *open_desc; 8533 8534 assert(spdk_spin_held(&bdev->internal.spinlock)); 8535 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 8536 assert(desc->claim == NULL); 8537 8538 if (opts->shared_claim_key == 0) { 8539 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 8540 bdev->name); 8541 return -EINVAL; 8542 } 8543 switch (bdev->internal.claim_type) { 8544 case SPDK_BDEV_CLAIM_NONE: 8545 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8546 if (open_desc == desc) { 8547 continue; 8548 } 8549 if (open_desc->write) { 8550 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 8551 "another descriptor is open for writing without a " 8552 "claim\n", bdev->name); 8553 return -EPERM; 8554 } 8555 } 8556 break; 8557 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8558 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 8559 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 8560 return -EPERM; 8561 } 8562 break; 8563 default: 8564 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8565 return -EBUSY; 8566 } 8567 8568 return 0; 8569 } 8570 8571 /* Updates desc and its bdev with a v2 claim.
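 * A claim entry is allocated, appended to bdev->internal.claim.v2.claims and, for
 * claim types that promote to write, the descriptor is upgraded to writable.
 *
 * Illustrative caller-side sketch (the module variable and key value are
 * hypothetical):
 *
 *   struct spdk_bdev_claim_opts opts;
 *
 *   spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *   opts.shared_claim_key = 0x1234;
 *   rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED,
 *                                         &opts, &my_bdev_module);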
*/ 8572 static int 8573 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8574 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8575 { 8576 struct spdk_bdev *bdev = desc->bdev; 8577 struct spdk_bdev_module_claim *claim; 8578 8579 assert(spdk_spin_held(&bdev->internal.spinlock)); 8580 assert(claim_type_is_v2(type)); 8581 assert(desc->claim == NULL); 8582 8583 claim = calloc(1, sizeof(*desc->claim)); 8584 if (claim == NULL) { 8585 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8586 return -ENOMEM; 8587 } 8588 claim->module = module; 8589 claim->desc = desc; 8590 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8591 memcpy(claim->name, opts->name, sizeof(claim->name)); 8592 desc->claim = claim; 8593 8594 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8595 bdev->internal.claim_type = type; 8596 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8597 bdev->internal.claim.v2.key = opts->shared_claim_key; 8598 } 8599 assert(type == bdev->internal.claim_type); 8600 8601 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8602 8603 if (!desc->write && claim_type_promotes_to_write(type)) { 8604 desc->write = true; 8605 } 8606 8607 return 0; 8608 } 8609 8610 int 8611 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8612 struct spdk_bdev_claim_opts *_opts, 8613 struct spdk_bdev_module *module) 8614 { 8615 struct spdk_bdev *bdev; 8616 struct spdk_bdev_claim_opts opts; 8617 int rc = 0; 8618 8619 if (desc == NULL) { 8620 SPDK_ERRLOG("descriptor must not be NULL\n"); 8621 return -EINVAL; 8622 } 8623 8624 bdev = desc->bdev; 8625 8626 if (_opts == NULL) { 8627 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8628 } else if (claim_opts_copy(_opts, &opts) != 0) { 8629 return -EINVAL; 8630 } 8631 8632 spdk_spin_lock(&bdev->internal.spinlock); 8633 8634 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8635 bdev->internal.claim_type != type) { 8636 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8637 spdk_spin_unlock(&bdev->internal.spinlock); 8638 return -EPERM; 8639 } 8640 8641 if (claim_type_is_v2(type) && desc->claim != NULL) { 8642 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8643 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8644 spdk_spin_unlock(&bdev->internal.spinlock); 8645 return -EPERM; 8646 } 8647 8648 switch (type) { 8649 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8650 spdk_spin_unlock(&bdev->internal.spinlock); 8651 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8652 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8653 rc = claim_verify_rwo(desc, type, &opts, module); 8654 break; 8655 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8656 rc = claim_verify_rom(desc, type, &opts, module); 8657 break; 8658 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8659 rc = claim_verify_rwm(desc, type, &opts, module); 8660 break; 8661 default: 8662 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8663 rc = -ENOTSUP; 8664 } 8665 8666 if (rc == 0) { 8667 rc = claim_bdev(desc, type, &opts, module); 8668 } 8669 8670 spdk_spin_unlock(&bdev->internal.spinlock); 8671 return rc; 8672 } 8673 8674 static void 8675 claim_reset(struct spdk_bdev *bdev) 8676 { 8677 assert(spdk_spin_held(&bdev->internal.spinlock)); 8678 assert(claim_type_is_v2(bdev->internal.claim_type)); 8679 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8680 8681 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8682 
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8683 } 8684 8685 static void 8686 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8687 { 8688 struct spdk_bdev *bdev = desc->bdev; 8689 8690 assert(spdk_spin_held(&bdev->internal.spinlock)); 8691 assert(claim_type_is_v2(bdev->internal.claim_type)); 8692 8693 if (bdev->internal.examine_in_progress == 0) { 8694 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8695 free(desc->claim); 8696 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8697 claim_reset(bdev); 8698 } 8699 } else { 8700 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8701 desc->claim->module = NULL; 8702 desc->claim->desc = NULL; 8703 } 8704 desc->claim = NULL; 8705 } 8706 8707 /* 8708 * End claims v2 8709 */ 8710 8711 struct spdk_bdev * 8712 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8713 { 8714 assert(desc != NULL); 8715 return desc->bdev; 8716 } 8717 8718 int 8719 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8720 { 8721 struct spdk_bdev *bdev, *tmp; 8722 struct spdk_bdev_desc *desc; 8723 int rc = 0; 8724 8725 assert(fn != NULL); 8726 8727 spdk_spin_lock(&g_bdev_mgr.spinlock); 8728 bdev = spdk_bdev_first(); 8729 while (bdev != NULL) { 8730 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8731 if (rc != 0) { 8732 break; 8733 } 8734 rc = bdev_open(bdev, false, desc); 8735 if (rc != 0) { 8736 bdev_desc_free(desc); 8737 if (rc == -ENODEV) { 8738 /* Ignore the error and move to the next bdev. */ 8739 rc = 0; 8740 bdev = spdk_bdev_next(bdev); 8741 continue; 8742 } 8743 break; 8744 } 8745 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8746 8747 rc = fn(ctx, bdev); 8748 8749 spdk_spin_lock(&g_bdev_mgr.spinlock); 8750 tmp = spdk_bdev_next(bdev); 8751 bdev_close(bdev, desc); 8752 if (rc != 0) { 8753 break; 8754 } 8755 bdev = tmp; 8756 } 8757 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8758 8759 return rc; 8760 } 8761 8762 int 8763 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8764 { 8765 struct spdk_bdev *bdev, *tmp; 8766 struct spdk_bdev_desc *desc; 8767 int rc = 0; 8768 8769 assert(fn != NULL); 8770 8771 spdk_spin_lock(&g_bdev_mgr.spinlock); 8772 bdev = spdk_bdev_first_leaf(); 8773 while (bdev != NULL) { 8774 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8775 if (rc != 0) { 8776 break; 8777 } 8778 rc = bdev_open(bdev, false, desc); 8779 if (rc != 0) { 8780 bdev_desc_free(desc); 8781 if (rc == -ENODEV) { 8782 /* Ignore the error and move to the next bdev. 
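 * (-ENODEV here simply means the bdev began unregistering between
 * spdk_bdev_first_leaf()/spdk_bdev_next_leaf() and bdev_open(), so it is safe to skip.)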
*/ 8783 rc = 0; 8784 bdev = spdk_bdev_next_leaf(bdev); 8785 continue; 8786 } 8787 break; 8788 } 8789 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8790 8791 rc = fn(ctx, bdev); 8792 8793 spdk_spin_lock(&g_bdev_mgr.spinlock); 8794 tmp = spdk_bdev_next_leaf(bdev); 8795 bdev_close(bdev, desc); 8796 if (rc != 0) { 8797 break; 8798 } 8799 bdev = tmp; 8800 } 8801 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8802 8803 return rc; 8804 } 8805 8806 void 8807 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8808 { 8809 struct iovec *iovs; 8810 int iovcnt; 8811 8812 if (bdev_io == NULL) { 8813 return; 8814 } 8815 8816 switch (bdev_io->type) { 8817 case SPDK_BDEV_IO_TYPE_READ: 8818 case SPDK_BDEV_IO_TYPE_WRITE: 8819 case SPDK_BDEV_IO_TYPE_ZCOPY: 8820 iovs = bdev_io->u.bdev.iovs; 8821 iovcnt = bdev_io->u.bdev.iovcnt; 8822 break; 8823 default: 8824 iovs = NULL; 8825 iovcnt = 0; 8826 break; 8827 } 8828 8829 if (iovp) { 8830 *iovp = iovs; 8831 } 8832 if (iovcntp) { 8833 *iovcntp = iovcnt; 8834 } 8835 } 8836 8837 void * 8838 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8839 { 8840 if (bdev_io == NULL) { 8841 return NULL; 8842 } 8843 8844 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8845 return NULL; 8846 } 8847 8848 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8849 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8850 return bdev_io->u.bdev.md_buf; 8851 } 8852 8853 return NULL; 8854 } 8855 8856 void * 8857 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 8858 { 8859 if (bdev_io == NULL) { 8860 assert(false); 8861 return NULL; 8862 } 8863 8864 return bdev_io->internal.caller_ctx; 8865 } 8866 8867 void 8868 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 8869 { 8870 8871 if (spdk_bdev_module_list_find(bdev_module->name)) { 8872 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 8873 assert(false); 8874 } 8875 8876 spdk_spin_init(&bdev_module->internal.spinlock); 8877 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 8878 8879 /* 8880 * Modules with examine callbacks must be initialized first, so they are 8881 * ready to handle examine callbacks from later modules that will 8882 * register physical bdevs. 
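 * Modules normally reach this function through SPDK_BDEV_MODULE_REGISTER(). A
 * minimal, hypothetical example of a module that participates in examine:
 *
 *   static struct spdk_bdev_module my_if = {
 *       .name = "my_bdev",
 *       .module_init = my_bdev_init,
 *       .examine_config = my_bdev_examine,
 *   };
 *   SPDK_BDEV_MODULE_REGISTER(my_bdev, &my_if)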
8883 */ 8884 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8885 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8886 } else { 8887 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8888 } 8889 } 8890 8891 struct spdk_bdev_module * 8892 spdk_bdev_module_list_find(const char *name) 8893 { 8894 struct spdk_bdev_module *bdev_module; 8895 8896 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8897 if (strcmp(name, bdev_module->name) == 0) { 8898 break; 8899 } 8900 } 8901 8902 return bdev_module; 8903 } 8904 8905 static int 8906 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 8907 { 8908 uint64_t num_blocks; 8909 void *md_buf = NULL; 8910 8911 num_blocks = bdev_io->u.bdev.num_blocks; 8912 8913 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8914 md_buf = (char *)g_bdev_mgr.zero_buffer + 8915 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8916 } 8917 8918 return bdev_write_blocks_with_md(bdev_io->internal.desc, 8919 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8920 g_bdev_mgr.zero_buffer, md_buf, 8921 bdev_io->u.bdev.offset_blocks, num_blocks, 8922 bdev_write_zero_buffer_done, bdev_io); 8923 } 8924 8925 static void 8926 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8927 { 8928 struct spdk_bdev_io *parent_io = cb_arg; 8929 8930 spdk_bdev_free_io(bdev_io); 8931 8932 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 8933 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 8934 } 8935 8936 static void 8937 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8938 { 8939 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8940 ctx->bdev->internal.qos_mod_in_progress = false; 8941 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8942 8943 if (ctx->cb_fn) { 8944 ctx->cb_fn(ctx->cb_arg, status); 8945 } 8946 free(ctx); 8947 } 8948 8949 static void 8950 bdev_disable_qos_done(void *cb_arg) 8951 { 8952 struct set_qos_limit_ctx *ctx = cb_arg; 8953 struct spdk_bdev *bdev = ctx->bdev; 8954 struct spdk_bdev_io *bdev_io; 8955 struct spdk_bdev_qos *qos; 8956 8957 spdk_spin_lock(&bdev->internal.spinlock); 8958 qos = bdev->internal.qos; 8959 bdev->internal.qos = NULL; 8960 spdk_spin_unlock(&bdev->internal.spinlock); 8961 8962 while (!TAILQ_EMPTY(&qos->queued)) { 8963 /* Send queued I/O back to their original thread for resubmission. */ 8964 bdev_io = TAILQ_FIRST(&qos->queued); 8965 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8966 8967 if (bdev_io->internal.io_submit_ch) { 8968 /* 8969 * Channel was changed when sending it to the QoS thread - change it back 8970 * before sending it back to the original thread. 
8971 */ 8972 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8973 bdev_io->internal.io_submit_ch = NULL; 8974 } 8975 8976 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8977 _bdev_io_submit, bdev_io); 8978 } 8979 8980 if (qos->thread != NULL) { 8981 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8982 spdk_poller_unregister(&qos->poller); 8983 } 8984 8985 free(qos); 8986 8987 bdev_set_qos_limit_done(ctx, 0); 8988 } 8989 8990 static void 8991 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8992 { 8993 struct set_qos_limit_ctx *ctx = _ctx; 8994 struct spdk_thread *thread; 8995 8996 spdk_spin_lock(&bdev->internal.spinlock); 8997 thread = bdev->internal.qos->thread; 8998 spdk_spin_unlock(&bdev->internal.spinlock); 8999 9000 if (thread != NULL) { 9001 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9002 } else { 9003 bdev_disable_qos_done(ctx); 9004 } 9005 } 9006 9007 static void 9008 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9009 struct spdk_io_channel *ch, void *_ctx) 9010 { 9011 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9012 9013 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9014 9015 spdk_bdev_for_each_channel_continue(i, 0); 9016 } 9017 9018 static void 9019 bdev_update_qos_rate_limit_msg(void *cb_arg) 9020 { 9021 struct set_qos_limit_ctx *ctx = cb_arg; 9022 struct spdk_bdev *bdev = ctx->bdev; 9023 9024 spdk_spin_lock(&bdev->internal.spinlock); 9025 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9026 spdk_spin_unlock(&bdev->internal.spinlock); 9027 9028 bdev_set_qos_limit_done(ctx, 0); 9029 } 9030 9031 static void 9032 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9033 struct spdk_io_channel *ch, void *_ctx) 9034 { 9035 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9036 9037 spdk_spin_lock(&bdev->internal.spinlock); 9038 bdev_enable_qos(bdev, bdev_ch); 9039 spdk_spin_unlock(&bdev->internal.spinlock); 9040 spdk_bdev_for_each_channel_continue(i, 0); 9041 } 9042 9043 static void 9044 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9045 { 9046 struct set_qos_limit_ctx *ctx = _ctx; 9047 9048 bdev_set_qos_limit_done(ctx, status); 9049 } 9050 9051 static void 9052 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9053 { 9054 int i; 9055 9056 assert(bdev->internal.qos != NULL); 9057 9058 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9059 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9060 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9061 9062 if (limits[i] == 0) { 9063 bdev->internal.qos->rate_limits[i].limit = 9064 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9065 } 9066 } 9067 } 9068 } 9069 9070 void 9071 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9072 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9073 { 9074 struct set_qos_limit_ctx *ctx; 9075 uint32_t limit_set_complement; 9076 uint64_t min_limit_per_sec; 9077 int i; 9078 bool disable_rate_limit = true; 9079 9080 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9081 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9082 continue; 9083 } 9084 9085 if (limits[i] > 0) { 9086 disable_rate_limit = false; 9087 } 9088 9089 if (bdev_qos_is_iops_rate_limit(i) == true) { 9090 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9091 } else { 9092 /* Change from megabyte to byte rate limit */ 9093 limits[i] = limits[i] * 1024 * 1024; 9094 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 
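/* Any limit that is not a multiple of min_limit_per_sec is rounded up below;
 * e.g. if the minimum granularity were 1000 IO/s, a requested limit of 1500 IO/s
 * would be rounded up to 2000 IO/s (hypothetical values for illustration).
 */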
9095 } 9096 9097 limit_set_complement = limits[i] % min_limit_per_sec; 9098 if (limit_set_complement) { 9099 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9100 limits[i], min_limit_per_sec); 9101 limits[i] += min_limit_per_sec - limit_set_complement; 9102 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9103 } 9104 } 9105 9106 ctx = calloc(1, sizeof(*ctx)); 9107 if (ctx == NULL) { 9108 cb_fn(cb_arg, -ENOMEM); 9109 return; 9110 } 9111 9112 ctx->cb_fn = cb_fn; 9113 ctx->cb_arg = cb_arg; 9114 ctx->bdev = bdev; 9115 9116 spdk_spin_lock(&bdev->internal.spinlock); 9117 if (bdev->internal.qos_mod_in_progress) { 9118 spdk_spin_unlock(&bdev->internal.spinlock); 9119 free(ctx); 9120 cb_fn(cb_arg, -EAGAIN); 9121 return; 9122 } 9123 bdev->internal.qos_mod_in_progress = true; 9124 9125 if (disable_rate_limit == true && bdev->internal.qos) { 9126 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9127 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9128 (bdev->internal.qos->rate_limits[i].limit > 0 && 9129 bdev->internal.qos->rate_limits[i].limit != 9130 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9131 disable_rate_limit = false; 9132 break; 9133 } 9134 } 9135 } 9136 9137 if (disable_rate_limit == false) { 9138 if (bdev->internal.qos == NULL) { 9139 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9140 if (!bdev->internal.qos) { 9141 spdk_spin_unlock(&bdev->internal.spinlock); 9142 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9143 bdev_set_qos_limit_done(ctx, -ENOMEM); 9144 return; 9145 } 9146 } 9147 9148 if (bdev->internal.qos->thread == NULL) { 9149 /* Enabling */ 9150 bdev_set_qos_rate_limits(bdev, limits); 9151 9152 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9153 bdev_enable_qos_done); 9154 } else { 9155 /* Updating */ 9156 bdev_set_qos_rate_limits(bdev, limits); 9157 9158 spdk_thread_send_msg(bdev->internal.qos->thread, 9159 bdev_update_qos_rate_limit_msg, ctx); 9160 } 9161 } else { 9162 if (bdev->internal.qos != NULL) { 9163 bdev_set_qos_rate_limits(bdev, limits); 9164 9165 /* Disabling */ 9166 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9167 bdev_disable_qos_msg_done); 9168 } else { 9169 spdk_spin_unlock(&bdev->internal.spinlock); 9170 bdev_set_qos_limit_done(ctx, 0); 9171 return; 9172 } 9173 } 9174 9175 spdk_spin_unlock(&bdev->internal.spinlock); 9176 } 9177 9178 struct spdk_bdev_histogram_ctx { 9179 spdk_bdev_histogram_status_cb cb_fn; 9180 void *cb_arg; 9181 struct spdk_bdev *bdev; 9182 int status; 9183 }; 9184 9185 static void 9186 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9187 { 9188 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9189 9190 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9191 ctx->bdev->internal.histogram_in_progress = false; 9192 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9193 ctx->cb_fn(ctx->cb_arg, ctx->status); 9194 free(ctx); 9195 } 9196 9197 static void 9198 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9199 struct spdk_io_channel *_ch, void *_ctx) 9200 { 9201 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9202 9203 if (ch->histogram != NULL) { 9204 spdk_histogram_data_free(ch->histogram); 9205 ch->histogram = NULL; 9206 } 9207 spdk_bdev_for_each_channel_continue(i, 0); 9208 } 9209 9210 static void 9211 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9212 { 9213 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9214 9215 if (status 
!= 0) { 9216 ctx->status = status; 9217 ctx->bdev->internal.histogram_enabled = false; 9218 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9219 bdev_histogram_disable_channel_cb); 9220 } else { 9221 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9222 ctx->bdev->internal.histogram_in_progress = false; 9223 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9224 ctx->cb_fn(ctx->cb_arg, ctx->status); 9225 free(ctx); 9226 } 9227 } 9228 9229 static void 9230 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9231 struct spdk_io_channel *_ch, void *_ctx) 9232 { 9233 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9234 int status = 0; 9235 9236 if (ch->histogram == NULL) { 9237 ch->histogram = spdk_histogram_data_alloc(); 9238 if (ch->histogram == NULL) { 9239 status = -ENOMEM; 9240 } 9241 } 9242 9243 spdk_bdev_for_each_channel_continue(i, status); 9244 } 9245 9246 void 9247 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9248 void *cb_arg, bool enable) 9249 { 9250 struct spdk_bdev_histogram_ctx *ctx; 9251 9252 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9253 if (ctx == NULL) { 9254 cb_fn(cb_arg, -ENOMEM); 9255 return; 9256 } 9257 9258 ctx->bdev = bdev; 9259 ctx->status = 0; 9260 ctx->cb_fn = cb_fn; 9261 ctx->cb_arg = cb_arg; 9262 9263 spdk_spin_lock(&bdev->internal.spinlock); 9264 if (bdev->internal.histogram_in_progress) { 9265 spdk_spin_unlock(&bdev->internal.spinlock); 9266 free(ctx); 9267 cb_fn(cb_arg, -EAGAIN); 9268 return; 9269 } 9270 9271 bdev->internal.histogram_in_progress = true; 9272 spdk_spin_unlock(&bdev->internal.spinlock); 9273 9274 bdev->internal.histogram_enabled = enable; 9275 9276 if (enable) { 9277 /* Allocate histogram for each channel */ 9278 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9279 bdev_histogram_enable_channel_cb); 9280 } else { 9281 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9282 bdev_histogram_disable_channel_cb); 9283 } 9284 } 9285 9286 struct spdk_bdev_histogram_data_ctx { 9287 spdk_bdev_histogram_data_cb cb_fn; 9288 void *cb_arg; 9289 struct spdk_bdev *bdev; 9290 /** merged histogram data from all channels */ 9291 struct spdk_histogram_data *histogram; 9292 }; 9293 9294 static void 9295 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9296 { 9297 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9298 9299 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9300 free(ctx); 9301 } 9302 9303 static void 9304 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9305 struct spdk_io_channel *_ch, void *_ctx) 9306 { 9307 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9308 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9309 int status = 0; 9310 9311 if (ch->histogram == NULL) { 9312 status = -EFAULT; 9313 } else { 9314 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9315 } 9316 9317 spdk_bdev_for_each_channel_continue(i, status); 9318 } 9319 9320 void 9321 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9322 spdk_bdev_histogram_data_cb cb_fn, 9323 void *cb_arg) 9324 { 9325 struct spdk_bdev_histogram_data_ctx *ctx; 9326 9327 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9328 if (ctx == NULL) { 9329 cb_fn(cb_arg, -ENOMEM, NULL); 9330 return; 9331 } 9332 9333 ctx->bdev = bdev; 9334 ctx->cb_fn = cb_fn; 9335 ctx->cb_arg = cb_arg; 9336 9337 ctx->histogram = histogram; 
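/* The caller-provided histogram accumulates the data merged from every channel in
 * bdev_histogram_get_channel(); cb_fn is invoked once the iteration over all
 * channels completes.
 */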
9338 9339 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9340 bdev_histogram_get_channel_cb); 9341 } 9342 9343 void 9344 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9345 void *cb_arg) 9346 { 9347 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9348 int status = 0; 9349 9350 assert(cb_fn != NULL); 9351 9352 if (bdev_ch->histogram == NULL) { 9353 status = -EFAULT; 9354 } 9355 cb_fn(cb_arg, status, bdev_ch->histogram); 9356 } 9357 9358 size_t 9359 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9360 size_t max_events) 9361 { 9362 struct media_event_entry *entry; 9363 size_t num_events = 0; 9364 9365 for (; num_events < max_events; ++num_events) { 9366 entry = TAILQ_FIRST(&desc->pending_media_events); 9367 if (entry == NULL) { 9368 break; 9369 } 9370 9371 events[num_events] = entry->event; 9372 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9373 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9374 } 9375 9376 return num_events; 9377 } 9378 9379 int 9380 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9381 size_t num_events) 9382 { 9383 struct spdk_bdev_desc *desc; 9384 struct media_event_entry *entry; 9385 size_t event_id; 9386 int rc = 0; 9387 9388 assert(bdev->media_events); 9389 9390 spdk_spin_lock(&bdev->internal.spinlock); 9391 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9392 if (desc->write) { 9393 break; 9394 } 9395 } 9396 9397 if (desc == NULL || desc->media_events_buffer == NULL) { 9398 rc = -ENODEV; 9399 goto out; 9400 } 9401 9402 for (event_id = 0; event_id < num_events; ++event_id) { 9403 entry = TAILQ_FIRST(&desc->free_media_events); 9404 if (entry == NULL) { 9405 break; 9406 } 9407 9408 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9409 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9410 entry->event = events[event_id]; 9411 } 9412 9413 rc = event_id; 9414 out: 9415 spdk_spin_unlock(&bdev->internal.spinlock); 9416 return rc; 9417 } 9418 9419 static void 9420 _media_management_notify(void *arg) 9421 { 9422 struct spdk_bdev_desc *desc = arg; 9423 9424 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9425 } 9426 9427 void 9428 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9429 { 9430 struct spdk_bdev_desc *desc; 9431 9432 spdk_spin_lock(&bdev->internal.spinlock); 9433 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9434 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9435 event_notify(desc, _media_management_notify); 9436 } 9437 } 9438 spdk_spin_unlock(&bdev->internal.spinlock); 9439 } 9440 9441 struct locked_lba_range_ctx { 9442 struct lba_range range; 9443 struct lba_range *current_range; 9444 struct lba_range *owner_range; 9445 struct spdk_poller *poller; 9446 lock_range_cb cb_fn; 9447 void *cb_arg; 9448 }; 9449 9450 static void 9451 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9452 { 9453 struct locked_lba_range_ctx *ctx = _ctx; 9454 9455 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9456 free(ctx); 9457 } 9458 9459 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9460 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9461 9462 static void 9463 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9464 { 9465 struct locked_lba_range_ctx *ctx = _ctx; 9466 9467 if (status == -ENOMEM) { 9468 /* One of the channels could not allocate a 
range object. 9469 * So we have to go back and clean up any ranges that were 9470 * allocated successfully before we return error status to 9471 * the caller. We can reuse the unlock function to do that 9472 * clean up. 9473 */ 9474 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9475 bdev_lock_error_cleanup_cb); 9476 return; 9477 } 9478 9479 /* All channels have locked this range and no I/O overlapping the range 9480 * are outstanding! Set the owner_ch for the range object for the 9481 * locking channel, so that this channel will know that it is allowed 9482 * to write to this range. 9483 */ 9484 if (ctx->owner_range != NULL) { 9485 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9486 } 9487 9488 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9489 9490 /* Don't free the ctx here. Its range is in the bdev's global list of 9491 * locked ranges still, and will be removed and freed when this range 9492 * is later unlocked. 9493 */ 9494 } 9495 9496 static int 9497 bdev_lock_lba_range_check_io(void *_i) 9498 { 9499 struct spdk_bdev_channel_iter *i = _i; 9500 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9501 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9502 struct locked_lba_range_ctx *ctx = i->ctx; 9503 struct lba_range *range = ctx->current_range; 9504 struct spdk_bdev_io *bdev_io; 9505 9506 spdk_poller_unregister(&ctx->poller); 9507 9508 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9509 * range. But we need to wait until any outstanding IO overlapping with this range 9510 * are completed. 9511 */ 9512 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9513 if (bdev_io_range_is_locked(bdev_io, range)) { 9514 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9515 return SPDK_POLLER_BUSY; 9516 } 9517 } 9518 9519 spdk_bdev_for_each_channel_continue(i, 0); 9520 return SPDK_POLLER_BUSY; 9521 } 9522 9523 static void 9524 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9525 struct spdk_io_channel *_ch, void *_ctx) 9526 { 9527 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9528 struct locked_lba_range_ctx *ctx = _ctx; 9529 struct lba_range *range; 9530 9531 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9532 if (range->length == ctx->range.length && 9533 range->offset == ctx->range.offset && 9534 range->locked_ctx == ctx->range.locked_ctx) { 9535 /* This range already exists on this channel, so don't add 9536 * it again. This can happen when a new channel is created 9537 * while the for_each_channel operation is in progress. 9538 * Do not check for outstanding I/O in that case, since the 9539 * range was locked before any I/O could be submitted to the 9540 * new channel. 9541 */ 9542 spdk_bdev_for_each_channel_continue(i, 0); 9543 return; 9544 } 9545 } 9546 9547 range = calloc(1, sizeof(*range)); 9548 if (range == NULL) { 9549 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9550 return; 9551 } 9552 9553 range->length = ctx->range.length; 9554 range->offset = ctx->range.offset; 9555 range->locked_ctx = ctx->range.locked_ctx; 9556 ctx->current_range = range; 9557 if (ctx->range.owner_ch == ch) { 9558 /* This is the range object for the channel that will hold 9559 * the lock. Store it in the ctx object so that we can easily 9560 * set its owner_ch after the lock is finally acquired. 
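 * Until every channel has locked the range, the lock is not yet usable by
 * the owner: owner_ch is only filled in by bdev_lock_lba_range_cb() once
 * the for_each_channel iteration completes successfully.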
9561 */ 9562 ctx->owner_range = range; 9563 } 9564 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9565 bdev_lock_lba_range_check_io(i); 9566 } 9567 9568 static void 9569 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9570 { 9571 assert(spdk_get_thread() == ctx->range.owner_thread); 9572 assert(ctx->range.owner_ch == NULL || 9573 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 9574 9575 /* We will add a copy of this range to each channel now. */ 9576 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9577 bdev_lock_lba_range_cb); 9578 } 9579 9580 static bool 9581 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9582 { 9583 struct lba_range *r; 9584 9585 TAILQ_FOREACH(r, tailq, tailq) { 9586 if (bdev_lba_range_overlapped(range, r)) { 9587 return true; 9588 } 9589 } 9590 return false; 9591 } 9592 9593 static int 9594 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 9595 uint64_t offset, uint64_t length, 9596 lock_range_cb cb_fn, void *cb_arg) 9597 { 9598 struct locked_lba_range_ctx *ctx; 9599 9600 ctx = calloc(1, sizeof(*ctx)); 9601 if (ctx == NULL) { 9602 return -ENOMEM; 9603 } 9604 9605 ctx->range.offset = offset; 9606 ctx->range.length = length; 9607 ctx->range.owner_thread = spdk_get_thread(); 9608 ctx->range.owner_ch = ch; 9609 ctx->range.locked_ctx = cb_arg; 9610 ctx->range.bdev = bdev; 9611 ctx->cb_fn = cb_fn; 9612 ctx->cb_arg = cb_arg; 9613 9614 spdk_spin_lock(&bdev->internal.spinlock); 9615 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 9616 /* There is an active lock overlapping with this range. 9617 * Put it on the pending list until this range no 9618 * longer overlaps with another. 9619 */ 9620 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 9621 } else { 9622 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 9623 bdev_lock_lba_range_ctx(bdev, ctx); 9624 } 9625 spdk_spin_unlock(&bdev->internal.spinlock); 9626 return 0; 9627 } 9628 9629 static int 9630 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9631 uint64_t offset, uint64_t length, 9632 lock_range_cb cb_fn, void *cb_arg) 9633 { 9634 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9635 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9636 9637 if (cb_arg == NULL) { 9638 SPDK_ERRLOG("cb_arg must not be NULL\n"); 9639 return -EINVAL; 9640 } 9641 9642 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 9643 } 9644 9645 static void 9646 bdev_lock_lba_range_ctx_msg(void *_ctx) 9647 { 9648 struct locked_lba_range_ctx *ctx = _ctx; 9649 9650 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 9651 } 9652 9653 static void 9654 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9655 { 9656 struct locked_lba_range_ctx *ctx = _ctx; 9657 struct locked_lba_range_ctx *pending_ctx; 9658 struct lba_range *range, *tmp; 9659 9660 spdk_spin_lock(&bdev->internal.spinlock); 9661 /* Check if there are any pending locked ranges that overlap with this range 9662 * that was just unlocked. If there are, check that it doesn't overlap with any 9663 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 9664 * the lock process. 
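 * The pending lock is restarted by sending a message to its owner thread,
 * since bdev_lock_lba_range_ctx() must run on the thread that originally
 * requested the lock.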
9665 */ 9666 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 9667 if (bdev_lba_range_overlapped(range, &ctx->range) && 9668 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 9669 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 9670 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9671 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 9672 spdk_thread_send_msg(pending_ctx->range.owner_thread, 9673 bdev_lock_lba_range_ctx_msg, pending_ctx); 9674 } 9675 } 9676 spdk_spin_unlock(&bdev->internal.spinlock); 9677 9678 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9679 free(ctx); 9680 } 9681 9682 static void 9683 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9684 struct spdk_io_channel *_ch, void *_ctx) 9685 { 9686 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9687 struct locked_lba_range_ctx *ctx = _ctx; 9688 TAILQ_HEAD(, spdk_bdev_io) io_locked; 9689 struct spdk_bdev_io *bdev_io; 9690 struct lba_range *range; 9691 9692 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9693 if (ctx->range.offset == range->offset && 9694 ctx->range.length == range->length && 9695 ctx->range.locked_ctx == range->locked_ctx) { 9696 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 9697 free(range); 9698 break; 9699 } 9700 } 9701 9702 /* Note: we should almost always be able to assert that the range specified 9703 * was found. But there are some very rare corner cases where a new channel 9704 * gets created simultaneously with a range unlock, where this function 9705 * would execute on that new channel and wouldn't have the range. 9706 * We also use this to clean up range allocations when a later allocation 9707 * fails in the locking path. 9708 * So we can't actually assert() here. 9709 */ 9710 9711 /* Swap the locked IO into a temporary list, and then try to submit them again. 9712 * We could hyper-optimize this to only resubmit locked I/O that overlap 9713 * with the range that was just unlocked, but this isn't a performance path so 9714 * we go for simplicity here. 9715 */ 9716 TAILQ_INIT(&io_locked); 9717 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 9718 while (!TAILQ_EMPTY(&io_locked)) { 9719 bdev_io = TAILQ_FIRST(&io_locked); 9720 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 9721 bdev_io_submit(bdev_io); 9722 } 9723 9724 spdk_bdev_for_each_channel_continue(i, 0); 9725 } 9726 9727 static int 9728 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 9729 lock_range_cb cb_fn, void *cb_arg) 9730 { 9731 struct locked_lba_range_ctx *ctx; 9732 struct lba_range *range; 9733 9734 spdk_spin_lock(&bdev->internal.spinlock); 9735 /* To start the unlock the process, we find the range in the bdev's locked_ranges 9736 * and remove it. This ensures new channels don't inherit the locked range. 9737 * Then we will send a message to each channel to remove the range from its 9738 * per-channel list. 
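 * bdev_unlock_lba_range_get_channel() drops each channel's copy of the
 * range and resubmits any I/O that was queued while the range was locked.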
9739 */ 9740 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 9741 if (range->offset == offset && range->length == length && 9742 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 9743 break; 9744 } 9745 } 9746 if (range == NULL) { 9747 assert(false); 9748 spdk_spin_unlock(&bdev->internal.spinlock); 9749 return -EINVAL; 9750 } 9751 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 9752 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9753 spdk_spin_unlock(&bdev->internal.spinlock); 9754 9755 ctx->cb_fn = cb_fn; 9756 ctx->cb_arg = cb_arg; 9757 9758 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9759 bdev_unlock_lba_range_cb); 9760 return 0; 9761 } 9762 9763 static int 9764 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9765 uint64_t offset, uint64_t length, 9766 lock_range_cb cb_fn, void *cb_arg) 9767 { 9768 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9769 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9770 struct lba_range *range; 9771 bool range_found = false; 9772 9773 /* Let's make sure the specified channel actually has a lock on 9774 * the specified range. Note that the range must match exactly. 9775 */ 9776 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9777 if (range->offset == offset && range->length == length && 9778 range->owner_ch == ch && range->locked_ctx == cb_arg) { 9779 range_found = true; 9780 break; 9781 } 9782 } 9783 9784 if (!range_found) { 9785 return -EINVAL; 9786 } 9787 9788 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 9789 } 9790 9791 struct bdev_quiesce_ctx { 9792 spdk_bdev_quiesce_cb cb_fn; 9793 void *cb_arg; 9794 }; 9795 9796 static void 9797 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 9798 { 9799 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9800 9801 if (quiesce_ctx->cb_fn != NULL) { 9802 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9803 } 9804 9805 free(quiesce_ctx); 9806 } 9807 9808 static void 9809 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 9810 { 9811 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9812 struct spdk_bdev_module *module = range->bdev->module; 9813 9814 if (status != 0) { 9815 if (quiesce_ctx->cb_fn != NULL) { 9816 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9817 } 9818 free(quiesce_ctx); 9819 return; 9820 } 9821 9822 spdk_spin_lock(&module->internal.spinlock); 9823 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 9824 spdk_spin_unlock(&module->internal.spinlock); 9825 9826 if (quiesce_ctx->cb_fn != NULL) { 9827 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9828 quiesce_ctx->cb_fn = NULL; 9829 quiesce_ctx->cb_arg = NULL; 9830 } 9831 /* quiesce_ctx will be freed on unquiesce */ 9832 } 9833 9834 static int 9835 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9836 uint64_t offset, uint64_t length, 9837 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 9838 bool unquiesce) 9839 { 9840 struct bdev_quiesce_ctx *quiesce_ctx; 9841 int rc; 9842 9843 if (module != bdev->module) { 9844 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 9845 return -EINVAL; 9846 } 9847 9848 if (!bdev_io_valid_blocks(bdev, offset, length)) { 9849 return -EINVAL; 9850 } 9851 9852 if (unquiesce) { 9853 struct lba_range *range; 9854 9855 /* Make sure the specified range is actually quiesced in the specified module and 9856 * then remove it from the list. Note that the range must match exactly. 
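 * The bdev_quiesce_ctx stored in range->locked_ctx at quiesce time is
 * reused below to carry the caller's unquiesce callback.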
9857 */ 9858 spdk_spin_lock(&module->internal.spinlock); 9859 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 9860 if (range->bdev == bdev && range->offset == offset && range->length == length) { 9861 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 9862 break; 9863 } 9864 } 9865 spdk_spin_unlock(&module->internal.spinlock); 9866 9867 if (range == NULL) { 9868 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 9869 return -EINVAL; 9870 } 9871 9872 quiesce_ctx = range->locked_ctx; 9873 quiesce_ctx->cb_fn = cb_fn; 9874 quiesce_ctx->cb_arg = cb_arg; 9875 9876 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 9877 } else { 9878 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 9879 if (quiesce_ctx == NULL) { 9880 return -ENOMEM; 9881 } 9882 9883 quiesce_ctx->cb_fn = cb_fn; 9884 quiesce_ctx->cb_arg = cb_arg; 9885 9886 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 9887 if (rc != 0) { 9888 free(quiesce_ctx); 9889 } 9890 } 9891 9892 return rc; 9893 } 9894 9895 int 9896 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9897 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9898 { 9899 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 9900 } 9901 9902 int 9903 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9904 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9905 { 9906 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 9907 } 9908 9909 int 9910 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9911 uint64_t offset, uint64_t length, 9912 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9913 { 9914 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 9915 } 9916 9917 int 9918 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9919 uint64_t offset, uint64_t length, 9920 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9921 { 9922 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 9923 } 9924 9925 int 9926 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 9927 int array_size) 9928 { 9929 if (!bdev) { 9930 return -EINVAL; 9931 } 9932 9933 if (bdev->fn_table->get_memory_domains) { 9934 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 9935 } 9936 9937 return 0; 9938 } 9939 9940 struct spdk_bdev_for_each_io_ctx { 9941 void *ctx; 9942 spdk_bdev_io_fn fn; 9943 spdk_bdev_for_each_io_cb cb; 9944 }; 9945 9946 static void 9947 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9948 struct spdk_io_channel *io_ch, void *_ctx) 9949 { 9950 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 9951 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 9952 struct spdk_bdev_io *bdev_io; 9953 int rc = 0; 9954 9955 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 9956 rc = ctx->fn(ctx->ctx, bdev_io); 9957 if (rc != 0) { 9958 break; 9959 } 9960 } 9961 9962 spdk_bdev_for_each_channel_continue(i, rc); 9963 } 9964 9965 static void 9966 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 9967 { 9968 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 9969 9970 ctx->cb(ctx->ctx, status); 9971 9972 free(ctx); 9973 } 9974 9975 void 9976 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 9977 spdk_bdev_for_each_io_cb cb) 9978 { 9979 
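/* Call fn() for every I/O currently outstanding on every channel of the
 * bdev. The per-channel walk in bdev_channel_for_each_io() stops at the
 * first non-zero return from fn(), and that status is reported to cb().
 */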
struct spdk_bdev_for_each_io_ctx *ctx; 9980 9981 assert(fn != NULL && cb != NULL); 9982 9983 ctx = calloc(1, sizeof(*ctx)); 9984 if (ctx == NULL) { 9985 SPDK_ERRLOG("Failed to allocate context.\n"); 9986 cb(_ctx, -ENOMEM); 9987 return; 9988 } 9989 9990 ctx->ctx = _ctx; 9991 ctx->fn = fn; 9992 ctx->cb = cb; 9993 9994 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 9995 bdev_for_each_io_done); 9996 } 9997 9998 void 9999 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10000 { 10001 spdk_for_each_channel_continue(iter->i, status); 10002 } 10003 10004 static struct spdk_bdev * 10005 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10006 { 10007 void *io_device = spdk_io_channel_iter_get_io_device(i); 10008 10009 return __bdev_from_io_dev(io_device); 10010 } 10011 10012 static void 10013 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10014 { 10015 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10016 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10017 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10018 10019 iter->i = i; 10020 iter->fn(iter, bdev, ch, iter->ctx); 10021 } 10022 10023 static void 10024 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10025 { 10026 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10027 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10028 10029 iter->i = i; 10030 iter->cpl(bdev, iter->ctx, status); 10031 10032 free(iter); 10033 } 10034 10035 void 10036 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10037 void *ctx, spdk_bdev_for_each_channel_done cpl) 10038 { 10039 struct spdk_bdev_channel_iter *iter; 10040 10041 assert(bdev != NULL && fn != NULL && ctx != NULL); 10042 10043 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10044 if (iter == NULL) { 10045 SPDK_ERRLOG("Unable to allocate iterator\n"); 10046 assert(false); 10047 return; 10048 } 10049 10050 iter->fn = fn; 10051 iter->cpl = cpl; 10052 iter->ctx = ctx; 10053 10054 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10055 iter, bdev_each_channel_cpl); 10056 } 10057 10058 static void 10059 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10060 { 10061 struct spdk_bdev_io *parent_io = cb_arg; 10062 10063 spdk_bdev_free_io(bdev_io); 10064 10065 /* Check return status of write */ 10066 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10067 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10068 } 10069 10070 static void 10071 bdev_copy_do_write(void *_bdev_io) 10072 { 10073 struct spdk_bdev_io *bdev_io = _bdev_io; 10074 int rc; 10075 10076 /* Write blocks */ 10077 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10078 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10079 bdev_io->u.bdev.iovs[0].iov_base, 10080 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10081 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10082 10083 if (rc == -ENOMEM) { 10084 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10085 } else if (rc != 0) { 10086 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10087 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10088 } 10089 } 10090 10091 static void 10092 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10093 { 10094 struct spdk_bdev_io *parent_io = cb_arg; 10095 10096 spdk_bdev_free_io(bdev_io); 10097 10098 /* Check return status of read */ 10099 if (!success) { 10100 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10101 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10102 return; 10103 } 10104 10105 /* Do write */ 10106 bdev_copy_do_write(parent_io); 10107 } 10108 10109 static void 10110 bdev_copy_do_read(void *_bdev_io) 10111 { 10112 struct spdk_bdev_io *bdev_io = _bdev_io; 10113 int rc; 10114 10115 /* Read blocks */ 10116 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10117 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10118 bdev_io->u.bdev.iovs[0].iov_base, 10119 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10120 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10121 10122 if (rc == -ENOMEM) { 10123 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10124 } else if (rc != 0) { 10125 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10126 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10127 } 10128 } 10129 10130 static void 10131 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10132 { 10133 if (!success) { 10134 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10135 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10136 return; 10137 } 10138 10139 bdev_copy_do_read(bdev_io); 10140 } 10141 10142 int 10143 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10144 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10145 spdk_bdev_io_completion_cb cb, void *cb_arg) 10146 { 10147 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10148 struct spdk_bdev_io *bdev_io; 10149 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10150 10151 if (!desc->write) { 10152 return -EBADF; 10153 } 10154 10155 if (num_blocks == 0) { 10156 SPDK_ERRLOG("Can't copy 0 blocks\n"); 10157 return -EINVAL; 10158 } 10159 10160 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10161 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10162 SPDK_DEBUGLOG(bdev, 10163 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10164 dst_offset_blocks, src_offset_blocks, num_blocks); 10165 return -EINVAL; 10166 } 10167 10168 bdev_io = bdev_channel_get_io(channel); 10169 if (!bdev_io) { 10170 return -ENOMEM; 10171 } 10172 10173 bdev_io->internal.ch = channel; 10174 
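/* Set up a generic COPY request: the destination offset is carried in
 * u.bdev.offset_blocks and the source offset in u.bdev.copy.src_offset_blocks.
 */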
bdev_io->internal.desc = desc; 10175 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10176 10177 bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10178 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10179 bdev_io->u.bdev.num_blocks = num_blocks; 10180 bdev_io->u.bdev.memory_domain = NULL; 10181 bdev_io->u.bdev.memory_domain_ctx = NULL; 10182 bdev_io->u.bdev.iovs = NULL; 10183 bdev_io->u.bdev.iovcnt = 0; 10184 bdev_io->u.bdev.md_buf = NULL; 10185 bdev_io->u.bdev.accel_sequence = NULL; 10186 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10187 10188 if (dst_offset_blocks == src_offset_blocks) { 10189 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 10190 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 10191 10192 return 0; 10193 } 10194 10195 10196 /* If the copy size is large and should be split, use the generic split logic 10197 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10198 * 10199 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10200 * emulate it using regular read and write requests otherwise. 10201 */ 10202 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10203 bdev_io->internal.split) { 10204 bdev_io_submit(bdev_io); 10205 return 0; 10206 } 10207 10208 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10209 10210 return 0; 10211 } 10212 10213 SPDK_LOG_REGISTER_COMPONENT(bdev) 10214 10215 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10216 { 10217 struct spdk_trace_tpoint_opts opts[] = { 10218 { 10219 "BDEV_IO_START", TRACE_BDEV_IO_START, 10220 OWNER_BDEV, OBJECT_BDEV_IO, 1, 10221 { 10222 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10223 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10224 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10225 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10226 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 10227 } 10228 }, 10229 { 10230 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10231 OWNER_BDEV, OBJECT_BDEV_IO, 0, 10232 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 10233 }, 10234 { 10235 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10236 OWNER_BDEV, OBJECT_NONE, 1, 10237 { 10238 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 10239 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 10240 } 10241 }, 10242 { 10243 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10244 OWNER_BDEV, OBJECT_NONE, 0, 10245 { 10246 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 10247 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 10248 } 10249 }, 10250 }; 10251 10252 10253 spdk_trace_register_owner(OWNER_BDEV, 'b'); 10254 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10255 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10256 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10257 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10258 } 10259
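
/*
 * Illustrative sketch (not part of the bdev library itself): how a bdev
 * module might use spdk_bdev_quiesce()/spdk_bdev_unquiesce() to pause I/O
 * to a bdev it registered while performing maintenance. The module object,
 * callback and helper names below are assumptions made for this example.
 *
 *	static void
 *	example_quiesce_done(void *cb_arg, int status)
 *	{
 *		struct example_ctx *ctx = cb_arg;
 *
 *		if (status != 0) {
 *			SPDK_ERRLOG("quiesce failed: %d\n", status);
 *			return;
 *		}
 *
 *		// New I/O to the bdev is now held until unquiesce; do the work.
 *		example_do_maintenance(ctx);
 *
 *		// The module passed here must be the module that owns the bdev
 *		// (bdev->module), otherwise -EINVAL is returned.
 *		spdk_bdev_unquiesce(ctx->bdev, &example_bdev_module, NULL, NULL);
 *	}
 *
 *	rc = spdk_bdev_quiesce(bdev, &example_bdev_module,
 *			       example_quiesce_done, ctx);
 */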