/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE 256
#define SPDK_BDEV_AUTO_EXAMINE true
#define BUF_SMALL_POOL_SIZE 8191
#define BUF_LARGE_POOL_SIZE 1023
#define BUF_SMALL_CACHE_SIZE 128
#define BUF_LARGE_CACHE_SIZE 16
#define NOMEM_THRESHOLD_COUNT 8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000

/* The maximum number of child requests issued at a time when a UNMAP or WRITE ZEROES
 * command is split into child requests.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of child requests issued at a time when a COPY command is split
 * into child requests.
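 * This mirrors SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS above and bounds how many child
 * copy requests are outstanding at once while a large copy is split.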
63 */ 64 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 65 66 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 67 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 68 #ifdef DEBUG 69 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 70 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 71 #else 72 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 73 #endif 74 75 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 76 const char *detail, struct spdk_bdev *bdev); 77 78 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 79 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 80 }; 81 82 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 83 84 RB_HEAD(bdev_name_tree, spdk_bdev_name); 85 86 static int 87 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 88 { 89 return strcmp(name1->name, name2->name); 90 } 91 92 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 93 94 struct spdk_bdev_mgr { 95 struct spdk_mempool *bdev_io_pool; 96 97 void *zero_buffer; 98 99 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 100 101 struct spdk_bdev_list bdevs; 102 struct bdev_name_tree bdev_names; 103 104 bool init_complete; 105 bool module_init_complete; 106 107 struct spdk_spinlock spinlock; 108 109 TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens; 110 111 #ifdef SPDK_CONFIG_VTUNE 112 __itt_domain *domain; 113 #endif 114 }; 115 116 static struct spdk_bdev_mgr g_bdev_mgr = { 117 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 118 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 119 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 120 .init_complete = false, 121 .module_init_complete = false, 122 .async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens), 123 }; 124 125 static void 126 __attribute__((constructor)) 127 _bdev_init(void) 128 { 129 spdk_spin_init(&g_bdev_mgr.spinlock); 130 } 131 132 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status); 133 134 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 135 136 struct lba_range { 137 struct spdk_bdev *bdev; 138 uint64_t offset; 139 uint64_t length; 140 bool quiesce; 141 void *locked_ctx; 142 struct spdk_thread *owner_thread; 143 struct spdk_bdev_channel *owner_ch; 144 TAILQ_ENTRY(lba_range) tailq; 145 TAILQ_ENTRY(lba_range) tailq_module; 146 }; 147 148 static struct spdk_bdev_opts g_bdev_opts = { 149 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 150 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 151 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 152 .iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE, 153 .iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE, 154 }; 155 156 static spdk_bdev_init_cb g_init_cb_fn = NULL; 157 static void *g_init_cb_arg = NULL; 158 159 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 160 static void *g_fini_cb_arg = NULL; 161 static struct spdk_thread *g_fini_thread = NULL; 162 163 struct spdk_bdev_qos_limit { 164 /** IOs or bytes allowed per second (i.e., 1s). */ 165 uint64_t limit; 166 167 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 168 * For remaining bytes, allowed to run negative if an I/O is submitted when 169 * some bytes are remaining, but the I/O is bigger than that amount. The 170 * excess will be deducted from the next timeslice. 171 */ 172 int64_t remaining_this_timeslice; 173 174 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). 
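	 * Guarantees forward progress when the configured per-second limit is small: even if the
	 * limit's per-timeslice share would round down to zero, at least this many IOs/bytes are
	 * allowed in each timeslice.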
*/ 175 uint32_t min_per_timeslice; 176 177 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 178 uint32_t max_per_timeslice; 179 180 /** Function to check whether to queue the IO. 181 * If The IO is allowed to pass, the quota will be reduced correspondingly. 182 */ 183 bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 184 185 /** Function to rewind the quota once the IO was allowed to be sent by this 186 * limit but queued due to one of the further limits. 187 */ 188 void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 189 }; 190 191 struct spdk_bdev_qos { 192 /** Types of structure of rate limits. */ 193 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 194 195 /** The channel that all I/O are funneled through. */ 196 struct spdk_bdev_channel *ch; 197 198 /** The thread on which the poller is running. */ 199 struct spdk_thread *thread; 200 201 /** Size of a timeslice in tsc ticks. */ 202 uint64_t timeslice_size; 203 204 /** Timestamp of start of last timeslice. */ 205 uint64_t last_timeslice; 206 207 /** Poller that processes queued I/O commands each time slice. */ 208 struct spdk_poller *poller; 209 }; 210 211 struct spdk_bdev_mgmt_channel { 212 /* 213 * Each thread keeps a cache of bdev_io - this allows 214 * bdev threads which are *not* DPDK threads to still 215 * benefit from a per-thread bdev_io cache. Without 216 * this, non-DPDK threads fetching from the mempool 217 * incur a cmpxchg on get and put. 218 */ 219 bdev_io_stailq_t per_thread_cache; 220 uint32_t per_thread_cache_count; 221 uint32_t bdev_io_cache_size; 222 223 struct spdk_iobuf_channel iobuf; 224 225 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 226 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 227 }; 228 229 /* 230 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 231 * will queue here their IO that awaits retry. It makes it possible to retry sending 232 * IO to one bdev after IO from other bdev completes. 233 */ 234 struct spdk_bdev_shared_resource { 235 /* The bdev management channel */ 236 struct spdk_bdev_mgmt_channel *mgmt_ch; 237 238 /* 239 * Count of I/O submitted to bdev module and waiting for completion. 240 * Incremented before submit_request() is called on an spdk_bdev_io. 241 */ 242 uint64_t io_outstanding; 243 244 /* 245 * Queue of IO awaiting retry because of a previous NOMEM status returned 246 * on this channel. 247 */ 248 bdev_io_tailq_t nomem_io; 249 250 /* 251 * Threshold which io_outstanding must drop to before retrying nomem_io. 252 */ 253 uint64_t nomem_threshold; 254 255 /* I/O channel allocated by a bdev module */ 256 struct spdk_io_channel *shared_ch; 257 258 struct spdk_poller *nomem_poller; 259 260 /* Refcount of bdev channels using this resource */ 261 uint32_t ref; 262 263 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 264 }; 265 266 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 267 #define BDEV_CH_QOS_ENABLED (1 << 1) 268 269 struct spdk_bdev_channel { 270 struct spdk_bdev *bdev; 271 272 /* The channel for the underlying device */ 273 struct spdk_io_channel *channel; 274 275 /* Accel channel */ 276 struct spdk_io_channel *accel_channel; 277 278 /* Per io_device per thread data */ 279 struct spdk_bdev_shared_resource *shared_resource; 280 281 struct spdk_bdev_io_stat *stat; 282 283 /* 284 * Count of I/O submitted to the underlying dev module through this channel 285 * and waiting for completion. 
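	 * The shared_resource->io_outstanding counter, by contrast, aggregates the I/O of every
	 * bdev channel on this thread that is built on the same underlying io_device channel.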
286 */ 287 uint64_t io_outstanding; 288 289 /* 290 * List of all submitted I/Os including I/O that are generated via splitting. 291 */ 292 bdev_io_tailq_t io_submitted; 293 294 /* 295 * List of spdk_bdev_io that are currently queued because they write to a locked 296 * LBA range. 297 */ 298 bdev_io_tailq_t io_locked; 299 300 /* List of I/Os with accel sequence being currently executed */ 301 bdev_io_tailq_t io_accel_exec; 302 303 /* List of I/Os doing memory domain pull/push */ 304 bdev_io_tailq_t io_memory_domain; 305 306 uint32_t flags; 307 308 struct spdk_histogram_data *histogram; 309 310 #ifdef SPDK_CONFIG_VTUNE 311 uint64_t start_tsc; 312 uint64_t interval_tsc; 313 __itt_string_handle *handle; 314 struct spdk_bdev_io_stat *prev_stat; 315 #endif 316 317 bdev_io_tailq_t queued_resets; 318 319 lba_range_tailq_t locked_ranges; 320 321 /** List of I/Os queued by QoS. */ 322 bdev_io_tailq_t qos_queued_io; 323 }; 324 325 struct media_event_entry { 326 struct spdk_bdev_media_event event; 327 TAILQ_ENTRY(media_event_entry) tailq; 328 }; 329 330 #define MEDIA_EVENT_POOL_SIZE 64 331 332 struct spdk_bdev_desc { 333 struct spdk_bdev *bdev; 334 struct spdk_thread *thread; 335 struct { 336 spdk_bdev_event_cb_t event_fn; 337 void *ctx; 338 } callback; 339 bool closed; 340 bool write; 341 bool memory_domains_supported; 342 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 343 struct spdk_spinlock spinlock; 344 uint32_t refs; 345 TAILQ_HEAD(, media_event_entry) pending_media_events; 346 TAILQ_HEAD(, media_event_entry) free_media_events; 347 struct media_event_entry *media_events_buffer; 348 TAILQ_ENTRY(spdk_bdev_desc) link; 349 350 uint64_t timeout_in_sec; 351 spdk_bdev_io_timeout_cb cb_fn; 352 void *cb_arg; 353 struct spdk_poller *io_timeout_poller; 354 struct spdk_bdev_module_claim *claim; 355 }; 356 357 struct spdk_bdev_iostat_ctx { 358 struct spdk_bdev_io_stat *stat; 359 spdk_bdev_get_device_stat_cb cb; 360 void *cb_arg; 361 }; 362 363 struct set_qos_limit_ctx { 364 void (*cb_fn)(void *cb_arg, int status); 365 void *cb_arg; 366 struct spdk_bdev *bdev; 367 }; 368 369 struct spdk_bdev_channel_iter { 370 spdk_bdev_for_each_channel_msg fn; 371 spdk_bdev_for_each_channel_done cpl; 372 struct spdk_io_channel_iter *i; 373 void *ctx; 374 }; 375 376 struct spdk_bdev_io_error_stat { 377 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 378 }; 379 380 enum bdev_io_retry_state { 381 BDEV_IO_RETRY_STATE_INVALID, 382 BDEV_IO_RETRY_STATE_PULL, 383 BDEV_IO_RETRY_STATE_PULL_MD, 384 BDEV_IO_RETRY_STATE_SUBMIT, 385 BDEV_IO_RETRY_STATE_PUSH, 386 BDEV_IO_RETRY_STATE_PUSH_MD, 387 }; 388 389 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 390 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 391 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 392 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 393 394 static inline void bdev_io_complete(void *ctx); 395 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 396 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io); 397 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io); 398 399 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 400 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io); 401 402 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 403 struct spdk_io_channel *ch, void 
*_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
		uint64_t num_blocks,
		struct spdk_memory_domain *domain, void *domain_ctx,
		struct spdk_accel_sequence *seq,
		spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		struct iovec *iov, int iovcnt, void *md_buf,
		uint64_t offset_blocks, uint64_t num_blocks,
		struct spdk_memory_domain *domain, void *domain_ctx,
		struct spdk_accel_sequence *seq,
		spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		uint64_t offset, uint64_t length,
		lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		uint64_t offset, uint64_t length,
		lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

#define bdev_get_ext_io_opt(opts, field, defval) \
	(((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \
	  sizeof((opts)->field) <= (opts)->size) ? (opts)->field : (defval))

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	/* Do not remove this statement. Always update it when adding a new field,
	 * and remember to add a matching SET_FIELD() call for the new field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
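	 * For example, with the default bdev_io_cache_size of 256 and four SPDK threads, the pool
	 * must be able to hold at least 256 * (4 + 1) = 1280 bdev_io.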
492 */ 493 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 494 if (opts->bdev_io_pool_size < min_pool_size) { 495 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 496 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 497 spdk_thread_get_count()); 498 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 499 return -1; 500 } 501 502 #define SET_FIELD(field) \ 503 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 504 g_bdev_opts.field = opts->field; \ 505 } \ 506 507 SET_FIELD(bdev_io_pool_size); 508 SET_FIELD(bdev_io_cache_size); 509 SET_FIELD(bdev_auto_examine); 510 SET_FIELD(iobuf_small_cache_size); 511 SET_FIELD(iobuf_large_cache_size); 512 513 g_bdev_opts.opts_size = opts->opts_size; 514 515 #undef SET_FIELD 516 517 return 0; 518 } 519 520 static struct spdk_bdev * 521 bdev_get_by_name(const char *bdev_name) 522 { 523 struct spdk_bdev_name find; 524 struct spdk_bdev_name *res; 525 526 find.name = (char *)bdev_name; 527 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 528 if (res != NULL) { 529 return res->bdev; 530 } 531 532 return NULL; 533 } 534 535 struct spdk_bdev * 536 spdk_bdev_get_by_name(const char *bdev_name) 537 { 538 struct spdk_bdev *bdev; 539 540 spdk_spin_lock(&g_bdev_mgr.spinlock); 541 bdev = bdev_get_by_name(bdev_name); 542 spdk_spin_unlock(&g_bdev_mgr.spinlock); 543 544 return bdev; 545 } 546 547 struct bdev_io_status_string { 548 enum spdk_bdev_io_status status; 549 const char *str; 550 }; 551 552 static const struct bdev_io_status_string bdev_io_status_strings[] = { 553 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 554 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 555 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 556 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 557 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 558 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 559 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 560 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 561 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 562 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 563 }; 564 565 static const char * 566 bdev_io_status_get_string(enum spdk_bdev_io_status status) 567 { 568 uint32_t i; 569 570 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 571 if (bdev_io_status_strings[i].status == status) { 572 return bdev_io_status_strings[i].str; 573 } 574 } 575 576 return "reserved"; 577 } 578 579 struct spdk_bdev_wait_for_examine_ctx { 580 struct spdk_poller *poller; 581 spdk_bdev_wait_for_examine_cb cb_fn; 582 void *cb_arg; 583 }; 584 585 static bool bdev_module_all_actions_completed(void); 586 587 static int 588 bdev_wait_for_examine_cb(void *arg) 589 { 590 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 591 592 if (!bdev_module_all_actions_completed()) { 593 return SPDK_POLLER_IDLE; 594 } 595 596 spdk_poller_unregister(&ctx->poller); 597 ctx->cb_fn(ctx->cb_arg); 598 free(ctx); 599 600 return SPDK_POLLER_BUSY; 601 } 602 603 int 604 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 605 { 606 struct spdk_bdev_wait_for_examine_ctx *ctx; 607 608 ctx = calloc(1, sizeof(*ctx)); 609 if (ctx == NULL) { 610 return -ENOMEM; 611 } 612 ctx->cb_fn = cb_fn; 613 ctx->cb_arg = cb_arg; 614 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 615 616 return 0; 617 } 618 619 struct spdk_bdev_examine_item { 620 char *name; 621 
TAILQ_ENTRY(spdk_bdev_examine_item) link; 622 }; 623 624 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 625 626 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 627 g_bdev_examine_allowlist); 628 629 static inline bool 630 bdev_examine_allowlist_check(const char *name) 631 { 632 struct spdk_bdev_examine_item *item; 633 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 634 if (strcmp(name, item->name) == 0) { 635 return true; 636 } 637 } 638 return false; 639 } 640 641 static inline void 642 bdev_examine_allowlist_free(void) 643 { 644 struct spdk_bdev_examine_item *item; 645 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 646 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 647 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 648 free(item->name); 649 free(item); 650 } 651 } 652 653 static inline bool 654 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 655 { 656 struct spdk_bdev_alias *tmp; 657 if (bdev_examine_allowlist_check(bdev->name)) { 658 return true; 659 } 660 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 661 if (bdev_examine_allowlist_check(tmp->alias.name)) { 662 return true; 663 } 664 } 665 return false; 666 } 667 668 static inline bool 669 bdev_ok_to_examine(struct spdk_bdev *bdev) 670 { 671 if (g_bdev_opts.bdev_auto_examine) { 672 return true; 673 } else { 674 return bdev_in_examine_allowlist(bdev); 675 } 676 } 677 678 static void 679 bdev_examine(struct spdk_bdev *bdev) 680 { 681 struct spdk_bdev_module *module; 682 struct spdk_bdev_module_claim *claim, *tmpclaim; 683 uint32_t action; 684 685 if (!bdev_ok_to_examine(bdev)) { 686 return; 687 } 688 689 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 690 if (module->examine_config) { 691 spdk_spin_lock(&module->internal.spinlock); 692 action = module->internal.action_in_progress; 693 module->internal.action_in_progress++; 694 spdk_spin_unlock(&module->internal.spinlock); 695 module->examine_config(bdev); 696 if (action != module->internal.action_in_progress) { 697 SPDK_ERRLOG("examine_config for module %s did not call " 698 "spdk_bdev_module_examine_done()\n", module->name); 699 } 700 } 701 } 702 703 spdk_spin_lock(&bdev->internal.spinlock); 704 705 switch (bdev->internal.claim_type) { 706 case SPDK_BDEV_CLAIM_NONE: 707 /* Examine by all bdev modules */ 708 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 709 if (module->examine_disk) { 710 spdk_spin_lock(&module->internal.spinlock); 711 module->internal.action_in_progress++; 712 spdk_spin_unlock(&module->internal.spinlock); 713 spdk_spin_unlock(&bdev->internal.spinlock); 714 module->examine_disk(bdev); 715 spdk_spin_lock(&bdev->internal.spinlock); 716 } 717 } 718 break; 719 case SPDK_BDEV_CLAIM_EXCL_WRITE: 720 /* Examine by the one bdev module with a v1 claim */ 721 module = bdev->internal.claim.v1.module; 722 if (module->examine_disk) { 723 spdk_spin_lock(&module->internal.spinlock); 724 module->internal.action_in_progress++; 725 spdk_spin_unlock(&module->internal.spinlock); 726 spdk_spin_unlock(&bdev->internal.spinlock); 727 module->examine_disk(bdev); 728 return; 729 } 730 break; 731 default: 732 /* Examine by all bdev modules with a v2 claim */ 733 assert(claim_type_is_v2(bdev->internal.claim_type)); 734 /* 735 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 736 * list, perhaps accessing freed memory. Without protection, this could happen 737 * while the lock is dropped during the examine callback. 
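	 * The examine_in_progress counter below provides that protection: claims released while it
	 * is non-zero stay on the list (with desc set to NULL) and are only removed and freed once
	 * the iteration has finished.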
738 */ 739 bdev->internal.examine_in_progress++; 740 741 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 742 module = claim->module; 743 744 if (module == NULL) { 745 /* This is a vestigial claim, held by examine_count */ 746 continue; 747 } 748 749 if (module->examine_disk == NULL) { 750 continue; 751 } 752 753 spdk_spin_lock(&module->internal.spinlock); 754 module->internal.action_in_progress++; 755 spdk_spin_unlock(&module->internal.spinlock); 756 757 /* Call examine_disk without holding internal.spinlock. */ 758 spdk_spin_unlock(&bdev->internal.spinlock); 759 module->examine_disk(bdev); 760 spdk_spin_lock(&bdev->internal.spinlock); 761 } 762 763 assert(bdev->internal.examine_in_progress > 0); 764 bdev->internal.examine_in_progress--; 765 if (bdev->internal.examine_in_progress == 0) { 766 /* Remove any claims that were released during examine_disk */ 767 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 768 if (claim->desc != NULL) { 769 continue; 770 } 771 772 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 773 free(claim); 774 } 775 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 776 claim_reset(bdev); 777 } 778 } 779 } 780 781 spdk_spin_unlock(&bdev->internal.spinlock); 782 } 783 784 int 785 spdk_bdev_examine(const char *name) 786 { 787 struct spdk_bdev *bdev; 788 struct spdk_bdev_examine_item *item; 789 struct spdk_thread *thread = spdk_get_thread(); 790 791 if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) { 792 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 793 thread ? spdk_thread_get_name(thread) : "null"); 794 return -EINVAL; 795 } 796 797 if (g_bdev_opts.bdev_auto_examine) { 798 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 799 return -EINVAL; 800 } 801 802 if (bdev_examine_allowlist_check(name)) { 803 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 804 return -EEXIST; 805 } 806 807 item = calloc(1, sizeof(*item)); 808 if (!item) { 809 return -ENOMEM; 810 } 811 item->name = strdup(name); 812 if (!item->name) { 813 free(item); 814 return -ENOMEM; 815 } 816 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 817 818 bdev = spdk_bdev_get_by_name(name); 819 if (bdev) { 820 bdev_examine(bdev); 821 } 822 return 0; 823 } 824 825 static inline void 826 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 827 { 828 struct spdk_bdev_examine_item *item; 829 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 830 spdk_json_write_object_begin(w); 831 spdk_json_write_named_string(w, "method", "bdev_examine"); 832 spdk_json_write_named_object_begin(w, "params"); 833 spdk_json_write_named_string(w, "name", item->name); 834 spdk_json_write_object_end(w); 835 spdk_json_write_object_end(w); 836 } 837 } 838 839 struct spdk_bdev * 840 spdk_bdev_first(void) 841 { 842 struct spdk_bdev *bdev; 843 844 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 845 if (bdev) { 846 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 847 } 848 849 return bdev; 850 } 851 852 struct spdk_bdev * 853 spdk_bdev_next(struct spdk_bdev *prev) 854 { 855 struct spdk_bdev *bdev; 856 857 bdev = TAILQ_NEXT(prev, internal.link); 858 if (bdev) { 859 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 860 } 861 862 return bdev; 863 } 864 865 static struct spdk_bdev * 866 _bdev_next_leaf(struct spdk_bdev *bdev) 867 { 868 while (bdev != NULL) { 869 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 870 return bdev; 871 } else { 872 bdev = 
TAILQ_NEXT(bdev, internal.link); 873 } 874 } 875 876 return bdev; 877 } 878 879 struct spdk_bdev * 880 spdk_bdev_first_leaf(void) 881 { 882 struct spdk_bdev *bdev; 883 884 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 885 886 if (bdev) { 887 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 888 } 889 890 return bdev; 891 } 892 893 struct spdk_bdev * 894 spdk_bdev_next_leaf(struct spdk_bdev *prev) 895 { 896 struct spdk_bdev *bdev; 897 898 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 899 900 if (bdev) { 901 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 902 } 903 904 return bdev; 905 } 906 907 static inline bool 908 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 909 { 910 return bdev_io->internal.memory_domain; 911 } 912 913 static inline bool 914 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 915 { 916 return bdev_io->internal.has_accel_sequence; 917 } 918 919 static inline void 920 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource, 921 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 922 { 923 /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io. 924 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth 925 * channels we will instead wait for half to complete. 926 */ 927 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 928 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 929 930 assert(state != BDEV_IO_RETRY_STATE_INVALID); 931 bdev_io->internal.retry_state = state; 932 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 933 } 934 935 static inline void 936 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource, 937 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 938 { 939 /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while 940 * the queue isn't empty, so we don't need to update the nomem_threshold here */ 941 assert(!TAILQ_EMPTY(&shared_resource->nomem_io)); 942 943 assert(state != BDEV_IO_RETRY_STATE_INVALID); 944 bdev_io->internal.retry_state = state; 945 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 946 } 947 948 void 949 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 950 { 951 struct iovec *iovs; 952 953 if (bdev_io->u.bdev.iovs == NULL) { 954 bdev_io->u.bdev.iovs = &bdev_io->iov; 955 bdev_io->u.bdev.iovcnt = 1; 956 } 957 958 iovs = bdev_io->u.bdev.iovs; 959 960 assert(iovs != NULL); 961 assert(bdev_io->u.bdev.iovcnt >= 1); 962 963 iovs[0].iov_base = buf; 964 iovs[0].iov_len = len; 965 } 966 967 void 968 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 969 { 970 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 971 bdev_io->u.bdev.md_buf = md_buf; 972 } 973 974 static bool 975 _is_buf_allocated(const struct iovec *iovs) 976 { 977 if (iovs == NULL) { 978 return false; 979 } 980 981 return iovs[0].iov_base != NULL; 982 } 983 984 static bool 985 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 986 { 987 int i; 988 uintptr_t iov_base; 989 990 if (spdk_likely(alignment == 1)) { 991 return true; 992 } 993 994 for (i = 0; i < iovcnt; i++) { 995 iov_base = (uintptr_t)iovs[i].iov_base; 996 if ((iov_base & (alignment - 1)) != 0) { 997 return false; 998 } 999 } 1000 1001 return true; 1002 } 1003 1004 static inline bool 
1005 bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 1006 { 1007 if (!bdev_io->internal.accel_sequence) { 1008 return false; 1009 } 1010 1011 /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if 1012 * bdev module didn't support accel sequences */ 1013 return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split; 1014 } 1015 1016 static inline void 1017 bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch, 1018 struct spdk_bdev_shared_resource *shared_resource) 1019 { 1020 bdev_ch->io_outstanding++; 1021 shared_resource->io_outstanding++; 1022 } 1023 1024 static inline void 1025 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1026 struct spdk_bdev_shared_resource *shared_resource) 1027 { 1028 assert(bdev_ch->io_outstanding > 0); 1029 assert(shared_resource->io_outstanding > 0); 1030 bdev_ch->io_outstanding--; 1031 shared_resource->io_outstanding--; 1032 } 1033 1034 static void 1035 bdev_io_submit_sequence_cb(void *ctx, int status) 1036 { 1037 struct spdk_bdev_io *bdev_io = ctx; 1038 1039 bdev_io->u.bdev.accel_sequence = NULL; 1040 bdev_io->internal.accel_sequence = NULL; 1041 1042 if (spdk_unlikely(status != 0)) { 1043 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 1044 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1045 bdev_io_complete_unsubmitted(bdev_io); 1046 return; 1047 } 1048 1049 bdev_io_submit(bdev_io); 1050 } 1051 1052 static void 1053 bdev_io_exec_sequence_cb(void *ctx, int status) 1054 { 1055 struct spdk_bdev_io *bdev_io = ctx; 1056 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1057 1058 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1059 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1060 1061 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1062 bdev_ch_retry_io(ch); 1063 } 1064 1065 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1066 } 1067 1068 static void 1069 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status)) 1070 { 1071 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1072 1073 assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1074 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1075 1076 /* Since the operations are appended during submission, they're in the opposite order than 1077 * how we want to execute them for reads (i.e. we need to execute the most recently added 1078 * operation first), so reverse the sequence before executing it. 
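	 * spdk_accel_sequence_reverse() reorders the sequence in place; the sequence is then
	 * finished below and its status is reported through internal.data_transfer_cpl.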
1079 */ 1080 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1081 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1082 } 1083 1084 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1085 bdev_io_increment_outstanding(ch, ch->shared_resource); 1086 bdev_io->internal.data_transfer_cpl = cb_fn; 1087 1088 spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, 1089 bdev_io_exec_sequence_cb, bdev_io); 1090 } 1091 1092 static void 1093 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1094 { 1095 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1096 void *buf; 1097 1098 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1099 buf = bdev_io->internal.buf; 1100 bdev_io->internal.buf = NULL; 1101 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1102 bdev_io->internal.get_aux_buf_cb = NULL; 1103 } else { 1104 assert(bdev_io->internal.get_buf_cb != NULL); 1105 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1106 bdev_io->internal.get_buf_cb = NULL; 1107 } 1108 } 1109 1110 static void 1111 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1112 { 1113 struct spdk_bdev_io *bdev_io = ctx; 1114 1115 if (rc) { 1116 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1117 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1118 } 1119 bdev_io_get_buf_complete(bdev_io, !rc); 1120 } 1121 1122 static void 1123 bdev_io_pull_md_buf_done(void *ctx, int status) 1124 { 1125 struct spdk_bdev_io *bdev_io = ctx; 1126 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1127 1128 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1129 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1130 1131 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1132 bdev_ch_retry_io(ch); 1133 } 1134 1135 assert(bdev_io->internal.data_transfer_cpl); 1136 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1137 } 1138 1139 static void 1140 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io) 1141 { 1142 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1143 int rc = 0; 1144 1145 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1146 if (bdev_io_use_memory_domain(bdev_io)) { 1147 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1148 bdev_io_increment_outstanding(ch, ch->shared_resource); 1149 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1150 bdev_io->internal.memory_domain_ctx, 1151 &bdev_io->internal.orig_md_iov, 1, 1152 &bdev_io->internal.bounce_md_iov, 1, 1153 bdev_io_pull_md_buf_done, bdev_io); 1154 if (rc == 0) { 1155 /* Continue to submit IO in completion callback */ 1156 return; 1157 } 1158 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1159 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1160 if (rc != -ENOMEM) { 1161 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1162 spdk_memory_domain_get_dma_device_id( 1163 bdev_io->internal.memory_domain), rc); 1164 } 1165 } else { 1166 memcpy(bdev_io->internal.bounce_md_iov.iov_base, 1167 bdev_io->internal.orig_md_iov.iov_base, 1168 bdev_io->internal.orig_md_iov.iov_len); 1169 } 1170 } 1171 1172 if (spdk_unlikely(rc == -ENOMEM)) { 1173 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD); 1174 } else { 1175 assert(bdev_io->internal.data_transfer_cpl); 1176 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1177 } 1178 } 1179 1180 static void 1181 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1182 { 1183 /* save 
original md_buf */ 1184 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1185 bdev_io->internal.orig_md_iov.iov_len = len; 1186 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 1187 bdev_io->internal.bounce_md_iov.iov_len = len; 1188 /* set bounce md_buf */ 1189 bdev_io->u.bdev.md_buf = md_buf; 1190 1191 bdev_io_pull_md_buf(bdev_io); 1192 } 1193 1194 static void 1195 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1196 { 1197 struct spdk_bdev *bdev = bdev_io->bdev; 1198 uint64_t md_len; 1199 void *buf; 1200 1201 if (spdk_bdev_is_md_separate(bdev)) { 1202 assert(!bdev_io_use_accel_sequence(bdev_io)); 1203 1204 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1205 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1206 1207 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1208 1209 if (bdev_io->u.bdev.md_buf != NULL) { 1210 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1211 return; 1212 } else { 1213 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1214 } 1215 } 1216 1217 bdev_io_get_buf_complete(bdev_io, true); 1218 } 1219 1220 static inline void 1221 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc) 1222 { 1223 if (rc) { 1224 SPDK_ERRLOG("Failed to get data buffer\n"); 1225 assert(bdev_io->internal.data_transfer_cpl); 1226 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1227 return; 1228 } 1229 1230 _bdev_io_set_md_buf(bdev_io); 1231 } 1232 1233 static void 1234 bdev_io_pull_data_done_and_track(void *ctx, int status) 1235 { 1236 struct spdk_bdev_io *bdev_io = ctx; 1237 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1238 1239 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1240 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1241 1242 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1243 bdev_ch_retry_io(ch); 1244 } 1245 1246 bdev_io_pull_data_done(bdev_io, status); 1247 } 1248 1249 static void 1250 bdev_io_pull_data(struct spdk_bdev_io *bdev_io) 1251 { 1252 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1253 int rc = 0; 1254 1255 /* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a 1256 * sequence, append a copy operation making accel change the src/dst buffers of the previous 1257 * operation */ 1258 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) || 1259 (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) { 1260 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1261 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1262 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1263 NULL, NULL, 1264 bdev_io->internal.orig_iovs, 1265 bdev_io->internal.orig_iovcnt, 1266 bdev_io->internal.memory_domain, 1267 bdev_io->internal.memory_domain_ctx, 1268 0, NULL, NULL); 1269 } else { 1270 /* We need to reverse the src/dst for reads */ 1271 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1272 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1273 bdev_io->internal.orig_iovs, 1274 bdev_io->internal.orig_iovcnt, 1275 bdev_io->internal.memory_domain, 1276 bdev_io->internal.memory_domain_ctx, 1277 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1278 NULL, NULL, 0, NULL, NULL); 1279 } 1280 1281 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 1282 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1283 bdev_io->internal.accel_sequence); 1284 } 1285 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1286 /* if this is write path, 
copy data from original buffer to bounce buffer */ 1287 if (bdev_io_use_memory_domain(bdev_io)) { 1288 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1289 bdev_io_increment_outstanding(ch, ch->shared_resource); 1290 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1291 bdev_io->internal.memory_domain_ctx, 1292 bdev_io->internal.orig_iovs, 1293 (uint32_t) bdev_io->internal.orig_iovcnt, 1294 bdev_io->u.bdev.iovs, 1, 1295 bdev_io_pull_data_done_and_track, 1296 bdev_io); 1297 if (rc == 0) { 1298 /* Continue to submit IO in completion callback */ 1299 return; 1300 } 1301 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1302 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1303 if (rc != -ENOMEM) { 1304 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1305 spdk_memory_domain_get_dma_device_id( 1306 bdev_io->internal.memory_domain)); 1307 } 1308 } else { 1309 assert(bdev_io->u.bdev.iovcnt == 1); 1310 spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base, 1311 bdev_io->u.bdev.iovs[0].iov_len, 1312 bdev_io->internal.orig_iovs, 1313 bdev_io->internal.orig_iovcnt); 1314 } 1315 } 1316 1317 if (spdk_unlikely(rc == -ENOMEM)) { 1318 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1319 } else { 1320 bdev_io_pull_data_done(bdev_io, rc); 1321 } 1322 } 1323 1324 static void 1325 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1326 bdev_copy_bounce_buffer_cpl cpl_cb) 1327 { 1328 struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource; 1329 1330 bdev_io->internal.data_transfer_cpl = cpl_cb; 1331 /* save original iovec */ 1332 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 1333 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1334 /* set bounce iov */ 1335 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 1336 bdev_io->u.bdev.iovcnt = 1; 1337 /* set bounce buffer for this operation */ 1338 bdev_io->u.bdev.iovs[0].iov_base = buf; 1339 bdev_io->u.bdev.iovs[0].iov_len = len; 1340 1341 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1342 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1343 } else { 1344 bdev_io_pull_data(bdev_io); 1345 } 1346 } 1347 1348 static void 1349 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1350 { 1351 struct spdk_bdev *bdev = bdev_io->bdev; 1352 bool buf_allocated; 1353 uint64_t alignment; 1354 void *aligned_buf; 1355 1356 bdev_io->internal.buf = buf; 1357 1358 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1359 bdev_io_get_buf_complete(bdev_io, true); 1360 return; 1361 } 1362 1363 alignment = spdk_bdev_get_buf_align(bdev); 1364 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1365 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1366 1367 if (buf_allocated) { 1368 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1369 /* Continue in completion callback */ 1370 return; 1371 } else { 1372 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1373 } 1374 1375 _bdev_io_set_md_buf(bdev_io); 1376 } 1377 1378 static inline uint64_t 1379 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1380 { 1381 struct spdk_bdev *bdev = bdev_io->bdev; 1382 uint64_t md_len, alignment; 1383 1384 md_len = spdk_bdev_is_md_separate(bdev) ? 
bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1385 1386 /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */ 1387 alignment = spdk_bdev_get_buf_align(bdev) - 1; 1388 1389 return len + alignment + md_len; 1390 } 1391 1392 static void 1393 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1394 { 1395 struct spdk_bdev_mgmt_channel *ch; 1396 1397 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1398 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1399 } 1400 1401 static void 1402 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1403 { 1404 assert(bdev_io->internal.buf != NULL); 1405 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1406 bdev_io->internal.buf = NULL; 1407 } 1408 1409 void 1410 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1411 { 1412 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1413 1414 assert(buf != NULL); 1415 _bdev_io_put_buf(bdev_io, buf, len); 1416 } 1417 1418 static inline void 1419 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, 1420 struct spdk_bdev_io *bdev_io) 1421 { 1422 /* After a request is submitted to a bdev module, the ownership of an accel sequence 1423 * associated with that bdev_io is transferred to the bdev module. So, clear the internal 1424 * sequence pointer to make sure we won't touch it anymore. */ 1425 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || 1426 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { 1427 assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1428 bdev_io->internal.accel_sequence = NULL; 1429 } 1430 1431 bdev->fn_table->submit_request(ioch, bdev_io); 1432 } 1433 1434 static inline void 1435 bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io) 1436 { 1437 struct spdk_bdev *bdev = bdev_io->bdev; 1438 1439 bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource); 1440 bdev_io->internal.error.nvme.cdw0 = 0; 1441 bdev_io->num_retries++; 1442 bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1443 } 1444 1445 static void 1446 bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource) 1447 { 1448 struct spdk_bdev_io *bdev_io; 1449 1450 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1451 /* 1452 * Allow some more I/O to complete before retrying the nomem_io queue. 1453 * Some drivers (such as nvme) cannot immediately take a new I/O in 1454 * the context of a completion, because the resources for the I/O are 1455 * not released until control returns to the bdev poller. Also, we 1456 * may require several small I/O to complete before a larger I/O 1457 * (that requires splitting) can be submitted. 
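		 * The threshold itself is set in bdev_queue_nomem_io_head(): for low queue depths we
		 * wait for half of the outstanding I/O to complete, otherwise for
		 * NOMEM_THRESHOLD_COUNT of them.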
1458 */ 1459 return; 1460 } 1461 1462 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1463 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1464 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1465 1466 switch (bdev_io->internal.retry_state) { 1467 case BDEV_IO_RETRY_STATE_SUBMIT: 1468 bdev_ch_resubmit_io(shared_resource, bdev_io); 1469 break; 1470 case BDEV_IO_RETRY_STATE_PULL: 1471 bdev_io_pull_data(bdev_io); 1472 break; 1473 case BDEV_IO_RETRY_STATE_PULL_MD: 1474 bdev_io_pull_md_buf(bdev_io); 1475 break; 1476 case BDEV_IO_RETRY_STATE_PUSH: 1477 bdev_io_push_bounce_data(bdev_io); 1478 break; 1479 case BDEV_IO_RETRY_STATE_PUSH_MD: 1480 bdev_io_push_bounce_md_buf(bdev_io); 1481 break; 1482 default: 1483 assert(0 && "invalid retry state"); 1484 break; 1485 } 1486 1487 if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) { 1488 /* This IO completed again with NOMEM status, so break the loop and 1489 * don't try anymore. Note that a bdev_io that fails with NOMEM 1490 * always gets requeued at the front of the list, to maintain 1491 * ordering. 1492 */ 1493 break; 1494 } 1495 } 1496 } 1497 1498 static void 1499 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1500 { 1501 bdev_shared_ch_retry_io(bdev_ch->shared_resource); 1502 } 1503 1504 static int 1505 bdev_no_mem_poller(void *ctx) 1506 { 1507 struct spdk_bdev_shared_resource *shared_resource = ctx; 1508 1509 spdk_poller_unregister(&shared_resource->nomem_poller); 1510 1511 if (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1512 bdev_shared_ch_retry_io(shared_resource); 1513 } 1514 if (!TAILQ_EMPTY(&shared_resource->nomem_io) && shared_resource->io_outstanding == 0) { 1515 /* No IOs were submitted, try again */ 1516 shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource, 1517 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10); 1518 } 1519 1520 return SPDK_POLLER_BUSY; 1521 } 1522 1523 static inline bool 1524 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 1525 { 1526 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1527 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1528 1529 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1530 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1531 bdev_queue_nomem_io_head(shared_resource, bdev_io, state); 1532 1533 if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) { 1534 /* Special case when we have nomem IOs and no outstanding IOs which completions 1535 * could trigger retry of queued IOs 1536 * Any IOs submitted may trigger retry of queued IOs. This poller handles a case when no 1537 * new IOs submitted, e.g. qd==1 */ 1538 shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource, 1539 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10); 1540 } 1541 /* If bdev module completed an I/O that has an accel sequence with NOMEM status, the 1542 * ownership of that sequence is transferred back to the bdev layer, so we need to 1543 * restore internal.accel_sequence to make sure that the sequence is handled 1544 * correctly in case the I/O is later aborted. 
*/ 1545 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1546 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1547 assert(bdev_io->internal.accel_sequence == NULL); 1548 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1549 } 1550 1551 return true; 1552 } 1553 1554 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1555 bdev_ch_retry_io(bdev_ch); 1556 } 1557 1558 return false; 1559 } 1560 1561 static void 1562 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1563 { 1564 struct spdk_bdev_io *bdev_io = ctx; 1565 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1566 1567 if (rc) { 1568 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1569 } 1570 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1571 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 1572 */ 1573 bdev_io_put_buf(bdev_io); 1574 1575 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1576 bdev_ch_retry_io(ch); 1577 } 1578 1579 /* Continue with IO completion flow */ 1580 bdev_io_complete(bdev_io); 1581 } 1582 1583 static void 1584 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1585 { 1586 struct spdk_bdev_io *bdev_io = ctx; 1587 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1588 1589 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1590 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1591 1592 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1593 bdev_ch_retry_io(ch); 1594 } 1595 1596 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1597 } 1598 1599 static inline void 1600 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1601 { 1602 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1603 int rc = 0; 1604 1605 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1606 /* do the same for metadata buffer */ 1607 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1608 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1609 1610 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1611 if (bdev_io_use_memory_domain(bdev_io)) { 1612 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1613 bdev_io_increment_outstanding(ch, ch->shared_resource); 1614 /* If memory domain is used then we need to call async push function */ 1615 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1616 bdev_io->internal.memory_domain_ctx, 1617 &bdev_io->internal.orig_md_iov, 1618 (uint32_t)bdev_io->internal.orig_iovcnt, 1619 &bdev_io->internal.bounce_md_iov, 1, 1620 bdev_io_push_bounce_md_buf_done, 1621 bdev_io); 1622 if (rc == 0) { 1623 /* Continue IO completion in async callback */ 1624 return; 1625 } 1626 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1627 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1628 if (rc != -ENOMEM) { 1629 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1630 spdk_memory_domain_get_dma_device_id( 1631 bdev_io->internal.memory_domain)); 1632 } 1633 } else { 1634 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1635 bdev_io->internal.orig_md_iov.iov_len); 1636 } 1637 } 1638 } 1639 1640 if (spdk_unlikely(rc == -ENOMEM)) { 1641 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1642 } else { 1643 assert(bdev_io->internal.data_transfer_cpl); 1644 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1645 } 1646 } 1647 1648 static inline void 1649 bdev_io_push_bounce_data_done(struct 
spdk_bdev_io *bdev_io, int rc) 1650 { 1651 assert(bdev_io->internal.data_transfer_cpl); 1652 if (rc) { 1653 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1654 return; 1655 } 1656 1657 /* set original buffer for this io */ 1658 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1659 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1660 /* disable bouncing buffer for this io */ 1661 bdev_io->internal.orig_iovcnt = 0; 1662 bdev_io->internal.orig_iovs = NULL; 1663 1664 bdev_io_push_bounce_md_buf(bdev_io); 1665 } 1666 1667 static void 1668 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1669 { 1670 struct spdk_bdev_io *bdev_io = ctx; 1671 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1672 1673 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1674 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1675 1676 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1677 bdev_ch_retry_io(ch); 1678 } 1679 1680 bdev_io_push_bounce_data_done(bdev_io, status); 1681 } 1682 1683 static inline void 1684 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1685 { 1686 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1687 int rc = 0; 1688 1689 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1690 assert(!bdev_io_use_accel_sequence(bdev_io)); 1691 1692 /* if this is read path, copy data from bounce buffer to original buffer */ 1693 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1694 if (bdev_io_use_memory_domain(bdev_io)) { 1695 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1696 bdev_io_increment_outstanding(ch, ch->shared_resource); 1697 /* If memory domain is used then we need to call async push function */ 1698 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1699 bdev_io->internal.memory_domain_ctx, 1700 bdev_io->internal.orig_iovs, 1701 (uint32_t)bdev_io->internal.orig_iovcnt, 1702 &bdev_io->internal.bounce_iov, 1, 1703 bdev_io_push_bounce_data_done_and_track, 1704 bdev_io); 1705 if (rc == 0) { 1706 /* Continue IO completion in async callback */ 1707 return; 1708 } 1709 1710 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1711 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1712 if (rc != -ENOMEM) { 1713 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1714 spdk_memory_domain_get_dma_device_id( 1715 bdev_io->internal.memory_domain)); 1716 } 1717 } else { 1718 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1719 bdev_io->internal.orig_iovcnt, 1720 bdev_io->internal.bounce_iov.iov_base, 1721 bdev_io->internal.bounce_iov.iov_len); 1722 } 1723 } 1724 1725 if (spdk_unlikely(rc == -ENOMEM)) { 1726 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1727 } else { 1728 bdev_io_push_bounce_data_done(bdev_io, rc); 1729 } 1730 } 1731 1732 static inline void 1733 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1734 { 1735 bdev_io->internal.data_transfer_cpl = cpl_cb; 1736 bdev_io_push_bounce_data(bdev_io); 1737 } 1738 1739 static void 1740 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1741 { 1742 struct spdk_bdev_io *bdev_io; 1743 1744 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1745 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1746 } 1747 1748 static void 1749 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1750 { 1751 struct spdk_bdev_mgmt_channel *mgmt_ch; 1752 uint64_t max_len; 1753 void *buf; 1754 1755 
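	/* Buffer allocation must happen on the thread that owns this bdev_io; if no buffer is
	 * immediately available, bdev_io_get_iobuf_cb() is later invoked on this same thread. */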
assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1756 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1757 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1758 1759 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1760 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1761 bdev_io_get_buf_complete(bdev_io, false); 1762 return; 1763 } 1764 1765 bdev_io->internal.buf_len = len; 1766 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1767 bdev_io_get_iobuf_cb); 1768 if (buf != NULL) { 1769 _bdev_io_set_buf(bdev_io, buf, len); 1770 } 1771 } 1772 1773 void 1774 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1775 { 1776 struct spdk_bdev *bdev = bdev_io->bdev; 1777 uint64_t alignment; 1778 1779 assert(cb != NULL); 1780 bdev_io->internal.get_buf_cb = cb; 1781 1782 alignment = spdk_bdev_get_buf_align(bdev); 1783 1784 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1785 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1786 /* Buffer already present and aligned */ 1787 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1788 return; 1789 } 1790 1791 bdev_io_get_buf(bdev_io, len); 1792 } 1793 1794 static void 1795 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1796 bool success) 1797 { 1798 if (!success) { 1799 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1800 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1801 bdev_io_complete_unsubmitted(bdev_io); 1802 return; 1803 } 1804 1805 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1806 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1807 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1808 return; 1809 } 1810 /* For reads we'll execute the sequence after the data is read, so, for now, only 1811 * clear out accel_sequence pointer and submit the IO */ 1812 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1813 bdev_io->u.bdev.accel_sequence = NULL; 1814 } 1815 1816 bdev_io_submit(bdev_io); 1817 } 1818 1819 static void 1820 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1821 uint64_t len) 1822 { 1823 assert(cb != NULL); 1824 bdev_io->internal.get_buf_cb = cb; 1825 1826 bdev_io_get_buf(bdev_io, len); 1827 } 1828 1829 void 1830 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1831 { 1832 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1833 1834 assert(cb != NULL); 1835 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1836 bdev_io->internal.get_aux_buf_cb = cb; 1837 bdev_io_get_buf(bdev_io, len); 1838 } 1839 1840 static int 1841 bdev_module_get_max_ctx_size(void) 1842 { 1843 struct spdk_bdev_module *bdev_module; 1844 int max_bdev_module_size = 0; 1845 1846 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1847 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1848 max_bdev_module_size = bdev_module->get_ctx_size(); 1849 } 1850 } 1851 1852 return max_bdev_module_size; 1853 } 1854 1855 static void 1856 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1857 { 1858 if (!bdev->internal.histogram_enabled) { 1859 return; 1860 } 1861 1862 spdk_json_write_object_begin(w); 1863 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 1864 1865 spdk_json_write_named_object_begin(w, "params"); 1866 
spdk_json_write_named_string(w, "name", bdev->name); 1867 1868 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1869 spdk_json_write_object_end(w); 1870 1871 spdk_json_write_object_end(w); 1872 } 1873 1874 static void 1875 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1876 { 1877 int i; 1878 struct spdk_bdev_qos *qos = bdev->internal.qos; 1879 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1880 1881 if (!qos) { 1882 return; 1883 } 1884 1885 spdk_bdev_get_qos_rate_limits(bdev, limits); 1886 1887 spdk_json_write_object_begin(w); 1888 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1889 1890 spdk_json_write_named_object_begin(w, "params"); 1891 spdk_json_write_named_string(w, "name", bdev->name); 1892 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1893 if (limits[i] > 0) { 1894 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1895 } 1896 } 1897 spdk_json_write_object_end(w); 1898 1899 spdk_json_write_object_end(w); 1900 } 1901 1902 void 1903 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1904 { 1905 struct spdk_bdev_module *bdev_module; 1906 struct spdk_bdev *bdev; 1907 1908 assert(w != NULL); 1909 1910 spdk_json_write_array_begin(w); 1911 1912 spdk_json_write_object_begin(w); 1913 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1914 spdk_json_write_named_object_begin(w, "params"); 1915 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1916 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1917 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1918 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 1919 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 1920 spdk_json_write_object_end(w); 1921 spdk_json_write_object_end(w); 1922 1923 bdev_examine_allowlist_config_json(w); 1924 1925 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1926 if (bdev_module->config_json) { 1927 bdev_module->config_json(w); 1928 } 1929 } 1930 1931 spdk_spin_lock(&g_bdev_mgr.spinlock); 1932 1933 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1934 if (bdev->fn_table->write_config_json) { 1935 bdev->fn_table->write_config_json(bdev, w); 1936 } 1937 1938 bdev_qos_config_json(bdev, w); 1939 bdev_enable_histogram_config_json(bdev, w); 1940 } 1941 1942 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1943 1944 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1945 spdk_json_write_object_begin(w); 1946 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1947 spdk_json_write_object_end(w); 1948 1949 spdk_json_write_array_end(w); 1950 } 1951 1952 static void 1953 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1954 { 1955 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1956 struct spdk_bdev_io *bdev_io; 1957 1958 spdk_iobuf_channel_fini(&ch->iobuf); 1959 1960 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1961 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1962 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1963 ch->per_thread_cache_count--; 1964 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1965 } 1966 1967 assert(ch->per_thread_cache_count == 0); 1968 } 1969 1970 static int 1971 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1972 { 1973 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1974 struct 
spdk_bdev_io *bdev_io; 1975 uint32_t i; 1976 int rc; 1977 1978 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 1979 g_bdev_opts.iobuf_small_cache_size, 1980 g_bdev_opts.iobuf_large_cache_size); 1981 if (rc != 0) { 1982 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1983 return -1; 1984 } 1985 1986 STAILQ_INIT(&ch->per_thread_cache); 1987 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1988 1989 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1990 ch->per_thread_cache_count = 0; 1991 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1992 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1993 if (bdev_io == NULL) { 1994 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1995 assert(false); 1996 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1997 return -1; 1998 } 1999 ch->per_thread_cache_count++; 2000 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2001 } 2002 2003 TAILQ_INIT(&ch->shared_resources); 2004 TAILQ_INIT(&ch->io_wait_queue); 2005 2006 return 0; 2007 } 2008 2009 static void 2010 bdev_init_complete(int rc) 2011 { 2012 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2013 void *cb_arg = g_init_cb_arg; 2014 struct spdk_bdev_module *m; 2015 2016 g_bdev_mgr.init_complete = true; 2017 g_init_cb_fn = NULL; 2018 g_init_cb_arg = NULL; 2019 2020 /* 2021 * For modules that need to know when subsystem init is complete, 2022 * inform them now. 2023 */ 2024 if (rc == 0) { 2025 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2026 if (m->init_complete) { 2027 m->init_complete(); 2028 } 2029 } 2030 } 2031 2032 cb_fn(cb_arg, rc); 2033 } 2034 2035 static bool 2036 bdev_module_all_actions_completed(void) 2037 { 2038 struct spdk_bdev_module *m; 2039 2040 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2041 if (m->internal.action_in_progress > 0) { 2042 return false; 2043 } 2044 } 2045 return true; 2046 } 2047 2048 static void 2049 bdev_module_action_complete(void) 2050 { 2051 /* 2052 * Don't finish bdev subsystem initialization if 2053 * module pre-initialization is still in progress, or 2054 * the subsystem been already initialized. 2055 */ 2056 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2057 return; 2058 } 2059 2060 /* 2061 * Check all bdev modules for inits/examinations in progress. If any 2062 * exist, return immediately since we cannot finish bdev subsystem 2063 * initialization until all are completed. 2064 */ 2065 if (!bdev_module_all_actions_completed()) { 2066 return; 2067 } 2068 2069 /* 2070 * Modules already finished initialization - now that all 2071 * the bdev modules have finished their asynchronous I/O 2072 * processing, the entire bdev layer can be marked as complete. 
2073 */ 2074 bdev_init_complete(0); 2075 } 2076 2077 static void 2078 bdev_module_action_done(struct spdk_bdev_module *module) 2079 { 2080 spdk_spin_lock(&module->internal.spinlock); 2081 assert(module->internal.action_in_progress > 0); 2082 module->internal.action_in_progress--; 2083 spdk_spin_unlock(&module->internal.spinlock); 2084 bdev_module_action_complete(); 2085 } 2086 2087 void 2088 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2089 { 2090 assert(module->async_init); 2091 bdev_module_action_done(module); 2092 } 2093 2094 void 2095 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2096 { 2097 bdev_module_action_done(module); 2098 } 2099 2100 /** The last initialized bdev module */ 2101 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2102 2103 static void 2104 bdev_init_failed(void *cb_arg) 2105 { 2106 struct spdk_bdev_module *module = cb_arg; 2107 2108 spdk_spin_lock(&module->internal.spinlock); 2109 assert(module->internal.action_in_progress > 0); 2110 module->internal.action_in_progress--; 2111 spdk_spin_unlock(&module->internal.spinlock); 2112 bdev_init_complete(-1); 2113 } 2114 2115 static int 2116 bdev_modules_init(void) 2117 { 2118 struct spdk_bdev_module *module; 2119 int rc = 0; 2120 2121 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2122 g_resume_bdev_module = module; 2123 if (module->async_init) { 2124 spdk_spin_lock(&module->internal.spinlock); 2125 module->internal.action_in_progress = 1; 2126 spdk_spin_unlock(&module->internal.spinlock); 2127 } 2128 rc = module->module_init(); 2129 if (rc != 0) { 2130 /* Bump action_in_progress to prevent other modules from completion of modules_init 2131 * Send message to defer application shutdown until resources are cleaned up */ 2132 spdk_spin_lock(&module->internal.spinlock); 2133 module->internal.action_in_progress = 1; 2134 spdk_spin_unlock(&module->internal.spinlock); 2135 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2136 return rc; 2137 } 2138 } 2139 2140 g_resume_bdev_module = NULL; 2141 return 0; 2142 } 2143 2144 void 2145 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2146 { 2147 int rc = 0; 2148 char mempool_name[32]; 2149 2150 assert(cb_fn != NULL); 2151 2152 g_init_cb_fn = cb_fn; 2153 g_init_cb_arg = cb_arg; 2154 2155 spdk_notify_type_register("bdev_register"); 2156 spdk_notify_type_register("bdev_unregister"); 2157 2158 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2159 2160 rc = spdk_iobuf_register_module("bdev"); 2161 if (rc != 0) { 2162 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2163 bdev_init_complete(-1); 2164 return; 2165 } 2166 2167 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2168 g_bdev_opts.bdev_io_pool_size, 2169 sizeof(struct spdk_bdev_io) + 2170 bdev_module_get_max_ctx_size(), 2171 0, 2172 SPDK_ENV_SOCKET_ID_ANY); 2173 2174 if (g_bdev_mgr.bdev_io_pool == NULL) { 2175 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2176 bdev_init_complete(-1); 2177 return; 2178 } 2179 2180 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2181 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2182 if (!g_bdev_mgr.zero_buffer) { 2183 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2184 bdev_init_complete(-1); 2185 return; 2186 } 2187 2188 #ifdef SPDK_CONFIG_VTUNE 2189 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2190 #endif 2191 2192 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2193 
bdev_mgmt_channel_destroy, 2194 sizeof(struct spdk_bdev_mgmt_channel), 2195 "bdev_mgr"); 2196 2197 rc = bdev_modules_init(); 2198 g_bdev_mgr.module_init_complete = true; 2199 if (rc != 0) { 2200 SPDK_ERRLOG("bdev modules init failed\n"); 2201 return; 2202 } 2203 2204 bdev_module_action_complete(); 2205 } 2206 2207 static void 2208 bdev_mgr_unregister_cb(void *io_device) 2209 { 2210 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2211 2212 if (g_bdev_mgr.bdev_io_pool) { 2213 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2214 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2215 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2216 g_bdev_opts.bdev_io_pool_size); 2217 } 2218 2219 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2220 } 2221 2222 spdk_free(g_bdev_mgr.zero_buffer); 2223 2224 bdev_examine_allowlist_free(); 2225 2226 cb_fn(g_fini_cb_arg); 2227 g_fini_cb_fn = NULL; 2228 g_fini_cb_arg = NULL; 2229 g_bdev_mgr.init_complete = false; 2230 g_bdev_mgr.module_init_complete = false; 2231 } 2232 2233 static void 2234 bdev_module_fini_iter(void *arg) 2235 { 2236 struct spdk_bdev_module *bdev_module; 2237 2238 /* FIXME: Handling initialization failures is broken now, 2239 * so we won't even try cleaning up after successfully 2240 * initialized modules. if module_init_complete is false, 2241 * just call spdk_bdev_mgr_unregister_cb 2242 */ 2243 if (!g_bdev_mgr.module_init_complete) { 2244 bdev_mgr_unregister_cb(NULL); 2245 return; 2246 } 2247 2248 /* Start iterating from the last touched module */ 2249 if (!g_resume_bdev_module) { 2250 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2251 } else { 2252 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2253 internal.tailq); 2254 } 2255 2256 while (bdev_module) { 2257 if (bdev_module->async_fini) { 2258 /* Save our place so we can resume later. We must 2259 * save the variable here, before calling module_fini() 2260 * below, because in some cases the module may immediately 2261 * call spdk_bdev_module_fini_done() and re-enter 2262 * this function to continue iterating. */ 2263 g_resume_bdev_module = bdev_module; 2264 } 2265 2266 if (bdev_module->module_fini) { 2267 bdev_module->module_fini(); 2268 } 2269 2270 if (bdev_module->async_fini) { 2271 return; 2272 } 2273 2274 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2275 internal.tailq); 2276 } 2277 2278 g_resume_bdev_module = NULL; 2279 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2280 } 2281 2282 void 2283 spdk_bdev_module_fini_done(void) 2284 { 2285 if (spdk_get_thread() != g_fini_thread) { 2286 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2287 } else { 2288 bdev_module_fini_iter(NULL); 2289 } 2290 } 2291 2292 static void 2293 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2294 { 2295 struct spdk_bdev *bdev = cb_arg; 2296 2297 if (bdeverrno && bdev) { 2298 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2299 bdev->name); 2300 2301 /* 2302 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2303 * bdev; try to continue by manually removing this bdev from the list and continue 2304 * with the next bdev in the list. 
2305 */ 2306 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2307 } 2308 2309 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2310 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2311 /* 2312 * Bdev module finish need to be deferred as we might be in the middle of some context 2313 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2314 * after returning. 2315 */ 2316 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2317 return; 2318 } 2319 2320 /* 2321 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2322 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2323 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2324 * base bdevs. 2325 * 2326 * Also, walk the list in the reverse order. 2327 */ 2328 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2329 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2330 spdk_spin_lock(&bdev->internal.spinlock); 2331 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2332 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2333 spdk_spin_unlock(&bdev->internal.spinlock); 2334 continue; 2335 } 2336 spdk_spin_unlock(&bdev->internal.spinlock); 2337 2338 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2339 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2340 return; 2341 } 2342 2343 /* 2344 * If any bdev fails to unclaim underlying bdev properly, we may face the 2345 * case of bdev list consisting of claimed bdevs only (if claims are managed 2346 * correctly, this would mean there's a loop in the claims graph which is 2347 * clearly impossible). Warn and unregister last bdev on the list then. 2348 */ 2349 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2350 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2351 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2352 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2353 return; 2354 } 2355 } 2356 2357 static void 2358 bdev_module_fini_start_iter(void *arg) 2359 { 2360 struct spdk_bdev_module *bdev_module; 2361 2362 if (!g_resume_bdev_module) { 2363 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2364 } else { 2365 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2366 } 2367 2368 while (bdev_module) { 2369 if (bdev_module->async_fini_start) { 2370 /* Save our place so we can resume later. We must 2371 * save the variable here, before calling fini_start() 2372 * below, because in some cases the module may immediately 2373 * call spdk_bdev_module_fini_start_done() and re-enter 2374 * this function to continue iterating. 
*/ 2375 g_resume_bdev_module = bdev_module; 2376 } 2377 2378 if (bdev_module->fini_start) { 2379 bdev_module->fini_start(); 2380 } 2381 2382 if (bdev_module->async_fini_start) { 2383 return; 2384 } 2385 2386 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2387 } 2388 2389 g_resume_bdev_module = NULL; 2390 2391 bdev_finish_unregister_bdevs_iter(NULL, 0); 2392 } 2393 2394 void 2395 spdk_bdev_module_fini_start_done(void) 2396 { 2397 if (spdk_get_thread() != g_fini_thread) { 2398 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2399 } else { 2400 bdev_module_fini_start_iter(NULL); 2401 } 2402 } 2403 2404 static void 2405 bdev_finish_wait_for_examine_done(void *cb_arg) 2406 { 2407 bdev_module_fini_start_iter(NULL); 2408 } 2409 2410 static void bdev_open_async_fini(void); 2411 2412 void 2413 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2414 { 2415 int rc; 2416 2417 assert(cb_fn != NULL); 2418 2419 g_fini_thread = spdk_get_thread(); 2420 2421 g_fini_cb_fn = cb_fn; 2422 g_fini_cb_arg = cb_arg; 2423 2424 bdev_open_async_fini(); 2425 2426 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2427 if (rc != 0) { 2428 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2429 bdev_finish_wait_for_examine_done(NULL); 2430 } 2431 } 2432 2433 struct spdk_bdev_io * 2434 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2435 { 2436 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2437 struct spdk_bdev_io *bdev_io; 2438 2439 if (ch->per_thread_cache_count > 0) { 2440 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2441 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2442 ch->per_thread_cache_count--; 2443 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2444 /* 2445 * Don't try to look for bdev_ios in the global pool if there are 2446 * waiters on bdev_ios - we don't want this caller to jump the line. 2447 */ 2448 bdev_io = NULL; 2449 } else { 2450 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2451 } 2452 2453 return bdev_io; 2454 } 2455 2456 void 2457 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2458 { 2459 struct spdk_bdev_mgmt_channel *ch; 2460 2461 assert(bdev_io != NULL); 2462 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2463 2464 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2465 2466 if (bdev_io->internal.buf != NULL) { 2467 bdev_io_put_buf(bdev_io); 2468 } 2469 2470 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2471 ch->per_thread_cache_count++; 2472 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2473 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2474 struct spdk_bdev_io_wait_entry *entry; 2475 2476 entry = TAILQ_FIRST(&ch->io_wait_queue); 2477 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2478 entry->cb_fn(entry->cb_arg); 2479 } 2480 } else { 2481 /* We should never have a full cache with entries on the io wait queue. 
*/ 2482 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2483 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2484 } 2485 } 2486 2487 static bool 2488 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2489 { 2490 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2491 2492 switch (limit) { 2493 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2494 return true; 2495 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2496 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2497 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2498 return false; 2499 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2500 default: 2501 return false; 2502 } 2503 } 2504 2505 static bool 2506 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2507 { 2508 switch (bdev_io->type) { 2509 case SPDK_BDEV_IO_TYPE_NVME_IO: 2510 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2511 case SPDK_BDEV_IO_TYPE_READ: 2512 case SPDK_BDEV_IO_TYPE_WRITE: 2513 return true; 2514 case SPDK_BDEV_IO_TYPE_ZCOPY: 2515 if (bdev_io->u.bdev.zcopy.start) { 2516 return true; 2517 } else { 2518 return false; 2519 } 2520 default: 2521 return false; 2522 } 2523 } 2524 2525 static bool 2526 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2527 { 2528 switch (bdev_io->type) { 2529 case SPDK_BDEV_IO_TYPE_NVME_IO: 2530 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2531 /* Bit 1 (0x2) set for read operation */ 2532 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2533 return true; 2534 } else { 2535 return false; 2536 } 2537 case SPDK_BDEV_IO_TYPE_READ: 2538 return true; 2539 case SPDK_BDEV_IO_TYPE_ZCOPY: 2540 /* Populate to read from disk */ 2541 if (bdev_io->u.bdev.zcopy.populate) { 2542 return true; 2543 } else { 2544 return false; 2545 } 2546 default: 2547 return false; 2548 } 2549 } 2550 2551 static uint64_t 2552 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2553 { 2554 struct spdk_bdev *bdev = bdev_io->bdev; 2555 2556 switch (bdev_io->type) { 2557 case SPDK_BDEV_IO_TYPE_NVME_IO: 2558 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2559 return bdev_io->u.nvme_passthru.nbytes; 2560 case SPDK_BDEV_IO_TYPE_READ: 2561 case SPDK_BDEV_IO_TYPE_WRITE: 2562 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2563 case SPDK_BDEV_IO_TYPE_ZCOPY: 2564 /* Track the data in the start phase only */ 2565 if (bdev_io->u.bdev.zcopy.start) { 2566 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2567 } else { 2568 return 0; 2569 } 2570 default: 2571 return 0; 2572 } 2573 } 2574 2575 static inline bool 2576 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2577 { 2578 int64_t remaining_this_timeslice; 2579 2580 if (!limit->max_per_timeslice) { 2581 /* The QoS is disabled */ 2582 return false; 2583 } 2584 2585 remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta, 2586 __ATOMIC_RELAXED); 2587 if (remaining_this_timeslice + (int64_t)delta > 0) { 2588 /* There was still a quota for this delta -> the IO shouldn't be queued 2589 * 2590 * We allow a slight quota overrun here so an IO bigger than the per-timeslice 2591 * quota can be allowed once in a while. Such an overrun is then taken into account in 2592 * the QoS poller, where the next timeslice quota is calculated. 2593 */ 2594 return false; 2595 } 2596 2597 /* There was no quota for this delta -> the IO should be queued 2598 * The remaining_this_timeslice must be rewound so it reflects the real 2599 * amount of IOs or bytes allowed.
2600 */ 2601 __atomic_add_fetch( 2602 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2603 return true; 2604 } 2605 2606 static inline void 2607 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2608 { 2609 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2610 } 2611 2612 static bool 2613 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2614 { 2615 return bdev_qos_rw_queue_io(limit, io, 1); 2616 } 2617 2618 static void 2619 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2620 { 2621 bdev_qos_rw_rewind_io(limit, io, 1); 2622 } 2623 2624 static bool 2625 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2626 { 2627 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2628 } 2629 2630 static void 2631 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2632 { 2633 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2634 } 2635 2636 static bool 2637 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2638 { 2639 if (bdev_is_read_io(io) == false) { 2640 return false; 2641 } 2642 2643 return bdev_qos_rw_bps_queue(limit, io); 2644 } 2645 2646 static void 2647 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2648 { 2649 if (bdev_is_read_io(io) != false) { 2650 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2651 } 2652 } 2653 2654 static bool 2655 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2656 { 2657 if (bdev_is_read_io(io) == true) { 2658 return false; 2659 } 2660 2661 return bdev_qos_rw_bps_queue(limit, io); 2662 } 2663 2664 static void 2665 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2666 { 2667 if (bdev_is_read_io(io) != true) { 2668 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2669 } 2670 } 2671 2672 static void 2673 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2674 { 2675 int i; 2676 2677 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2678 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2679 qos->rate_limits[i].queue_io = NULL; 2680 continue; 2681 } 2682 2683 switch (i) { 2684 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2685 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2686 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2687 break; 2688 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2689 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2690 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2691 break; 2692 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2693 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2694 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2695 break; 2696 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2697 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2698 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2699 break; 2700 default: 2701 break; 2702 } 2703 } 2704 } 2705 2706 static void 2707 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2708 struct spdk_bdev_io *bdev_io, 2709 enum spdk_bdev_io_status status) 2710 { 2711 bdev_io->internal.in_submit_request = true; 2712 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2713 spdk_bdev_io_complete(bdev_io, status); 2714 bdev_io->internal.in_submit_request = false; 2715 
} 2716 2717 static inline void 2718 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2719 { 2720 struct spdk_bdev *bdev = bdev_io->bdev; 2721 struct spdk_io_channel *ch = bdev_ch->channel; 2722 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2723 2724 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2725 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2726 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2727 2728 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2729 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2730 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2731 SPDK_BDEV_IO_STATUS_SUCCESS); 2732 return; 2733 } 2734 } 2735 2736 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2737 bdev_io->bdev->split_on_write_unit && 2738 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2739 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2740 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2741 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2742 return; 2743 } 2744 2745 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2746 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2747 bdev_io->internal.in_submit_request = true; 2748 bdev_submit_request(bdev, ch, bdev_io); 2749 bdev_io->internal.in_submit_request = false; 2750 } else { 2751 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2752 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2753 /* Special case when we have nomem IOs and no outstanding IOs which completions 2754 * could trigger retry of queued IOs */ 2755 bdev_shared_ch_retry_io(shared_resource); 2756 } 2757 } 2758 } 2759 2760 static bool 2761 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2762 { 2763 int i; 2764 2765 if (bdev_qos_io_to_limit(bdev_io) == true) { 2766 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2767 if (!qos->rate_limits[i].queue_io) { 2768 continue; 2769 } 2770 2771 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2772 bdev_io) == true) { 2773 for (i -= 1; i >= 0 ; i--) { 2774 if (!qos->rate_limits[i].queue_io) { 2775 continue; 2776 } 2777 2778 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2779 } 2780 return true; 2781 } 2782 } 2783 } 2784 2785 return false; 2786 } 2787 2788 static int 2789 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2790 { 2791 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2792 int submitted_ios = 0; 2793 2794 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2795 if (!bdev_qos_queue_io(qos, bdev_io)) { 2796 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 2797 bdev_io_do_submit(ch, bdev_io); 2798 2799 submitted_ios++; 2800 } 2801 } 2802 2803 return submitted_ios; 2804 } 2805 2806 static void 2807 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2808 { 2809 int rc; 2810 2811 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2812 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2813 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2814 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2815 &bdev_io->internal.waitq_entry); 2816 if (rc != 0) { 2817 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2818 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 2819 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2820 } 2821 } 2822 2823 static bool 2824 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2825 { 2826 uint32_t io_boundary; 2827 struct spdk_bdev *bdev = bdev_io->bdev; 2828 uint32_t max_segment_size = bdev->max_segment_size; 2829 uint32_t max_size = bdev->max_rw_size; 2830 int max_segs = bdev->max_num_segments; 2831 2832 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2833 io_boundary = bdev->write_unit_size; 2834 } else if (bdev->split_on_optimal_io_boundary) { 2835 io_boundary = bdev->optimal_io_boundary; 2836 } else { 2837 io_boundary = 0; 2838 } 2839 2840 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 2841 return false; 2842 } 2843 2844 if (io_boundary) { 2845 uint64_t start_stripe, end_stripe; 2846 2847 start_stripe = bdev_io->u.bdev.offset_blocks; 2848 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2849 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2850 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2851 start_stripe >>= spdk_u32log2(io_boundary); 2852 end_stripe >>= spdk_u32log2(io_boundary); 2853 } else { 2854 start_stripe /= io_boundary; 2855 end_stripe /= io_boundary; 2856 } 2857 2858 if (start_stripe != end_stripe) { 2859 return true; 2860 } 2861 } 2862 2863 if (max_segs) { 2864 if (bdev_io->u.bdev.iovcnt > max_segs) { 2865 return true; 2866 } 2867 } 2868 2869 if (max_segment_size) { 2870 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2871 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 2872 return true; 2873 } 2874 } 2875 } 2876 2877 if (max_size) { 2878 if (bdev_io->u.bdev.num_blocks > max_size) { 2879 return true; 2880 } 2881 } 2882 2883 return false; 2884 } 2885 2886 static bool 2887 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2888 { 2889 uint32_t num_unmap_segments; 2890 2891 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2892 return false; 2893 } 2894 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2895 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2896 return true; 2897 } 2898 2899 return false; 2900 } 2901 2902 static bool 2903 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2904 { 2905 if (!bdev_io->bdev->max_write_zeroes) { 2906 return false; 2907 } 2908 2909 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2910 return true; 2911 } 2912 2913 return false; 2914 } 2915 2916 static bool 2917 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2918 { 2919 if (bdev_io->bdev->max_copy != 0 && 2920 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2921 return true; 2922 } 2923 2924 return false; 2925 } 2926 2927 static bool 2928 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2929 { 2930 switch (bdev_io->type) { 2931 case SPDK_BDEV_IO_TYPE_READ: 2932 case SPDK_BDEV_IO_TYPE_WRITE: 2933 return bdev_rw_should_split(bdev_io); 2934 case SPDK_BDEV_IO_TYPE_UNMAP: 2935 return bdev_unmap_should_split(bdev_io); 2936 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2937 return bdev_write_zeroes_should_split(bdev_io); 2938 case SPDK_BDEV_IO_TYPE_COPY: 2939 return bdev_copy_should_split(bdev_io); 2940 default: 2941 return false; 2942 } 2943 } 2944 2945 static uint32_t 2946 _to_next_boundary(uint64_t offset, uint32_t boundary) 2947 { 2948 return (boundary - (offset % boundary)); 2949 } 2950 2951 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2952 2953 static void _bdev_rw_split(void *_bdev_io); 2954 2955 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2956 2957 static void 2958 _bdev_unmap_split(void *_bdev_io) 2959 { 2960 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2961 } 2962 2963 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2964 2965 static void 2966 _bdev_write_zeroes_split(void *_bdev_io) 2967 { 2968 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2969 } 2970 2971 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2972 2973 static void 2974 _bdev_copy_split(void *_bdev_io) 2975 { 2976 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2977 } 2978 2979 static int 2980 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2981 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2982 { 2983 int rc; 2984 uint64_t current_offset, current_remaining, current_src_offset; 2985 spdk_bdev_io_wait_cb io_wait_fn; 2986 2987 current_offset = *offset; 2988 current_remaining = *remaining; 2989 2990 bdev_io->u.bdev.split_outstanding++; 2991 2992 io_wait_fn = _bdev_rw_split; 2993 switch (bdev_io->type) { 2994 case SPDK_BDEV_IO_TYPE_READ: 2995 assert(bdev_io->u.bdev.accel_sequence == NULL); 2996 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2997 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2998 iov, iovcnt, md_buf, current_offset, 2999 num_blocks, bdev_io->internal.memory_domain, 3000 bdev_io->internal.memory_domain_ctx, NULL, 3001 bdev_io_split_done, bdev_io); 3002 break; 3003 case SPDK_BDEV_IO_TYPE_WRITE: 3004 assert(bdev_io->u.bdev.accel_sequence == NULL); 3005 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3006 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3007 iov, iovcnt, md_buf, current_offset, 3008 num_blocks, bdev_io->internal.memory_domain, 3009 bdev_io->internal.memory_domain_ctx, NULL, 3010 bdev_io_split_done, bdev_io); 3011 break; 3012 case SPDK_BDEV_IO_TYPE_UNMAP: 3013 io_wait_fn = _bdev_unmap_split; 3014 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3015 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3016 current_offset, num_blocks, 3017 bdev_io_split_done, bdev_io); 3018 break; 3019 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3020 io_wait_fn = _bdev_write_zeroes_split; 3021 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3022 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3023 current_offset, num_blocks, 3024 bdev_io_split_done, bdev_io); 3025 break; 3026 case SPDK_BDEV_IO_TYPE_COPY: 3027 io_wait_fn = _bdev_copy_split; 3028 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3029 (current_offset - bdev_io->u.bdev.offset_blocks); 3030 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3031 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3032 current_offset, current_src_offset, num_blocks, 3033 bdev_io_split_done, bdev_io); 3034 break; 3035 default: 3036 assert(false); 3037 rc = -EINVAL; 3038 break; 3039 } 3040 3041 if (rc == 0) { 3042 current_offset += num_blocks; 3043 current_remaining -= num_blocks; 3044 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 3045 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 3046 *offset = current_offset; 3047 *remaining = current_remaining; 3048 } else { 3049 bdev_io->u.bdev.split_outstanding--; 3050 if (rc == -ENOMEM) { 3051 if (bdev_io->u.bdev.split_outstanding == 0) { 3052 /* No I/O is outstanding. 
Hence we should wait here. */ 3053 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3054 } 3055 } else { 3056 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3057 if (bdev_io->u.bdev.split_outstanding == 0) { 3058 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 3059 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 3060 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3061 } 3062 } 3063 } 3064 3065 return rc; 3066 } 3067 3068 static void 3069 _bdev_rw_split(void *_bdev_io) 3070 { 3071 struct iovec *parent_iov, *iov; 3072 struct spdk_bdev_io *bdev_io = _bdev_io; 3073 struct spdk_bdev *bdev = bdev_io->bdev; 3074 uint64_t parent_offset, current_offset, remaining; 3075 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3076 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3077 uint32_t iovcnt, iov_len, child_iovsize; 3078 uint32_t blocklen = bdev->blocklen; 3079 uint32_t io_boundary; 3080 uint32_t max_segment_size = bdev->max_segment_size; 3081 uint32_t max_child_iovcnt = bdev->max_num_segments; 3082 uint32_t max_size = bdev->max_rw_size; 3083 void *md_buf = NULL; 3084 int rc; 3085 3086 max_size = max_size ? max_size : UINT32_MAX; 3087 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3088 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3089 SPDK_BDEV_IO_NUM_CHILD_IOV; 3090 3091 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3092 io_boundary = bdev->write_unit_size; 3093 } else if (bdev->split_on_optimal_io_boundary) { 3094 io_boundary = bdev->optimal_io_boundary; 3095 } else { 3096 io_boundary = UINT32_MAX; 3097 } 3098 3099 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3100 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 3101 parent_offset = bdev_io->u.bdev.offset_blocks; 3102 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3103 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3104 3105 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3106 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3107 if (parent_iov_offset < parent_iov->iov_len) { 3108 break; 3109 } 3110 parent_iov_offset -= parent_iov->iov_len; 3111 } 3112 3113 child_iovcnt = 0; 3114 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3115 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3116 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3117 to_next_boundary = spdk_min(remaining, to_next_boundary); 3118 to_next_boundary = spdk_min(max_size, to_next_boundary); 3119 to_next_boundary_bytes = to_next_boundary * blocklen; 3120 3121 iov = &bdev_io->child_iov[child_iovcnt]; 3122 iovcnt = 0; 3123 3124 if (bdev_io->u.bdev.md_buf) { 3125 md_buf = (char *)bdev_io->u.bdev.md_buf + 3126 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3127 } 3128 3129 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3130 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3131 iovcnt < child_iovsize) { 3132 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3133 iov_len = parent_iov->iov_len - parent_iov_offset; 3134 3135 iov_len = spdk_min(iov_len, max_segment_size); 3136 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3137 to_next_boundary_bytes -= iov_len; 3138 3139 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3140 
bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3141 3142 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3143 parent_iov_offset += iov_len; 3144 } else { 3145 parent_iovpos++; 3146 parent_iov_offset = 0; 3147 } 3148 child_iovcnt++; 3149 iovcnt++; 3150 } 3151 3152 if (to_next_boundary_bytes > 0) { 3153 /* We had to stop this child I/O early because we ran out of 3154 * child_iov space or were limited by max_num_segments. 3155 * Ensure the iovs are aligned with the block size and 3156 * then adjust to_next_boundary before starting the 3157 * child I/O. 3158 */ 3159 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 3160 iovcnt == child_iovsize); 3161 to_last_block_bytes = to_next_boundary_bytes % blocklen; 3162 if (to_last_block_bytes != 0) { 3163 uint32_t child_iovpos = child_iovcnt - 1; 3164 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV, 3165 * so the loop will naturally end 3166 */ 3167 3168 to_last_block_bytes = blocklen - to_last_block_bytes; 3169 to_next_boundary_bytes += to_last_block_bytes; 3170 while (to_last_block_bytes > 0 && iovcnt > 0) { 3171 iov_len = spdk_min(to_last_block_bytes, 3172 bdev_io->child_iov[child_iovpos].iov_len); 3173 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 3174 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 3175 child_iovpos--; 3176 if (--iovcnt == 0) { 3177 /* If the child IO is less than a block size, just return. 3178 * If the first child IO of any split round is less than 3179 * a block size, exit with an error. 3180 */ 3181 if (bdev_io->u.bdev.split_outstanding == 0) { 3182 SPDK_ERRLOG("The first child io was less than a block size\n"); 3183 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3184 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 3185 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 3186 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3187 } 3188 3189 return; 3190 } 3191 } 3192 3193 to_last_block_bytes -= iov_len; 3194 3195 if (parent_iov_offset == 0) { 3196 parent_iovpos--; 3197 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 3198 } 3199 parent_iov_offset -= iov_len; 3200 } 3201 3202 assert(to_last_block_bytes == 0); 3203 } 3204 to_next_boundary -= to_next_boundary_bytes / blocklen; 3205 } 3206 3207 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 3208 &current_offset, &remaining); 3209 if (spdk_unlikely(rc)) { 3210 return; 3211 } 3212 } 3213 } 3214 3215 static void 3216 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 3217 { 3218 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 3219 uint32_t num_children_reqs = 0; 3220 int rc; 3221 3222 offset = bdev_io->u.bdev.split_current_offset_blocks; 3223 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3224 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 3225 3226 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3227 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 3228 3229 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 3230 &offset, &remaining); 3231 if (spdk_likely(rc == 0)) { 3232 num_children_reqs++; 3233 } else { 3234 return; 3235 } 3236 } 3237 } 3238 3239 static void 3240 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3241 { 3242 uint64_t offset, write_zeroes_blocks, remaining; 3243 uint32_t num_children_reqs = 0; 3244 int rc; 3245 3246 offset =
bdev_io->u.bdev.split_current_offset_blocks; 3247 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3248 3249 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3250 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3251 3252 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3253 &offset, &remaining); 3254 if (spdk_likely(rc == 0)) { 3255 num_children_reqs++; 3256 } else { 3257 return; 3258 } 3259 } 3260 } 3261 3262 static void 3263 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3264 { 3265 uint64_t offset, copy_blocks, remaining; 3266 uint32_t num_children_reqs = 0; 3267 int rc; 3268 3269 offset = bdev_io->u.bdev.split_current_offset_blocks; 3270 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3271 3272 assert(bdev_io->bdev->max_copy != 0); 3273 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3274 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3275 3276 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3277 &offset, &remaining); 3278 if (spdk_likely(rc == 0)) { 3279 num_children_reqs++; 3280 } else { 3281 return; 3282 } 3283 } 3284 } 3285 3286 static void 3287 parent_bdev_io_complete(void *ctx, int rc) 3288 { 3289 struct spdk_bdev_io *parent_io = ctx; 3290 3291 if (rc) { 3292 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3293 } 3294 3295 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3296 parent_io->internal.caller_ctx); 3297 } 3298 3299 static void 3300 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3301 { 3302 struct spdk_bdev_io *bdev_io = ctx; 3303 3304 /* u.bdev.accel_sequence should have already been cleared at this point */ 3305 assert(bdev_io->u.bdev.accel_sequence == NULL); 3306 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3307 bdev_io->internal.accel_sequence = NULL; 3308 3309 if (spdk_unlikely(status != 0)) { 3310 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3311 } 3312 3313 parent_bdev_io_complete(bdev_io, status); 3314 } 3315 3316 static void 3317 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3318 { 3319 struct spdk_bdev_io *parent_io = cb_arg; 3320 3321 spdk_bdev_free_io(bdev_io); 3322 3323 if (!success) { 3324 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3325 /* If any child I/O failed, stop further splitting process. */ 3326 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3327 parent_io->u.bdev.split_remaining_num_blocks = 0; 3328 } 3329 parent_io->u.bdev.split_outstanding--; 3330 if (parent_io->u.bdev.split_outstanding != 0) { 3331 return; 3332 } 3333 3334 /* 3335 * Parent I/O finishes when all blocks are consumed. 
3336 */ 3337 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3338 assert(parent_io->internal.cb != bdev_io_split_done); 3339 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3340 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3341 3342 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3343 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3344 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3345 return; 3346 } else if (parent_io->internal.orig_iovcnt != 0 && 3347 !bdev_io_use_accel_sequence(bdev_io)) { 3348 /* bdev IO will be completed in the callback */ 3349 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3350 return; 3351 } 3352 } 3353 3354 parent_bdev_io_complete(parent_io, 0); 3355 return; 3356 } 3357 3358 /* 3359 * Continue with the splitting process. This function will complete the parent I/O if the 3360 * splitting is done. 3361 */ 3362 switch (parent_io->type) { 3363 case SPDK_BDEV_IO_TYPE_READ: 3364 case SPDK_BDEV_IO_TYPE_WRITE: 3365 _bdev_rw_split(parent_io); 3366 break; 3367 case SPDK_BDEV_IO_TYPE_UNMAP: 3368 bdev_unmap_split(parent_io); 3369 break; 3370 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3371 bdev_write_zeroes_split(parent_io); 3372 break; 3373 case SPDK_BDEV_IO_TYPE_COPY: 3374 bdev_copy_split(parent_io); 3375 break; 3376 default: 3377 assert(false); 3378 break; 3379 } 3380 } 3381 3382 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3383 bool success); 3384 3385 static void 3386 bdev_io_split(struct spdk_bdev_io *bdev_io) 3387 { 3388 assert(bdev_io_should_split(bdev_io)); 3389 3390 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3391 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3392 bdev_io->u.bdev.split_outstanding = 0; 3393 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3394 3395 switch (bdev_io->type) { 3396 case SPDK_BDEV_IO_TYPE_READ: 3397 case SPDK_BDEV_IO_TYPE_WRITE: 3398 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3399 _bdev_rw_split(bdev_io); 3400 } else { 3401 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3402 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3403 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3404 } 3405 break; 3406 case SPDK_BDEV_IO_TYPE_UNMAP: 3407 bdev_unmap_split(bdev_io); 3408 break; 3409 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3410 bdev_write_zeroes_split(bdev_io); 3411 break; 3412 case SPDK_BDEV_IO_TYPE_COPY: 3413 bdev_copy_split(bdev_io); 3414 break; 3415 default: 3416 assert(false); 3417 break; 3418 } 3419 } 3420 3421 static void 3422 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3423 { 3424 if (!success) { 3425 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3426 return; 3427 } 3428 3429 _bdev_rw_split(bdev_io); 3430 } 3431 3432 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3433 * be inlined, at least on some compilers. 
3434 */ 3435 static inline void 3436 _bdev_io_submit(void *ctx) 3437 { 3438 struct spdk_bdev_io *bdev_io = ctx; 3439 struct spdk_bdev *bdev = bdev_io->bdev; 3440 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3441 3442 if (spdk_likely(bdev_ch->flags == 0)) { 3443 bdev_io_do_submit(bdev_ch, bdev_io); 3444 return; 3445 } 3446 3447 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3448 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3449 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3450 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3451 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3452 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3453 } else { 3454 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3455 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3456 } 3457 } else { 3458 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3459 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3460 } 3461 } 3462 3463 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3464 3465 bool 3466 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3467 { 3468 if (range1->length == 0 || range2->length == 0) { 3469 return false; 3470 } 3471 3472 if (range1->offset + range1->length <= range2->offset) { 3473 return false; 3474 } 3475 3476 if (range2->offset + range2->length <= range1->offset) { 3477 return false; 3478 } 3479 3480 return true; 3481 } 3482 3483 static bool 3484 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3485 { 3486 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3487 struct lba_range r; 3488 3489 switch (bdev_io->type) { 3490 case SPDK_BDEV_IO_TYPE_NVME_IO: 3491 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3492 /* Don't try to decode the NVMe command - just assume worst-case and that 3493 * it overlaps a locked range. 3494 */ 3495 return true; 3496 case SPDK_BDEV_IO_TYPE_READ: 3497 if (!range->quiesce) { 3498 return false; 3499 } 3500 /* fallthrough */ 3501 case SPDK_BDEV_IO_TYPE_WRITE: 3502 case SPDK_BDEV_IO_TYPE_UNMAP: 3503 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3504 case SPDK_BDEV_IO_TYPE_ZCOPY: 3505 case SPDK_BDEV_IO_TYPE_COPY: 3506 r.offset = bdev_io->u.bdev.offset_blocks; 3507 r.length = bdev_io->u.bdev.num_blocks; 3508 if (!bdev_lba_range_overlapped(range, &r)) { 3509 /* This I/O doesn't overlap the specified LBA range. */ 3510 return false; 3511 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3512 /* This I/O overlaps, but the I/O is on the same channel that locked this 3513 * range, and the caller_ctx is the same as the locked_ctx. This means 3514 * that this I/O is associated with the lock, and is allowed to execute. 
*/ 3516 return false; 3517 } else { 3518 return true; 3519 } 3520 default: 3521 return false; 3522 } 3523 } 3524 3525 void 3526 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3527 { 3528 struct spdk_bdev *bdev = bdev_io->bdev; 3529 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3530 3531 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3532 3533 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3534 struct lba_range *range; 3535 3536 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3537 if (bdev_io_range_is_locked(bdev_io, range)) { 3538 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3539 return; 3540 } 3541 } 3542 } 3543 3544 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3545 3546 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3547 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3548 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3549 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3550 spdk_bdev_get_name(bdev)); 3551 3552 if (bdev_io->internal.split) { 3553 bdev_io_split(bdev_io); 3554 return; 3555 } 3556 3557 _bdev_io_submit(bdev_io); 3558 } 3559 3560 static inline void 3561 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3562 { 3563 /* The bdev doesn't support memory domains, so the buffers in this IO request can't 3564 * be accessed directly. We need to allocate buffers before issuing the IO operation. 3565 * For a write operation we need to pull the buffers from the memory domain before submitting the IO. 3566 * Once a read operation completes, we need to use the memory domain push functionality to 3567 * update the data in the original memory domain IO buffer. 3568 * This IO request will go through a regular IO flow, so clear the memory domain pointers. */ 3569 bdev_io->u.bdev.memory_domain = NULL; 3570 bdev_io->u.bdev.memory_domain_ctx = NULL; 3571 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3572 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3573 } 3574 3575 static inline void 3576 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3577 { 3578 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3579 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3580 3581 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3582 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3583 bdev_io_complete_unsubmitted(bdev_io); 3584 return; 3585 } 3586 3587 /* We need to allocate a bounce buffer if the bdev doesn't support memory domains, or if it does 3588 * support them, but we need to execute an accel sequence and the data buffer is from the accel 3589 * memory domain (to avoid doing a push/pull from that domain).
3590 */ 3591 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3592 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3593 _bdev_io_ext_use_bounce_buffer(bdev_io); 3594 return; 3595 } 3596 3597 if (needs_exec) { 3598 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3599 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3600 return; 3601 } 3602 /* For reads we'll execute the sequence after the data is read, so, for now, only 3603 * clear out accel_sequence pointer and submit the IO */ 3604 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3605 bdev_io->u.bdev.accel_sequence = NULL; 3606 } 3607 3608 bdev_io_submit(bdev_io); 3609 } 3610 3611 static void 3612 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3613 { 3614 struct spdk_bdev *bdev = bdev_io->bdev; 3615 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3616 struct spdk_io_channel *ch = bdev_ch->channel; 3617 3618 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3619 3620 bdev_io->internal.in_submit_request = true; 3621 bdev_submit_request(bdev, ch, bdev_io); 3622 bdev_io->internal.in_submit_request = false; 3623 } 3624 3625 void 3626 bdev_io_init(struct spdk_bdev_io *bdev_io, 3627 struct spdk_bdev *bdev, void *cb_arg, 3628 spdk_bdev_io_completion_cb cb) 3629 { 3630 bdev_io->bdev = bdev; 3631 bdev_io->internal.caller_ctx = cb_arg; 3632 bdev_io->internal.cb = cb; 3633 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3634 bdev_io->internal.in_submit_request = false; 3635 bdev_io->internal.buf = NULL; 3636 bdev_io->internal.orig_iovs = NULL; 3637 bdev_io->internal.orig_iovcnt = 0; 3638 bdev_io->internal.orig_md_iov.iov_base = NULL; 3639 bdev_io->internal.error.nvme.cdw0 = 0; 3640 bdev_io->num_retries = 0; 3641 bdev_io->internal.get_buf_cb = NULL; 3642 bdev_io->internal.get_aux_buf_cb = NULL; 3643 bdev_io->internal.memory_domain = NULL; 3644 bdev_io->internal.memory_domain_ctx = NULL; 3645 bdev_io->internal.data_transfer_cpl = NULL; 3646 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3647 bdev_io->internal.accel_sequence = NULL; 3648 bdev_io->internal.has_accel_sequence = false; 3649 } 3650 3651 static bool 3652 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3653 { 3654 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3655 } 3656 3657 bool 3658 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3659 { 3660 bool supported; 3661 3662 supported = bdev_io_type_supported(bdev, io_type); 3663 3664 if (!supported) { 3665 switch (io_type) { 3666 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3667 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ 3668 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3669 break; 3670 default: 3671 break; 3672 } 3673 } 3674 3675 return supported; 3676 } 3677 3678 uint64_t 3679 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3680 { 3681 return bdev_io->internal.submit_tsc; 3682 } 3683 3684 int 3685 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3686 { 3687 if (bdev->fn_table->dump_info_json) { 3688 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3689 } 3690 3691 return 0; 3692 } 3693 3694 static void 3695 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3696 { 3697 uint32_t max_per_timeslice = 0; 3698 int i; 3699 3700 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3701 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3702 qos->rate_limits[i].max_per_timeslice = 0; 3703 continue; 3704 } 3705 3706 max_per_timeslice = qos->rate_limits[i].limit * 3707 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3708 3709 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3710 qos->rate_limits[i].min_per_timeslice); 3711 3712 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3713 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE); 3714 } 3715 3716 bdev_qos_set_ops(qos); 3717 } 3718 3719 static void 3720 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3721 struct spdk_io_channel *io_ch, void *ctx) 3722 { 3723 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3724 int status; 3725 3726 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3727 3728 /* If all queued IOs were sent, continue the iteration; otherwise stop it. */ 3729 /* TODO: round-robin across channels */ 3730 status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 0 : 1; 3731 3732 spdk_bdev_for_each_channel_continue(i, status); 3733 } 3734 3735 3736 static void 3737 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status) 3738 { 3739 /* Nothing to do; any IO still queued for QoS will be retried on the next poll. */ 3740 } 3741 3742 static int 3743 bdev_channel_poll_qos(void *arg) 3744 { 3745 struct spdk_bdev *bdev = arg; 3746 struct spdk_bdev_qos *qos = bdev->internal.qos; 3747 uint64_t now = spdk_get_ticks(); 3748 int i; 3749 int64_t remaining_last_timeslice; 3750 3751 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3752 /* We received our callback earlier than expected - return 3753 * immediately and wait to do accounting until at least one 3754 * timeslice has actually expired. This should never happen 3755 * with a well-behaved timer implementation. 3756 */ 3757 return SPDK_POLLER_IDLE; 3758 } 3759 3760 /* Reset for next round of rate limiting */ 3761 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3762 /* We may have allowed the IOs or bytes to slightly overrun in the last 3763 * timeslice. remaining_this_timeslice is signed, so if it's negative 3764 * here, we'll account for the overrun so that the next timeslice will 3765 * be appropriately reduced. 3766 */ 3767 remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice, 3768 0, __ATOMIC_RELAXED); 3769 if (remaining_last_timeslice < 0) { 3770 /* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos() 3771 * potentially use 2 atomic ops each, so they can interleave. 3772 * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage.
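 * Worst case, a decrement from the submit path lands between the exchange above and the
 * store below and is overwritten, so that single charge may be lost and the limit can
 * briefly admit slightly more than configured.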
3773 */ 3774 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3775 remaining_last_timeslice, __ATOMIC_RELAXED); 3776 } 3777 } 3778 3779 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3780 qos->last_timeslice += qos->timeslice_size; 3781 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3782 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 3783 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 3784 } 3785 } 3786 3787 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 3788 bdev_channel_submit_qos_io_done); 3789 3790 return SPDK_POLLER_BUSY; 3791 } 3792 3793 static void 3794 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3795 { 3796 struct spdk_bdev_shared_resource *shared_resource; 3797 struct lba_range *range; 3798 3799 bdev_free_io_stat(ch->stat); 3800 #ifdef SPDK_CONFIG_VTUNE 3801 bdev_free_io_stat(ch->prev_stat); 3802 #endif 3803 3804 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3805 range = TAILQ_FIRST(&ch->locked_ranges); 3806 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3807 free(range); 3808 } 3809 3810 spdk_put_io_channel(ch->channel); 3811 spdk_put_io_channel(ch->accel_channel); 3812 3813 shared_resource = ch->shared_resource; 3814 3815 assert(TAILQ_EMPTY(&ch->io_locked)); 3816 assert(TAILQ_EMPTY(&ch->io_submitted)); 3817 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3818 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3819 assert(ch->io_outstanding == 0); 3820 assert(shared_resource->ref > 0); 3821 shared_resource->ref--; 3822 if (shared_resource->ref == 0) { 3823 assert(shared_resource->io_outstanding == 0); 3824 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3825 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3826 spdk_poller_unregister(&shared_resource->nomem_poller); 3827 free(shared_resource); 3828 } 3829 } 3830 3831 static void 3832 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3833 { 3834 struct spdk_bdev_qos *qos = bdev->internal.qos; 3835 int i; 3836 3837 assert(spdk_spin_held(&bdev->internal.spinlock)); 3838 3839 /* Rate limiting on this bdev enabled */ 3840 if (qos) { 3841 if (qos->ch == NULL) { 3842 struct spdk_io_channel *io_ch; 3843 3844 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3845 bdev->name, spdk_get_thread()); 3846 3847 /* No qos channel has been selected, so set one up */ 3848 3849 /* Take another reference to ch */ 3850 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3851 assert(io_ch != NULL); 3852 qos->ch = ch; 3853 3854 qos->thread = spdk_io_channel_get_thread(io_ch); 3855 3856 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3857 if (bdev_qos_is_iops_rate_limit(i) == true) { 3858 qos->rate_limits[i].min_per_timeslice = 3859 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3860 } else { 3861 qos->rate_limits[i].min_per_timeslice = 3862 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3863 } 3864 3865 if (qos->rate_limits[i].limit == 0) { 3866 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3867 } 3868 } 3869 bdev_qos_update_max_quota_per_timeslice(qos); 3870 qos->timeslice_size = 3871 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3872 qos->last_timeslice = spdk_get_ticks(); 3873 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3874 bdev, 3875 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3876 } 3877 3878 ch->flags |= BDEV_CH_QOS_ENABLED; 3879 } 3880 } 3881 3882 struct poll_timeout_ctx { 3883 struct spdk_bdev_desc 
*desc; 3884 uint64_t timeout_in_sec; 3885 spdk_bdev_io_timeout_cb cb_fn; 3886 void *cb_arg; 3887 }; 3888 3889 static void 3890 bdev_desc_free(struct spdk_bdev_desc *desc) 3891 { 3892 spdk_spin_destroy(&desc->spinlock); 3893 free(desc->media_events_buffer); 3894 free(desc); 3895 } 3896 3897 static void 3898 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3899 { 3900 struct poll_timeout_ctx *ctx = _ctx; 3901 struct spdk_bdev_desc *desc = ctx->desc; 3902 3903 free(ctx); 3904 3905 spdk_spin_lock(&desc->spinlock); 3906 desc->refs--; 3907 if (desc->closed == true && desc->refs == 0) { 3908 spdk_spin_unlock(&desc->spinlock); 3909 bdev_desc_free(desc); 3910 return; 3911 } 3912 spdk_spin_unlock(&desc->spinlock); 3913 } 3914 3915 static void 3916 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3917 struct spdk_io_channel *io_ch, void *_ctx) 3918 { 3919 struct poll_timeout_ctx *ctx = _ctx; 3920 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3921 struct spdk_bdev_desc *desc = ctx->desc; 3922 struct spdk_bdev_io *bdev_io; 3923 uint64_t now; 3924 3925 spdk_spin_lock(&desc->spinlock); 3926 if (desc->closed == true) { 3927 spdk_spin_unlock(&desc->spinlock); 3928 spdk_bdev_for_each_channel_continue(i, -1); 3929 return; 3930 } 3931 spdk_spin_unlock(&desc->spinlock); 3932 3933 now = spdk_get_ticks(); 3934 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3935 /* Exclude any I/O that are generated via splitting. */ 3936 if (bdev_io->internal.cb == bdev_io_split_done) { 3937 continue; 3938 } 3939 3940 /* Once we find an I/O that has not timed out, we can immediately 3941 * exit the loop. 3942 */ 3943 if (now < (bdev_io->internal.submit_tsc + 3944 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3945 goto end; 3946 } 3947 3948 if (bdev_io->internal.desc == desc) { 3949 ctx->cb_fn(ctx->cb_arg, bdev_io); 3950 } 3951 } 3952 3953 end: 3954 spdk_bdev_for_each_channel_continue(i, 0); 3955 } 3956 3957 static int 3958 bdev_poll_timeout_io(void *arg) 3959 { 3960 struct spdk_bdev_desc *desc = arg; 3961 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3962 struct poll_timeout_ctx *ctx; 3963 3964 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3965 if (!ctx) { 3966 SPDK_ERRLOG("failed to allocate memory\n"); 3967 return SPDK_POLLER_BUSY; 3968 } 3969 ctx->desc = desc; 3970 ctx->cb_arg = desc->cb_arg; 3971 ctx->cb_fn = desc->cb_fn; 3972 ctx->timeout_in_sec = desc->timeout_in_sec; 3973 3974 /* Take a ref on the descriptor in case it gets closed while we are checking 3975 * all of the channels. 
3976 */ 3977 spdk_spin_lock(&desc->spinlock); 3978 desc->refs++; 3979 spdk_spin_unlock(&desc->spinlock); 3980 3981 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3982 bdev_channel_poll_timeout_io_done); 3983 3984 return SPDK_POLLER_BUSY; 3985 } 3986 3987 int 3988 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3989 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3990 { 3991 assert(desc->thread == spdk_get_thread()); 3992 3993 spdk_poller_unregister(&desc->io_timeout_poller); 3994 3995 if (timeout_in_sec) { 3996 assert(cb_fn != NULL); 3997 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3998 desc, 3999 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4000 1000); 4001 if (desc->io_timeout_poller == NULL) { 4002 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4003 return -1; 4004 } 4005 } 4006 4007 desc->cb_fn = cb_fn; 4008 desc->cb_arg = cb_arg; 4009 desc->timeout_in_sec = timeout_in_sec; 4010 4011 return 0; 4012 } 4013 4014 static int 4015 bdev_channel_create(void *io_device, void *ctx_buf) 4016 { 4017 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4018 struct spdk_bdev_channel *ch = ctx_buf; 4019 struct spdk_io_channel *mgmt_io_ch; 4020 struct spdk_bdev_mgmt_channel *mgmt_ch; 4021 struct spdk_bdev_shared_resource *shared_resource; 4022 struct lba_range *range; 4023 4024 ch->bdev = bdev; 4025 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4026 if (!ch->channel) { 4027 return -1; 4028 } 4029 4030 ch->accel_channel = spdk_accel_get_io_channel(); 4031 if (!ch->accel_channel) { 4032 spdk_put_io_channel(ch->channel); 4033 return -1; 4034 } 4035 4036 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 4037 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4038 4039 assert(ch->histogram == NULL); 4040 if (bdev->internal.histogram_enabled) { 4041 ch->histogram = spdk_histogram_data_alloc(); 4042 if (ch->histogram == NULL) { 4043 SPDK_ERRLOG("Could not allocate histogram\n"); 4044 } 4045 } 4046 4047 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4048 if (!mgmt_io_ch) { 4049 spdk_put_io_channel(ch->channel); 4050 spdk_put_io_channel(ch->accel_channel); 4051 return -1; 4052 } 4053 4054 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4055 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4056 if (shared_resource->shared_ch == ch->channel) { 4057 spdk_put_io_channel(mgmt_io_ch); 4058 shared_resource->ref++; 4059 break; 4060 } 4061 } 4062 4063 if (shared_resource == NULL) { 4064 shared_resource = calloc(1, sizeof(*shared_resource)); 4065 if (shared_resource == NULL) { 4066 spdk_put_io_channel(ch->channel); 4067 spdk_put_io_channel(ch->accel_channel); 4068 spdk_put_io_channel(mgmt_io_ch); 4069 return -1; 4070 } 4071 4072 shared_resource->mgmt_ch = mgmt_ch; 4073 shared_resource->io_outstanding = 0; 4074 TAILQ_INIT(&shared_resource->nomem_io); 4075 shared_resource->nomem_threshold = 0; 4076 shared_resource->shared_ch = ch->channel; 4077 shared_resource->ref = 1; 4078 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4079 } 4080 4081 ch->io_outstanding = 0; 4082 TAILQ_INIT(&ch->queued_resets); 4083 TAILQ_INIT(&ch->locked_ranges); 4084 TAILQ_INIT(&ch->qos_queued_io); 4085 ch->flags = 0; 4086 ch->shared_resource = shared_resource; 4087 4088 TAILQ_INIT(&ch->io_submitted); 4089 TAILQ_INIT(&ch->io_locked); 4090 TAILQ_INIT(&ch->io_accel_exec); 4091 TAILQ_INIT(&ch->io_memory_domain); 4092 4093 ch->stat = bdev_alloc_io_stat(false); 4094 if 
(ch->stat == NULL) { 4095 bdev_channel_destroy_resource(ch); 4096 return -1; 4097 } 4098 4099 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4100 4101 #ifdef SPDK_CONFIG_VTUNE 4102 { 4103 char *name; 4104 __itt_init_ittlib(NULL, 0); 4105 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4106 if (!name) { 4107 bdev_channel_destroy_resource(ch); 4108 return -1; 4109 } 4110 ch->handle = __itt_string_handle_create(name); 4111 free(name); 4112 ch->start_tsc = spdk_get_ticks(); 4113 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4114 ch->prev_stat = bdev_alloc_io_stat(false); 4115 if (ch->prev_stat == NULL) { 4116 bdev_channel_destroy_resource(ch); 4117 return -1; 4118 } 4119 } 4120 #endif 4121 4122 spdk_spin_lock(&bdev->internal.spinlock); 4123 bdev_enable_qos(bdev, ch); 4124 4125 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4126 struct lba_range *new_range; 4127 4128 new_range = calloc(1, sizeof(*new_range)); 4129 if (new_range == NULL) { 4130 spdk_spin_unlock(&bdev->internal.spinlock); 4131 bdev_channel_destroy_resource(ch); 4132 return -1; 4133 } 4134 new_range->length = range->length; 4135 new_range->offset = range->offset; 4136 new_range->locked_ctx = range->locked_ctx; 4137 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4138 } 4139 4140 spdk_spin_unlock(&bdev->internal.spinlock); 4141 4142 return 0; 4143 } 4144 4145 static int 4146 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4147 void *cb_ctx) 4148 { 4149 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4150 struct spdk_bdev_io *bdev_io; 4151 uint64_t buf_len; 4152 4153 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4154 if (bdev_io->internal.ch == bdev_ch) { 4155 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4156 spdk_iobuf_entry_abort(ch, entry, buf_len); 4157 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4158 } 4159 4160 return 0; 4161 } 4162 4163 /* 4164 * Abort I/O that are waiting on a data buffer. 4165 */ 4166 static void 4167 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4168 { 4169 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4170 bdev_abort_all_buf_io_cb, ch); 4171 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4172 bdev_abort_all_buf_io_cb, ch); 4173 } 4174 4175 /* 4176 * Abort I/O that are queued waiting for submission. These types of I/O are 4177 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4178 */ 4179 static void 4180 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4181 { 4182 struct spdk_bdev_io *bdev_io, *tmp; 4183 4184 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4185 if (bdev_io->internal.ch == ch) { 4186 TAILQ_REMOVE(queue, bdev_io, internal.link); 4187 /* 4188 * spdk_bdev_io_complete() assumes that the completed I/O had 4189 * been submitted to the bdev module. Since in this case it 4190 * hadn't, bump io_outstanding to account for the decrement 4191 * that spdk_bdev_io_complete() will do. 
4192 */ 4193 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4194 bdev_io_increment_outstanding(ch, ch->shared_resource); 4195 } 4196 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4197 } 4198 } 4199 } 4200 4201 static bool 4202 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4203 { 4204 struct spdk_bdev_io *bdev_io; 4205 4206 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4207 if (bdev_io == bio_to_abort) { 4208 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4209 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4210 return true; 4211 } 4212 } 4213 4214 return false; 4215 } 4216 4217 static int 4218 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4219 { 4220 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4221 uint64_t buf_len; 4222 4223 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4224 if (bdev_io == bio_to_abort) { 4225 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4226 spdk_iobuf_entry_abort(ch, entry, buf_len); 4227 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4228 return 1; 4229 } 4230 4231 return 0; 4232 } 4233 4234 static bool 4235 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4236 { 4237 int rc; 4238 4239 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4240 bdev_abort_buf_io_cb, bio_to_abort); 4241 if (rc == 1) { 4242 return true; 4243 } 4244 4245 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4246 bdev_abort_buf_io_cb, bio_to_abort); 4247 return rc == 1; 4248 } 4249 4250 static void 4251 bdev_qos_channel_destroy(void *cb_arg) 4252 { 4253 struct spdk_bdev_qos *qos = cb_arg; 4254 4255 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4256 spdk_poller_unregister(&qos->poller); 4257 4258 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4259 4260 free(qos); 4261 } 4262 4263 static int 4264 bdev_qos_destroy(struct spdk_bdev *bdev) 4265 { 4266 int i; 4267 4268 /* 4269 * Cleanly shutting down the QoS poller is tricky, because 4270 * during the asynchronous operation the user could open 4271 * a new descriptor and create a new channel, spawning 4272 * a new QoS poller. 4273 * 4274 * The strategy is to create a new QoS structure here and swap it 4275 * in. The shutdown path then continues to refer to the old one 4276 * until it completes and then releases it. 4277 */ 4278 struct spdk_bdev_qos *new_qos, *old_qos; 4279 4280 old_qos = bdev->internal.qos; 4281 4282 new_qos = calloc(1, sizeof(*new_qos)); 4283 if (!new_qos) { 4284 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4285 return -ENOMEM; 4286 } 4287 4288 /* Copy the old QoS data into the newly allocated structure */ 4289 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4290 4291 /* Zero out the key parts of the QoS structure */ 4292 new_qos->ch = NULL; 4293 new_qos->thread = NULL; 4294 new_qos->poller = NULL; 4295 /* 4296 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4297 * It will be used later for the new QoS structure. 
4298 */ 4299 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4300 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4301 new_qos->rate_limits[i].min_per_timeslice = 0; 4302 new_qos->rate_limits[i].max_per_timeslice = 0; 4303 } 4304 4305 bdev->internal.qos = new_qos; 4306 4307 if (old_qos->thread == NULL) { 4308 free(old_qos); 4309 } else { 4310 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4311 } 4312 4313 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4314 * been destroyed yet. The destruction path will end up waiting for the final 4315 * channel to be put before it releases resources. */ 4316 4317 return 0; 4318 } 4319 4320 void 4321 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4322 { 4323 total->bytes_read += add->bytes_read; 4324 total->num_read_ops += add->num_read_ops; 4325 total->bytes_written += add->bytes_written; 4326 total->num_write_ops += add->num_write_ops; 4327 total->bytes_unmapped += add->bytes_unmapped; 4328 total->num_unmap_ops += add->num_unmap_ops; 4329 total->bytes_copied += add->bytes_copied; 4330 total->num_copy_ops += add->num_copy_ops; 4331 total->read_latency_ticks += add->read_latency_ticks; 4332 total->write_latency_ticks += add->write_latency_ticks; 4333 total->unmap_latency_ticks += add->unmap_latency_ticks; 4334 total->copy_latency_ticks += add->copy_latency_ticks; 4335 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4336 total->max_read_latency_ticks = add->max_read_latency_ticks; 4337 } 4338 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4339 total->min_read_latency_ticks = add->min_read_latency_ticks; 4340 } 4341 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4342 total->max_write_latency_ticks = add->max_write_latency_ticks; 4343 } 4344 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4345 total->min_write_latency_ticks = add->min_write_latency_ticks; 4346 } 4347 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4348 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4349 } 4350 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4351 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4352 } 4353 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4354 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4355 } 4356 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4357 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4358 } 4359 } 4360 4361 static void 4362 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4363 { 4364 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4365 4366 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4367 memcpy(to_stat->io_error, from_stat->io_error, 4368 sizeof(struct spdk_bdev_io_error_stat)); 4369 } 4370 } 4371 4372 void 4373 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4374 { 4375 stat->max_read_latency_ticks = 0; 4376 stat->min_read_latency_ticks = UINT64_MAX; 4377 stat->max_write_latency_ticks = 0; 4378 stat->min_write_latency_ticks = UINT64_MAX; 4379 stat->max_unmap_latency_ticks = 0; 4380 stat->min_unmap_latency_ticks = UINT64_MAX; 4381 stat->max_copy_latency_ticks = 0; 4382 stat->min_copy_latency_ticks = UINT64_MAX; 4383 4384 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4385 return; 4386 } 
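/* The cumulative counters below are cleared only for SPDK_BDEV_RESET_STAT_ALL; any other
 * mode resets just the max/min latency values above. */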
4387 4388 stat->bytes_read = 0; 4389 stat->num_read_ops = 0; 4390 stat->bytes_written = 0; 4391 stat->num_write_ops = 0; 4392 stat->bytes_unmapped = 0; 4393 stat->num_unmap_ops = 0; 4394 stat->bytes_copied = 0; 4395 stat->num_copy_ops = 0; 4396 stat->read_latency_ticks = 0; 4397 stat->write_latency_ticks = 0; 4398 stat->unmap_latency_ticks = 0; 4399 stat->copy_latency_ticks = 0; 4400 4401 if (stat->io_error != NULL) { 4402 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4403 } 4404 } 4405 4406 struct spdk_bdev_io_stat * 4407 bdev_alloc_io_stat(bool io_error_stat) 4408 { 4409 struct spdk_bdev_io_stat *stat; 4410 4411 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4412 if (stat == NULL) { 4413 return NULL; 4414 } 4415 4416 if (io_error_stat) { 4417 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4418 if (stat->io_error == NULL) { 4419 free(stat); 4420 return NULL; 4421 } 4422 } else { 4423 stat->io_error = NULL; 4424 } 4425 4426 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4427 4428 return stat; 4429 } 4430 4431 void 4432 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4433 { 4434 if (stat != NULL) { 4435 free(stat->io_error); 4436 free(stat); 4437 } 4438 } 4439 4440 void 4441 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4442 { 4443 int i; 4444 4445 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4446 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4447 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4448 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4449 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4450 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4451 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4452 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4453 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4454 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4455 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4456 stat->min_read_latency_ticks != UINT64_MAX ? 4457 stat->min_read_latency_ticks : 0); 4458 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4459 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4460 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4461 stat->min_write_latency_ticks != UINT64_MAX ? 4462 stat->min_write_latency_ticks : 0); 4463 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4464 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4465 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4466 stat->min_unmap_latency_ticks != UINT64_MAX ? 4467 stat->min_unmap_latency_ticks : 0); 4468 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4469 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4470 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4471 stat->min_copy_latency_ticks != UINT64_MAX ? 
4472 stat->min_copy_latency_ticks : 0); 4473 4474 if (stat->io_error != NULL) { 4475 spdk_json_write_named_object_begin(w, "io_error"); 4476 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4477 if (stat->io_error->error_status[i] != 0) { 4478 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4479 stat->io_error->error_status[i]); 4480 } 4481 } 4482 spdk_json_write_object_end(w); 4483 } 4484 } 4485 4486 static void 4487 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4488 { 4489 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4490 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4491 4492 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4493 bdev_abort_all_buf_io(mgmt_ch, ch); 4494 } 4495 4496 static void 4497 bdev_channel_destroy(void *io_device, void *ctx_buf) 4498 { 4499 struct spdk_bdev_channel *ch = ctx_buf; 4500 4501 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4502 spdk_get_thread()); 4503 4504 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 4505 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4506 4507 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4508 spdk_spin_lock(&ch->bdev->internal.spinlock); 4509 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4510 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4511 4512 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4513 4514 bdev_channel_abort_queued_ios(ch); 4515 4516 if (ch->histogram) { 4517 spdk_histogram_data_free(ch->histogram); 4518 } 4519 4520 bdev_channel_destroy_resource(ch); 4521 } 4522 4523 /* 4524 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4525 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
4526 */ 4527 static int 4528 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4529 { 4530 struct spdk_bdev_name *tmp; 4531 4532 bdev_name->name = strdup(name); 4533 if (bdev_name->name == NULL) { 4534 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4535 return -ENOMEM; 4536 } 4537 4538 bdev_name->bdev = bdev; 4539 4540 spdk_spin_lock(&g_bdev_mgr.spinlock); 4541 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4542 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4543 4544 if (tmp != NULL) { 4545 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4546 free(bdev_name->name); 4547 return -EEXIST; 4548 } 4549 4550 return 0; 4551 } 4552 4553 static void 4554 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4555 { 4556 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4557 free(bdev_name->name); 4558 } 4559 4560 static void 4561 bdev_name_del(struct spdk_bdev_name *bdev_name) 4562 { 4563 spdk_spin_lock(&g_bdev_mgr.spinlock); 4564 bdev_name_del_unsafe(bdev_name); 4565 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4566 } 4567 4568 int 4569 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4570 { 4571 struct spdk_bdev_alias *tmp; 4572 int ret; 4573 4574 if (alias == NULL) { 4575 SPDK_ERRLOG("Empty alias passed\n"); 4576 return -EINVAL; 4577 } 4578 4579 tmp = calloc(1, sizeof(*tmp)); 4580 if (tmp == NULL) { 4581 SPDK_ERRLOG("Unable to allocate alias\n"); 4582 return -ENOMEM; 4583 } 4584 4585 ret = bdev_name_add(&tmp->alias, bdev, alias); 4586 if (ret != 0) { 4587 free(tmp); 4588 return ret; 4589 } 4590 4591 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4592 4593 return 0; 4594 } 4595 4596 static int 4597 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4598 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4599 { 4600 struct spdk_bdev_alias *tmp; 4601 4602 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4603 if (strcmp(alias, tmp->alias.name) == 0) { 4604 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4605 alias_del_fn(&tmp->alias); 4606 free(tmp); 4607 return 0; 4608 } 4609 } 4610 4611 return -ENOENT; 4612 } 4613 4614 int 4615 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4616 { 4617 int rc; 4618 4619 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4620 if (rc == -ENOENT) { 4621 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4622 } 4623 4624 return rc; 4625 } 4626 4627 void 4628 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4629 { 4630 struct spdk_bdev_alias *p, *tmp; 4631 4632 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4633 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4634 bdev_name_del(&p->alias); 4635 free(p); 4636 } 4637 } 4638 4639 struct spdk_io_channel * 4640 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4641 { 4642 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4643 } 4644 4645 void * 4646 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4647 { 4648 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4649 void *ctx = NULL; 4650 4651 if (bdev->fn_table->get_module_ctx) { 4652 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4653 } 4654 4655 return ctx; 4656 } 4657 4658 const char * 4659 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4660 { 4661 return bdev->module->name; 4662 } 4663 4664 const char * 4665 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4666 { 4667 return bdev->name; 4668 } 4669 4670 const char * 4671 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4672 { 4673 return bdev->product_name; 4674 } 4675 4676 
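/*
 * Illustrative sketch, not part of the bdev API: callers commonly combine the simple
 * getters below to size and align their data buffers, e.g.
 *
 *     uint32_t blklen = spdk_bdev_get_block_size(bdev);
 *     size_t align = spdk_bdev_get_buf_align(bdev);
 *     void *buf = spdk_dma_zmalloc(blklen, align, NULL);
 *
 * spdk_dma_zmalloc() here is the env-layer allocator; any allocation that honors the
 * reported alignment works equally well.
 */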
const struct spdk_bdev_aliases_list * 4677 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4678 { 4679 return &bdev->aliases; 4680 } 4681 4682 uint32_t 4683 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4684 { 4685 return bdev->blocklen; 4686 } 4687 4688 uint32_t 4689 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4690 { 4691 return bdev->write_unit_size; 4692 } 4693 4694 uint64_t 4695 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4696 { 4697 return bdev->blockcnt; 4698 } 4699 4700 const char * 4701 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4702 { 4703 return qos_rpc_type[type]; 4704 } 4705 4706 void 4707 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4708 { 4709 int i; 4710 4711 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4712 4713 spdk_spin_lock(&bdev->internal.spinlock); 4714 if (bdev->internal.qos) { 4715 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4716 if (bdev->internal.qos->rate_limits[i].limit != 4717 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4718 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4719 if (bdev_qos_is_iops_rate_limit(i) == false) { 4720 /* Change from Byte to Megabyte which is user visible. */ 4721 limits[i] = limits[i] / 1024 / 1024; 4722 } 4723 } 4724 } 4725 } 4726 spdk_spin_unlock(&bdev->internal.spinlock); 4727 } 4728 4729 size_t 4730 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4731 { 4732 return 1 << bdev->required_alignment; 4733 } 4734 4735 uint32_t 4736 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4737 { 4738 return bdev->optimal_io_boundary; 4739 } 4740 4741 bool 4742 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4743 { 4744 return bdev->write_cache; 4745 } 4746 4747 const struct spdk_uuid * 4748 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4749 { 4750 return &bdev->uuid; 4751 } 4752 4753 uint16_t 4754 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4755 { 4756 return bdev->acwu; 4757 } 4758 4759 uint32_t 4760 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4761 { 4762 return bdev->md_len; 4763 } 4764 4765 bool 4766 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4767 { 4768 return (bdev->md_len != 0) && bdev->md_interleave; 4769 } 4770 4771 bool 4772 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4773 { 4774 return (bdev->md_len != 0) && !bdev->md_interleave; 4775 } 4776 4777 bool 4778 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4779 { 4780 return bdev->zoned; 4781 } 4782 4783 uint32_t 4784 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4785 { 4786 if (spdk_bdev_is_md_interleaved(bdev)) { 4787 return bdev->blocklen - bdev->md_len; 4788 } else { 4789 return bdev->blocklen; 4790 } 4791 } 4792 4793 uint32_t 4794 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4795 { 4796 return bdev->phys_blocklen; 4797 } 4798 4799 static uint32_t 4800 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4801 { 4802 if (!spdk_bdev_is_md_interleaved(bdev)) { 4803 return bdev->blocklen + bdev->md_len; 4804 } else { 4805 return bdev->blocklen; 4806 } 4807 } 4808 4809 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4810 typedef enum spdk_dif_type spdk_dif_type_t; 4811 4812 spdk_dif_type_t 4813 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4814 { 4815 if (bdev->md_len != 0) { 4816 return bdev->dif_type; 4817 } else { 4818 return SPDK_DIF_DISABLE; 4819 } 4820 } 4821 4822 bool 4823 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4824 { 4825 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4826 return bdev->dif_is_head_of_md; 4827 } else { 4828 return false; 4829 } 4830 } 4831 4832 bool 4833 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4834 enum spdk_dif_check_type check_type) 4835 { 4836 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4837 return false; 4838 } 4839 4840 switch (check_type) { 4841 case SPDK_DIF_CHECK_TYPE_REFTAG: 4842 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4843 case SPDK_DIF_CHECK_TYPE_APPTAG: 4844 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4845 case SPDK_DIF_CHECK_TYPE_GUARD: 4846 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4847 default: 4848 return false; 4849 } 4850 } 4851 4852 static uint32_t 4853 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4854 { 4855 uint64_t aligned_length, max_write_blocks; 4856 4857 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4858 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4859 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4860 4861 return max_write_blocks; 4862 } 4863 4864 uint32_t 4865 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4866 { 4867 return bdev->max_copy; 4868 } 4869 4870 uint64_t 4871 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4872 { 4873 return bdev->internal.measured_queue_depth; 4874 } 4875 4876 uint64_t 4877 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4878 { 4879 return bdev->internal.period; 4880 } 4881 4882 uint64_t 4883 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4884 { 4885 return bdev->internal.weighted_io_time; 4886 } 4887 4888 uint64_t 4889 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4890 { 4891 return bdev->internal.io_time; 4892 } 4893 4894 static void bdev_update_qd_sampling_period(void *ctx); 4895 4896 static void 4897 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4898 { 4899 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4900 4901 if (bdev->internal.measured_queue_depth) { 4902 bdev->internal.io_time += bdev->internal.period; 4903 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4904 } 4905 4906 bdev->internal.qd_poll_in_progress = false; 4907 4908 bdev_update_qd_sampling_period(bdev); 4909 } 4910 4911 static void 4912 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4913 struct spdk_io_channel *io_ch, void *_ctx) 4914 { 4915 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4916 4917 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4918 spdk_bdev_for_each_channel_continue(i, 0); 4919 } 4920 4921 static int 4922 bdev_calculate_measured_queue_depth(void *ctx) 4923 { 4924 struct spdk_bdev *bdev = ctx; 4925 4926 bdev->internal.qd_poll_in_progress = true; 4927 bdev->internal.temporary_queue_depth = 0; 4928 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4929 return SPDK_POLLER_BUSY; 4930 } 4931 4932 static void 4933 bdev_update_qd_sampling_period(void *ctx) 4934 { 4935 
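/* Apply a pending change to the queue-depth sampling period. If a measurement pass is
 * still in flight, defer; _calculate_measured_qd_cpl() calls this again when it completes.
 * A new period of 0 stops sampling and closes the internal descriptor. */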
struct spdk_bdev *bdev = ctx; 4936 4937 if (bdev->internal.period == bdev->internal.new_period) { 4938 return; 4939 } 4940 4941 if (bdev->internal.qd_poll_in_progress) { 4942 return; 4943 } 4944 4945 bdev->internal.period = bdev->internal.new_period; 4946 4947 spdk_poller_unregister(&bdev->internal.qd_poller); 4948 if (bdev->internal.period != 0) { 4949 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4950 bdev, bdev->internal.period); 4951 } else { 4952 spdk_bdev_close(bdev->internal.qd_desc); 4953 bdev->internal.qd_desc = NULL; 4954 } 4955 } 4956 4957 static void 4958 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4959 { 4960 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4961 } 4962 4963 void 4964 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4965 { 4966 int rc; 4967 4968 if (bdev->internal.new_period == period) { 4969 return; 4970 } 4971 4972 bdev->internal.new_period = period; 4973 4974 if (bdev->internal.qd_desc != NULL) { 4975 assert(bdev->internal.period != 0); 4976 4977 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4978 bdev_update_qd_sampling_period, bdev); 4979 return; 4980 } 4981 4982 assert(bdev->internal.period == 0); 4983 4984 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4985 NULL, &bdev->internal.qd_desc); 4986 if (rc != 0) { 4987 return; 4988 } 4989 4990 bdev->internal.period = period; 4991 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4992 bdev, period); 4993 } 4994 4995 struct bdev_get_current_qd_ctx { 4996 uint64_t current_qd; 4997 spdk_bdev_get_current_qd_cb cb_fn; 4998 void *cb_arg; 4999 }; 5000 5001 static void 5002 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5003 { 5004 struct bdev_get_current_qd_ctx *ctx = _ctx; 5005 5006 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5007 5008 free(ctx); 5009 } 5010 5011 static void 5012 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5013 struct spdk_io_channel *io_ch, void *_ctx) 5014 { 5015 struct bdev_get_current_qd_ctx *ctx = _ctx; 5016 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5017 5018 ctx->current_qd += bdev_ch->io_outstanding; 5019 5020 spdk_bdev_for_each_channel_continue(i, 0); 5021 } 5022 5023 void 5024 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5025 void *cb_arg) 5026 { 5027 struct bdev_get_current_qd_ctx *ctx; 5028 5029 assert(cb_fn != NULL); 5030 5031 ctx = calloc(1, sizeof(*ctx)); 5032 if (ctx == NULL) { 5033 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5034 return; 5035 } 5036 5037 ctx->cb_fn = cb_fn; 5038 ctx->cb_arg = cb_arg; 5039 5040 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5041 } 5042 5043 static void 5044 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5045 { 5046 assert(desc->thread == spdk_get_thread()); 5047 5048 spdk_spin_lock(&desc->spinlock); 5049 desc->refs--; 5050 if (!desc->closed) { 5051 spdk_spin_unlock(&desc->spinlock); 5052 desc->callback.event_fn(type, 5053 desc->bdev, 5054 desc->callback.ctx); 5055 return; 5056 } else if (desc->refs == 0) { 5057 /* This descriptor was closed after this event_notify message was sent. 5058 * spdk_bdev_close() could not free the descriptor since this message was 5059 * in flight, so we free it now using bdev_desc_free(). 
5060 */ 5061 spdk_spin_unlock(&desc->spinlock); 5062 bdev_desc_free(desc); 5063 return; 5064 } 5065 spdk_spin_unlock(&desc->spinlock); 5066 } 5067 5068 static void 5069 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5070 { 5071 spdk_spin_lock(&desc->spinlock); 5072 desc->refs++; 5073 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5074 spdk_spin_unlock(&desc->spinlock); 5075 } 5076 5077 static void 5078 _resize_notify(void *ctx) 5079 { 5080 struct spdk_bdev_desc *desc = ctx; 5081 5082 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5083 } 5084 5085 int 5086 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5087 { 5088 struct spdk_bdev_desc *desc; 5089 int ret; 5090 5091 if (size == bdev->blockcnt) { 5092 return 0; 5093 } 5094 5095 spdk_spin_lock(&bdev->internal.spinlock); 5096 5097 /* bdev has open descriptors */ 5098 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5099 bdev->blockcnt > size) { 5100 ret = -EBUSY; 5101 } else { 5102 bdev->blockcnt = size; 5103 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5104 event_notify(desc, _resize_notify); 5105 } 5106 ret = 0; 5107 } 5108 5109 spdk_spin_unlock(&bdev->internal.spinlock); 5110 5111 return ret; 5112 } 5113 5114 /* 5115 * Convert I/O offset and length from bytes to blocks. 5116 * 5117 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5118 */ 5119 static uint64_t 5120 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 5121 uint64_t num_bytes, uint64_t *num_blocks) 5122 { 5123 uint32_t block_size = bdev->blocklen; 5124 uint8_t shift_cnt; 5125 5126 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 5127 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5128 shift_cnt = spdk_u32log2(block_size); 5129 *offset_blocks = offset_bytes >> shift_cnt; 5130 *num_blocks = num_bytes >> shift_cnt; 5131 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5132 (num_bytes - (*num_blocks << shift_cnt)); 5133 } else { 5134 *offset_blocks = offset_bytes / block_size; 5135 *num_blocks = num_bytes / block_size; 5136 return (offset_bytes % block_size) | (num_bytes % block_size); 5137 } 5138 } 5139 5140 static bool 5141 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5142 { 5143 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5144 * has been an overflow and hence the offset has been wrapped around */ 5145 if (offset_blocks + num_blocks < offset_blocks) { 5146 return false; 5147 } 5148 5149 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5150 if (offset_blocks + num_blocks > bdev->blockcnt) { 5151 return false; 5152 } 5153 5154 return true; 5155 } 5156 5157 static void 5158 bdev_seek_complete_cb(void *ctx) 5159 { 5160 struct spdk_bdev_io *bdev_io = ctx; 5161 5162 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5163 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5164 } 5165 5166 static int 5167 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5168 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5169 spdk_bdev_io_completion_cb cb, void *cb_arg) 5170 { 5171 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5172 struct spdk_bdev_io *bdev_io; 5173 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5174 5175 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == 
SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5176 5177 /* Check if offset_blocks is valid looking at the validity of one block */ 5178 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5179 return -EINVAL; 5180 } 5181 5182 bdev_io = bdev_channel_get_io(channel); 5183 if (!bdev_io) { 5184 return -ENOMEM; 5185 } 5186 5187 bdev_io->internal.ch = channel; 5188 bdev_io->internal.desc = desc; 5189 bdev_io->type = io_type; 5190 bdev_io->u.bdev.offset_blocks = offset_blocks; 5191 bdev_io->u.bdev.memory_domain = NULL; 5192 bdev_io->u.bdev.memory_domain_ctx = NULL; 5193 bdev_io->u.bdev.accel_sequence = NULL; 5194 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5195 5196 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5197 /* In case bdev doesn't support seek to next data/hole offset, 5198 * it is assumed that only data and no holes are present */ 5199 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5200 bdev_io->u.bdev.seek.offset = offset_blocks; 5201 } else { 5202 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5203 } 5204 5205 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5206 return 0; 5207 } 5208 5209 bdev_io_submit(bdev_io); 5210 return 0; 5211 } 5212 5213 int 5214 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5215 uint64_t offset_blocks, 5216 spdk_bdev_io_completion_cb cb, void *cb_arg) 5217 { 5218 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5219 } 5220 5221 int 5222 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5223 uint64_t offset_blocks, 5224 spdk_bdev_io_completion_cb cb, void *cb_arg) 5225 { 5226 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5227 } 5228 5229 uint64_t 5230 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 5231 { 5232 return bdev_io->u.bdev.seek.offset; 5233 } 5234 5235 static int 5236 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5237 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5238 spdk_bdev_io_completion_cb cb, void *cb_arg) 5239 { 5240 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5241 struct spdk_bdev_io *bdev_io; 5242 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5243 5244 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5245 return -EINVAL; 5246 } 5247 5248 bdev_io = bdev_channel_get_io(channel); 5249 if (!bdev_io) { 5250 return -ENOMEM; 5251 } 5252 5253 bdev_io->internal.ch = channel; 5254 bdev_io->internal.desc = desc; 5255 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5256 bdev_io->u.bdev.iovs = &bdev_io->iov; 5257 bdev_io->u.bdev.iovs[0].iov_base = buf; 5258 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5259 bdev_io->u.bdev.iovcnt = 1; 5260 bdev_io->u.bdev.md_buf = md_buf; 5261 bdev_io->u.bdev.num_blocks = num_blocks; 5262 bdev_io->u.bdev.offset_blocks = offset_blocks; 5263 bdev_io->u.bdev.memory_domain = NULL; 5264 bdev_io->u.bdev.memory_domain_ctx = NULL; 5265 bdev_io->u.bdev.accel_sequence = NULL; 5266 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5267 5268 bdev_io_submit(bdev_io); 5269 return 0; 5270 } 5271 5272 int 5273 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5274 void *buf, uint64_t offset, uint64_t nbytes, 5275 spdk_bdev_io_completion_cb cb, void *cb_arg) 5276 { 5277 uint64_t offset_blocks, num_blocks; 5278 5279 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5280 nbytes, &num_blocks) != 0) { 5281 return -EINVAL; 5282 } 5283 5284 
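/* Byte-based wrapper: offset and nbytes were converted to whole blocks above, so simply
 * delegate to the block-based API. */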
return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5285 } 5286 5287 int 5288 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5289 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5290 spdk_bdev_io_completion_cb cb, void *cb_arg) 5291 { 5292 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5293 } 5294 5295 int 5296 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5297 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5298 spdk_bdev_io_completion_cb cb, void *cb_arg) 5299 { 5300 struct iovec iov = { 5301 .iov_base = buf, 5302 }; 5303 5304 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5305 return -EINVAL; 5306 } 5307 5308 if (md_buf && !_is_buf_allocated(&iov)) { 5309 return -EINVAL; 5310 } 5311 5312 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5313 cb, cb_arg); 5314 } 5315 5316 int 5317 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5318 struct iovec *iov, int iovcnt, 5319 uint64_t offset, uint64_t nbytes, 5320 spdk_bdev_io_completion_cb cb, void *cb_arg) 5321 { 5322 uint64_t offset_blocks, num_blocks; 5323 5324 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5325 nbytes, &num_blocks) != 0) { 5326 return -EINVAL; 5327 } 5328 5329 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5330 } 5331 5332 static int 5333 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5334 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5335 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5336 struct spdk_accel_sequence *seq, 5337 spdk_bdev_io_completion_cb cb, void *cb_arg) 5338 { 5339 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5340 struct spdk_bdev_io *bdev_io; 5341 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5342 5343 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5344 return -EINVAL; 5345 } 5346 5347 bdev_io = bdev_channel_get_io(channel); 5348 if (!bdev_io) { 5349 return -ENOMEM; 5350 } 5351 5352 bdev_io->internal.ch = channel; 5353 bdev_io->internal.desc = desc; 5354 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5355 bdev_io->u.bdev.iovs = iov; 5356 bdev_io->u.bdev.iovcnt = iovcnt; 5357 bdev_io->u.bdev.md_buf = md_buf; 5358 bdev_io->u.bdev.num_blocks = num_blocks; 5359 bdev_io->u.bdev.offset_blocks = offset_blocks; 5360 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5361 bdev_io->internal.memory_domain = domain; 5362 bdev_io->internal.memory_domain_ctx = domain_ctx; 5363 bdev_io->internal.accel_sequence = seq; 5364 bdev_io->internal.has_accel_sequence = seq != NULL; 5365 bdev_io->u.bdev.memory_domain = domain; 5366 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5367 bdev_io->u.bdev.accel_sequence = seq; 5368 5369 _bdev_io_submit_ext(desc, bdev_io); 5370 5371 return 0; 5372 } 5373 5374 int 5375 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5376 struct iovec *iov, int iovcnt, 5377 uint64_t offset_blocks, uint64_t num_blocks, 5378 spdk_bdev_io_completion_cb cb, void *cb_arg) 5379 { 5380 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5381 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5382 } 5383 5384 int 5385 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5386 struct 
iovec *iov, int iovcnt, void *md_buf, 5387 uint64_t offset_blocks, uint64_t num_blocks, 5388 spdk_bdev_io_completion_cb cb, void *cb_arg) 5389 { 5390 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5391 return -EINVAL; 5392 } 5393 5394 if (md_buf && !_is_buf_allocated(iov)) { 5395 return -EINVAL; 5396 } 5397 5398 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5399 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5400 } 5401 5402 static inline bool 5403 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5404 { 5405 /* 5406 * We check that opts->size is at least the size it had when spdk_bdev_ext_io_opts 5407 * was first introduced (ac6f2bdd8d), since access to those initial members 5408 * is not otherwise checked internally. 5409 */ 5410 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5411 sizeof(opts->metadata) && 5412 opts->size <= sizeof(*opts) && 5413 /* When memory domain is used, the user must provide data buffers */ 5414 (!opts->memory_domain || (iov && iov[0].iov_base)); 5415 } 5416 5417 int 5418 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5419 struct iovec *iov, int iovcnt, 5420 uint64_t offset_blocks, uint64_t num_blocks, 5421 spdk_bdev_io_completion_cb cb, void *cb_arg, 5422 struct spdk_bdev_ext_io_opts *opts) 5423 { 5424 void *md = NULL; 5425 5426 if (opts) { 5427 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5428 return -EINVAL; 5429 } 5430 md = opts->metadata; 5431 } 5432 5433 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5434 return -EINVAL; 5435 } 5436 5437 if (md && !_is_buf_allocated(iov)) { 5438 return -EINVAL; 5439 } 5440 5441 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5442 num_blocks, 5443 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5444 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5445 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5446 cb, cb_arg); 5447 } 5448 5449 static int 5450 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5451 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5452 spdk_bdev_io_completion_cb cb, void *cb_arg) 5453 { 5454 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5455 struct spdk_bdev_io *bdev_io; 5456 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5457 5458 if (!desc->write) { 5459 return -EBADF; 5460 } 5461 5462 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5463 return -EINVAL; 5464 } 5465 5466 bdev_io = bdev_channel_get_io(channel); 5467 if (!bdev_io) { 5468 return -ENOMEM; 5469 } 5470 5471 bdev_io->internal.ch = channel; 5472 bdev_io->internal.desc = desc; 5473 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5474 bdev_io->u.bdev.iovs = &bdev_io->iov; 5475 bdev_io->u.bdev.iovs[0].iov_base = buf; 5476 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5477 bdev_io->u.bdev.iovcnt = 1; 5478 bdev_io->u.bdev.md_buf = md_buf; 5479 bdev_io->u.bdev.num_blocks = num_blocks; 5480 bdev_io->u.bdev.offset_blocks = offset_blocks; 5481 bdev_io->u.bdev.memory_domain = NULL; 5482 bdev_io->u.bdev.memory_domain_ctx = NULL; 5483 bdev_io->u.bdev.accel_sequence = NULL; 5484 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5485 5486 bdev_io_submit(bdev_io); 5487 return 0; 5488 } 5489 5490 int 5491 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5492 void *buf, uint64_t offset, uint64_t nbytes, 5493 spdk_bdev_io_completion_cb cb, void *cb_arg) 5494 { 5495 uint64_t
offset_blocks, num_blocks; 5496 5497 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5498 nbytes, &num_blocks) != 0) { 5499 return -EINVAL; 5500 } 5501 5502 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5503 } 5504 5505 int 5506 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5507 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5508 spdk_bdev_io_completion_cb cb, void *cb_arg) 5509 { 5510 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5511 cb, cb_arg); 5512 } 5513 5514 int 5515 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5516 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5517 spdk_bdev_io_completion_cb cb, void *cb_arg) 5518 { 5519 struct iovec iov = { 5520 .iov_base = buf, 5521 }; 5522 5523 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5524 return -EINVAL; 5525 } 5526 5527 if (md_buf && !_is_buf_allocated(&iov)) { 5528 return -EINVAL; 5529 } 5530 5531 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5532 cb, cb_arg); 5533 } 5534 5535 static int 5536 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5537 struct iovec *iov, int iovcnt, void *md_buf, 5538 uint64_t offset_blocks, uint64_t num_blocks, 5539 struct spdk_memory_domain *domain, void *domain_ctx, 5540 struct spdk_accel_sequence *seq, 5541 spdk_bdev_io_completion_cb cb, void *cb_arg) 5542 { 5543 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5544 struct spdk_bdev_io *bdev_io; 5545 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5546 5547 if (!desc->write) { 5548 return -EBADF; 5549 } 5550 5551 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5552 return -EINVAL; 5553 } 5554 5555 bdev_io = bdev_channel_get_io(channel); 5556 if (!bdev_io) { 5557 return -ENOMEM; 5558 } 5559 5560 bdev_io->internal.ch = channel; 5561 bdev_io->internal.desc = desc; 5562 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5563 bdev_io->u.bdev.iovs = iov; 5564 bdev_io->u.bdev.iovcnt = iovcnt; 5565 bdev_io->u.bdev.md_buf = md_buf; 5566 bdev_io->u.bdev.num_blocks = num_blocks; 5567 bdev_io->u.bdev.offset_blocks = offset_blocks; 5568 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5569 bdev_io->internal.memory_domain = domain; 5570 bdev_io->internal.memory_domain_ctx = domain_ctx; 5571 bdev_io->internal.accel_sequence = seq; 5572 bdev_io->internal.has_accel_sequence = seq != NULL; 5573 bdev_io->u.bdev.memory_domain = domain; 5574 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5575 bdev_io->u.bdev.accel_sequence = seq; 5576 5577 _bdev_io_submit_ext(desc, bdev_io); 5578 5579 return 0; 5580 } 5581 5582 int 5583 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5584 struct iovec *iov, int iovcnt, 5585 uint64_t offset, uint64_t len, 5586 spdk_bdev_io_completion_cb cb, void *cb_arg) 5587 { 5588 uint64_t offset_blocks, num_blocks; 5589 5590 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5591 len, &num_blocks) != 0) { 5592 return -EINVAL; 5593 } 5594 5595 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5596 } 5597 5598 int 5599 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5600 struct iovec *iov, int iovcnt, 5601 uint64_t offset_blocks, uint64_t num_blocks, 5602 spdk_bdev_io_completion_cb cb, void *cb_arg) 5603 { 
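/* No separate metadata buffer, memory domain, or accel sequence here; pass NULLs through
 * to the common writev path. */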
5604 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5605 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5606 } 5607 5608 int 5609 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5610 struct iovec *iov, int iovcnt, void *md_buf, 5611 uint64_t offset_blocks, uint64_t num_blocks, 5612 spdk_bdev_io_completion_cb cb, void *cb_arg) 5613 { 5614 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5615 return -EINVAL; 5616 } 5617 5618 if (md_buf && !_is_buf_allocated(iov)) { 5619 return -EINVAL; 5620 } 5621 5622 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5623 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5624 } 5625 5626 int 5627 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5628 struct iovec *iov, int iovcnt, 5629 uint64_t offset_blocks, uint64_t num_blocks, 5630 spdk_bdev_io_completion_cb cb, void *cb_arg, 5631 struct spdk_bdev_ext_io_opts *opts) 5632 { 5633 void *md = NULL; 5634 5635 if (opts) { 5636 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5637 return -EINVAL; 5638 } 5639 md = opts->metadata; 5640 } 5641 5642 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5643 return -EINVAL; 5644 } 5645 5646 if (md && !_is_buf_allocated(iov)) { 5647 return -EINVAL; 5648 } 5649 5650 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5651 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5652 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5653 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5654 cb, cb_arg); 5655 } 5656 5657 static void 5658 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5659 { 5660 struct spdk_bdev_io *parent_io = cb_arg; 5661 struct spdk_bdev *bdev = parent_io->bdev; 5662 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5663 int i, rc = 0; 5664 5665 if (!success) { 5666 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5667 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5668 spdk_bdev_free_io(bdev_io); 5669 return; 5670 } 5671 5672 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5673 rc = memcmp(read_buf, 5674 parent_io->u.bdev.iovs[i].iov_base, 5675 parent_io->u.bdev.iovs[i].iov_len); 5676 if (rc) { 5677 break; 5678 } 5679 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5680 } 5681 5682 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5683 rc = memcmp(bdev_io->u.bdev.md_buf, 5684 parent_io->u.bdev.md_buf, 5685 spdk_bdev_get_md_size(bdev)); 5686 } 5687 5688 spdk_bdev_free_io(bdev_io); 5689 5690 if (rc == 0) { 5691 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5692 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5693 } else { 5694 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5695 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5696 } 5697 } 5698 5699 static void 5700 bdev_compare_do_read(void *_bdev_io) 5701 { 5702 struct spdk_bdev_io *bdev_io = _bdev_io; 5703 int rc; 5704 5705 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5706 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5707 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5708 bdev_compare_do_read_done, bdev_io); 5709 5710 if (rc == -ENOMEM) { 5711 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5712 } else if (rc != 0) { 5713 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5714 
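		/* The read used to emulate the compare could not be submitted; complete the compare as failed. */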
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5715 } 5716 } 5717 5718 static int 5719 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5720 struct iovec *iov, int iovcnt, void *md_buf, 5721 uint64_t offset_blocks, uint64_t num_blocks, 5722 spdk_bdev_io_completion_cb cb, void *cb_arg) 5723 { 5724 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5725 struct spdk_bdev_io *bdev_io; 5726 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5727 5728 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5729 return -EINVAL; 5730 } 5731 5732 bdev_io = bdev_channel_get_io(channel); 5733 if (!bdev_io) { 5734 return -ENOMEM; 5735 } 5736 5737 bdev_io->internal.ch = channel; 5738 bdev_io->internal.desc = desc; 5739 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5740 bdev_io->u.bdev.iovs = iov; 5741 bdev_io->u.bdev.iovcnt = iovcnt; 5742 bdev_io->u.bdev.md_buf = md_buf; 5743 bdev_io->u.bdev.num_blocks = num_blocks; 5744 bdev_io->u.bdev.offset_blocks = offset_blocks; 5745 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5746 bdev_io->u.bdev.memory_domain = NULL; 5747 bdev_io->u.bdev.memory_domain_ctx = NULL; 5748 bdev_io->u.bdev.accel_sequence = NULL; 5749 5750 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5751 bdev_io_submit(bdev_io); 5752 return 0; 5753 } 5754 5755 bdev_compare_do_read(bdev_io); 5756 5757 return 0; 5758 } 5759 5760 int 5761 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5762 struct iovec *iov, int iovcnt, 5763 uint64_t offset_blocks, uint64_t num_blocks, 5764 spdk_bdev_io_completion_cb cb, void *cb_arg) 5765 { 5766 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5767 num_blocks, cb, cb_arg); 5768 } 5769 5770 int 5771 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5772 struct iovec *iov, int iovcnt, void *md_buf, 5773 uint64_t offset_blocks, uint64_t num_blocks, 5774 spdk_bdev_io_completion_cb cb, void *cb_arg) 5775 { 5776 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5777 return -EINVAL; 5778 } 5779 5780 if (md_buf && !_is_buf_allocated(iov)) { 5781 return -EINVAL; 5782 } 5783 5784 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5785 num_blocks, cb, cb_arg); 5786 } 5787 5788 static int 5789 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5790 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5791 spdk_bdev_io_completion_cb cb, void *cb_arg) 5792 { 5793 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5794 struct spdk_bdev_io *bdev_io; 5795 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5796 5797 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5798 return -EINVAL; 5799 } 5800 5801 bdev_io = bdev_channel_get_io(channel); 5802 if (!bdev_io) { 5803 return -ENOMEM; 5804 } 5805 5806 bdev_io->internal.ch = channel; 5807 bdev_io->internal.desc = desc; 5808 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5809 bdev_io->u.bdev.iovs = &bdev_io->iov; 5810 bdev_io->u.bdev.iovs[0].iov_base = buf; 5811 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5812 bdev_io->u.bdev.iovcnt = 1; 5813 bdev_io->u.bdev.md_buf = md_buf; 5814 bdev_io->u.bdev.num_blocks = num_blocks; 5815 bdev_io->u.bdev.offset_blocks = offset_blocks; 5816 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5817 bdev_io->u.bdev.memory_domain = NULL; 5818 bdev_io->u.bdev.memory_domain_ctx = 
NULL; 5819 bdev_io->u.bdev.accel_sequence = NULL; 5820 5821 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5822 bdev_io_submit(bdev_io); 5823 return 0; 5824 } 5825 5826 bdev_compare_do_read(bdev_io); 5827 5828 return 0; 5829 } 5830 5831 int 5832 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5833 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5834 spdk_bdev_io_completion_cb cb, void *cb_arg) 5835 { 5836 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5837 cb, cb_arg); 5838 } 5839 5840 int 5841 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5842 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5843 spdk_bdev_io_completion_cb cb, void *cb_arg) 5844 { 5845 struct iovec iov = { 5846 .iov_base = buf, 5847 }; 5848 5849 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5850 return -EINVAL; 5851 } 5852 5853 if (md_buf && !_is_buf_allocated(&iov)) { 5854 return -EINVAL; 5855 } 5856 5857 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5858 cb, cb_arg); 5859 } 5860 5861 static void 5862 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 5863 { 5864 struct spdk_bdev_io *bdev_io = ctx; 5865 5866 if (unlock_status) { 5867 SPDK_ERRLOG("LBA range unlock failed\n"); 5868 } 5869 5870 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5871 false, bdev_io->internal.caller_ctx); 5872 } 5873 5874 static void 5875 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5876 { 5877 bdev_io->internal.status = status; 5878 5879 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5880 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5881 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5882 } 5883 5884 static void 5885 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5886 { 5887 struct spdk_bdev_io *parent_io = cb_arg; 5888 5889 if (!success) { 5890 SPDK_ERRLOG("Compare and write operation failed\n"); 5891 } 5892 5893 spdk_bdev_free_io(bdev_io); 5894 5895 bdev_comparev_and_writev_blocks_unlock(parent_io, 5896 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5897 } 5898 5899 static void 5900 bdev_compare_and_write_do_write(void *_bdev_io) 5901 { 5902 struct spdk_bdev_io *bdev_io = _bdev_io; 5903 int rc; 5904 5905 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5906 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5907 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5908 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5909 bdev_compare_and_write_do_write_done, bdev_io); 5910 5911 5912 if (rc == -ENOMEM) { 5913 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5914 } else if (rc != 0) { 5915 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5916 } 5917 } 5918 5919 static void 5920 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5921 { 5922 struct spdk_bdev_io *parent_io = cb_arg; 5923 5924 spdk_bdev_free_io(bdev_io); 5925 5926 if (!success) { 5927 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5928 return; 5929 } 5930 5931 bdev_compare_and_write_do_write(parent_io); 5932 } 5933 5934 static void 5935 bdev_compare_and_write_do_compare(void *_bdev_io) 5936 { 5937 struct spdk_bdev_io *bdev_io = _bdev_io; 5938 int rc; 5939 5940 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5941 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5942 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5943 bdev_compare_and_write_do_compare_done, bdev_io); 5944 5945 if (rc == -ENOMEM) { 5946 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5947 } else if (rc != 0) { 5948 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5949 } 5950 } 5951 5952 static void 5953 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 5954 { 5955 struct spdk_bdev_io *bdev_io = ctx; 5956 5957 if (status) { 5958 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5959 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5960 return; 5961 } 5962 5963 bdev_compare_and_write_do_compare(bdev_io); 5964 } 5965 5966 int 5967 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5968 struct iovec *compare_iov, int compare_iovcnt, 5969 struct iovec *write_iov, int write_iovcnt, 5970 uint64_t offset_blocks, uint64_t num_blocks, 5971 spdk_bdev_io_completion_cb cb, void *cb_arg) 5972 { 5973 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5974 struct spdk_bdev_io *bdev_io; 5975 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5976 5977 if (!desc->write) { 5978 return -EBADF; 5979 } 5980 5981 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5982 return -EINVAL; 5983 } 5984 5985 if (num_blocks > bdev->acwu) { 5986 return -EINVAL; 5987 } 5988 5989 bdev_io = bdev_channel_get_io(channel); 5990 if (!bdev_io) { 5991 return -ENOMEM; 5992 } 5993 5994 bdev_io->internal.ch = channel; 5995 bdev_io->internal.desc = desc; 5996 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5997 bdev_io->u.bdev.iovs = compare_iov; 5998 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5999 bdev_io->u.bdev.fused_iovs = write_iov; 6000 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6001 bdev_io->u.bdev.md_buf = NULL; 6002 bdev_io->u.bdev.num_blocks = num_blocks; 6003 bdev_io->u.bdev.offset_blocks = offset_blocks; 6004 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6005 
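	/* Compare-and-write is submitted without a memory domain or accel sequence. */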
bdev_io->u.bdev.memory_domain = NULL; 6006 bdev_io->u.bdev.memory_domain_ctx = NULL; 6007 bdev_io->u.bdev.accel_sequence = NULL; 6008 6009 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6010 bdev_io_submit(bdev_io); 6011 return 0; 6012 } 6013 6014 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6015 bdev_comparev_and_writev_blocks_locked, bdev_io); 6016 } 6017 6018 int 6019 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6020 struct iovec *iov, int iovcnt, 6021 uint64_t offset_blocks, uint64_t num_blocks, 6022 bool populate, 6023 spdk_bdev_io_completion_cb cb, void *cb_arg) 6024 { 6025 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6026 struct spdk_bdev_io *bdev_io; 6027 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6028 6029 if (!desc->write) { 6030 return -EBADF; 6031 } 6032 6033 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6034 return -EINVAL; 6035 } 6036 6037 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6038 return -ENOTSUP; 6039 } 6040 6041 bdev_io = bdev_channel_get_io(channel); 6042 if (!bdev_io) { 6043 return -ENOMEM; 6044 } 6045 6046 bdev_io->internal.ch = channel; 6047 bdev_io->internal.desc = desc; 6048 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6049 bdev_io->u.bdev.num_blocks = num_blocks; 6050 bdev_io->u.bdev.offset_blocks = offset_blocks; 6051 bdev_io->u.bdev.iovs = iov; 6052 bdev_io->u.bdev.iovcnt = iovcnt; 6053 bdev_io->u.bdev.md_buf = NULL; 6054 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6055 bdev_io->u.bdev.zcopy.commit = 0; 6056 bdev_io->u.bdev.zcopy.start = 1; 6057 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6058 bdev_io->u.bdev.memory_domain = NULL; 6059 bdev_io->u.bdev.memory_domain_ctx = NULL; 6060 bdev_io->u.bdev.accel_sequence = NULL; 6061 6062 bdev_io_submit(bdev_io); 6063 6064 return 0; 6065 } 6066 6067 int 6068 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6069 spdk_bdev_io_completion_cb cb, void *cb_arg) 6070 { 6071 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6072 return -EINVAL; 6073 } 6074 6075 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 6076 bdev_io->u.bdev.zcopy.start = 0; 6077 bdev_io->internal.caller_ctx = cb_arg; 6078 bdev_io->internal.cb = cb; 6079 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 6080 6081 bdev_io_submit(bdev_io); 6082 6083 return 0; 6084 } 6085 6086 int 6087 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6088 uint64_t offset, uint64_t len, 6089 spdk_bdev_io_completion_cb cb, void *cb_arg) 6090 { 6091 uint64_t offset_blocks, num_blocks; 6092 6093 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6094 len, &num_blocks) != 0) { 6095 return -EINVAL; 6096 } 6097 6098 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6099 } 6100 6101 int 6102 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6103 uint64_t offset_blocks, uint64_t num_blocks, 6104 spdk_bdev_io_completion_cb cb, void *cb_arg) 6105 { 6106 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6107 struct spdk_bdev_io *bdev_io; 6108 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6109 6110 if (!desc->write) { 6111 return -EBADF; 6112 } 6113 6114 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6115 return -EINVAL; 6116 } 6117 6118 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 6119 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 6120 return -ENOTSUP; 6121 } 6122 6123 bdev_io = bdev_channel_get_io(channel); 6124 6125 if (!bdev_io) { 6126 return -ENOMEM; 6127 } 6128 6129 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 6130 bdev_io->internal.ch = channel; 6131 bdev_io->internal.desc = desc; 6132 bdev_io->u.bdev.offset_blocks = offset_blocks; 6133 bdev_io->u.bdev.num_blocks = num_blocks; 6134 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6135 bdev_io->u.bdev.memory_domain = NULL; 6136 bdev_io->u.bdev.memory_domain_ctx = NULL; 6137 bdev_io->u.bdev.accel_sequence = NULL; 6138 6139 /* If the write_zeroes size is large and should be split, use the generic split 6140 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not. 6141 * 6142 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 6143 * or emulate it using a regular write request otherwise.
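	 * The emulated path falls through to bdev_write_zero_buffer() below.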
6144 */ 6145 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6146 bdev_io->internal.split) { 6147 bdev_io_submit(bdev_io); 6148 return 0; 6149 } 6150 6151 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6152 6153 return bdev_write_zero_buffer(bdev_io); 6154 } 6155 6156 int 6157 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6158 uint64_t offset, uint64_t nbytes, 6159 spdk_bdev_io_completion_cb cb, void *cb_arg) 6160 { 6161 uint64_t offset_blocks, num_blocks; 6162 6163 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6164 nbytes, &num_blocks) != 0) { 6165 return -EINVAL; 6166 } 6167 6168 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6169 } 6170 6171 int 6172 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6173 uint64_t offset_blocks, uint64_t num_blocks, 6174 spdk_bdev_io_completion_cb cb, void *cb_arg) 6175 { 6176 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6177 struct spdk_bdev_io *bdev_io; 6178 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6179 6180 if (!desc->write) { 6181 return -EBADF; 6182 } 6183 6184 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6185 return -EINVAL; 6186 } 6187 6188 if (num_blocks == 0) { 6189 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 6190 return -EINVAL; 6191 } 6192 6193 bdev_io = bdev_channel_get_io(channel); 6194 if (!bdev_io) { 6195 return -ENOMEM; 6196 } 6197 6198 bdev_io->internal.ch = channel; 6199 bdev_io->internal.desc = desc; 6200 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6201 6202 bdev_io->u.bdev.iovs = &bdev_io->iov; 6203 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6204 bdev_io->u.bdev.iovs[0].iov_len = 0; 6205 bdev_io->u.bdev.iovcnt = 1; 6206 6207 bdev_io->u.bdev.offset_blocks = offset_blocks; 6208 bdev_io->u.bdev.num_blocks = num_blocks; 6209 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6210 bdev_io->u.bdev.memory_domain = NULL; 6211 bdev_io->u.bdev.memory_domain_ctx = NULL; 6212 bdev_io->u.bdev.accel_sequence = NULL; 6213 6214 bdev_io_submit(bdev_io); 6215 return 0; 6216 } 6217 6218 int 6219 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6220 uint64_t offset, uint64_t length, 6221 spdk_bdev_io_completion_cb cb, void *cb_arg) 6222 { 6223 uint64_t offset_blocks, num_blocks; 6224 6225 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6226 length, &num_blocks) != 0) { 6227 return -EINVAL; 6228 } 6229 6230 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6231 } 6232 6233 int 6234 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6235 uint64_t offset_blocks, uint64_t num_blocks, 6236 spdk_bdev_io_completion_cb cb, void *cb_arg) 6237 { 6238 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6239 struct spdk_bdev_io *bdev_io; 6240 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6241 6242 if (!desc->write) { 6243 return -EBADF; 6244 } 6245 6246 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6247 return -EINVAL; 6248 } 6249 6250 bdev_io = bdev_channel_get_io(channel); 6251 if (!bdev_io) { 6252 return -ENOMEM; 6253 } 6254 6255 bdev_io->internal.ch = channel; 6256 bdev_io->internal.desc = desc; 6257 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6258 bdev_io->u.bdev.iovs = NULL; 6259 bdev_io->u.bdev.iovcnt = 0; 6260 bdev_io->u.bdev.offset_blocks = offset_blocks; 6261 bdev_io->u.bdev.num_blocks = num_blocks; 6262 
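	/* Flush carries no data buffers, so no memory domain or accel sequence is attached. */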
bdev_io->u.bdev.memory_domain = NULL; 6263 bdev_io->u.bdev.memory_domain_ctx = NULL; 6264 bdev_io->u.bdev.accel_sequence = NULL; 6265 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6266 6267 bdev_io_submit(bdev_io); 6268 return 0; 6269 } 6270 6271 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6272 6273 static void 6274 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6275 { 6276 struct spdk_bdev_channel *ch = _ctx; 6277 struct spdk_bdev_io *bdev_io; 6278 6279 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6280 6281 if (status == -EBUSY) { 6282 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6283 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6284 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6285 } else { 6286 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6287 6288 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6289 /* If outstanding IOs are still present and reset_io_drain_timeout 6290 * seconds have passed, start the reset. */ 6291 bdev_io_submit_reset(bdev_io); 6292 } else { 6293 /* We still have an in-progress memory domain pull/push or we're 6294 * executing an accel sequence. Since we cannot abort either of those 6295 * operations, fail the reset request. */ 6296 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6297 } 6298 } 6299 } else { 6300 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6301 SPDK_DEBUGLOG(bdev, 6302 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6303 ch->bdev->name); 6304 /* Mark the completion status as SUCCESS and complete the reset. */ 6305 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6306 } 6307 } 6308 6309 static void 6310 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6311 struct spdk_io_channel *io_ch, void *_ctx) 6312 { 6313 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6314 int status = 0; 6315 6316 if (cur_ch->io_outstanding > 0 || 6317 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6318 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6319 /* If a channel has outstanding IO, set the status to -EBUSY. This will stop 6320 * further iteration over the rest of the channels and pass non-zero status 6321 * to the callback function.
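	 * On -EBUSY, bdev_reset_check_outstanding_io_done() either re-arms the poller or, once the drain timeout expires, submits or fails the reset.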
*/ 6322 status = -EBUSY; 6323 } 6324 spdk_bdev_for_each_channel_continue(i, status); 6325 } 6326 6327 static int 6328 bdev_reset_poll_for_outstanding_io(void *ctx) 6329 { 6330 struct spdk_bdev_channel *ch = ctx; 6331 struct spdk_bdev_io *bdev_io; 6332 6333 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6334 6335 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6336 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6337 bdev_reset_check_outstanding_io_done); 6338 6339 return SPDK_POLLER_BUSY; 6340 } 6341 6342 static void 6343 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6344 { 6345 struct spdk_bdev_channel *ch = _ctx; 6346 struct spdk_bdev_io *bdev_io; 6347 6348 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6349 6350 if (bdev->reset_io_drain_timeout == 0) { 6351 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6352 6353 bdev_io_submit_reset(bdev_io); 6354 return; 6355 } 6356 6357 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6358 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6359 6360 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6361 * submit the reset to the underlying module only if outstanding I/O 6362 * remain after reset_io_drain_timeout seconds have passed. */ 6363 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6364 bdev_reset_check_outstanding_io_done); 6365 } 6366 6367 static void 6368 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6369 struct spdk_io_channel *ch, void *_ctx) 6370 { 6371 struct spdk_bdev_channel *channel; 6372 struct spdk_bdev_mgmt_channel *mgmt_channel; 6373 struct spdk_bdev_shared_resource *shared_resource; 6374 bdev_io_tailq_t tmp_queued; 6375 6376 TAILQ_INIT(&tmp_queued); 6377 6378 channel = __io_ch_to_bdev_ch(ch); 6379 shared_resource = channel->shared_resource; 6380 mgmt_channel = shared_resource->mgmt_ch; 6381 6382 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6383 6384 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6385 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6386 } 6387 6388 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6389 bdev_abort_all_buf_io(mgmt_channel, channel); 6390 bdev_abort_all_queued_io(&tmp_queued, channel); 6391 6392 spdk_bdev_for_each_channel_continue(i, 0); 6393 } 6394 6395 static void 6396 bdev_start_reset(void *ctx) 6397 { 6398 struct spdk_bdev_channel *ch = ctx; 6399 6400 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6401 bdev_reset_freeze_channel_done); 6402 } 6403 6404 static void 6405 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6406 { 6407 struct spdk_bdev *bdev = ch->bdev; 6408 6409 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6410 6411 spdk_spin_lock(&bdev->internal.spinlock); 6412 if (bdev->internal.reset_in_progress == NULL) { 6413 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6414 /* 6415 * Take a channel reference for the target bdev for the life of this 6416 * reset. This guards against the channel getting destroyed while 6417 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6418 * progress. We will release the reference when this reset is 6419 * completed. 
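		 * The reference is dropped in bdev_reset_complete().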
6420 */ 6421 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6422 bdev_start_reset(ch); 6423 } 6424 spdk_spin_unlock(&bdev->internal.spinlock); 6425 } 6426 6427 int 6428 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6429 spdk_bdev_io_completion_cb cb, void *cb_arg) 6430 { 6431 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6432 struct spdk_bdev_io *bdev_io; 6433 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6434 6435 bdev_io = bdev_channel_get_io(channel); 6436 if (!bdev_io) { 6437 return -ENOMEM; 6438 } 6439 6440 bdev_io->internal.ch = channel; 6441 bdev_io->internal.desc = desc; 6442 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6443 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6444 bdev_io->u.reset.ch_ref = NULL; 6445 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6446 6447 spdk_spin_lock(&bdev->internal.spinlock); 6448 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6449 spdk_spin_unlock(&bdev->internal.spinlock); 6450 6451 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6452 internal.ch_link); 6453 6454 bdev_channel_start_reset(channel); 6455 6456 return 0; 6457 } 6458 6459 void 6460 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6461 struct spdk_bdev_io_stat *stat) 6462 { 6463 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6464 6465 bdev_get_io_stat(stat, channel->stat); 6466 } 6467 6468 static void 6469 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6470 { 6471 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6472 6473 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6474 bdev_iostat_ctx->cb_arg, 0); 6475 free(bdev_iostat_ctx); 6476 } 6477 6478 static void 6479 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6480 struct spdk_io_channel *ch, void *_ctx) 6481 { 6482 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6483 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6484 6485 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6486 spdk_bdev_for_each_channel_continue(i, 0); 6487 } 6488 6489 void 6490 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6491 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6492 { 6493 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6494 6495 assert(bdev != NULL); 6496 assert(stat != NULL); 6497 assert(cb != NULL); 6498 6499 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6500 if (bdev_iostat_ctx == NULL) { 6501 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6502 cb(bdev, stat, cb_arg, -ENOMEM); 6503 return; 6504 } 6505 6506 bdev_iostat_ctx->stat = stat; 6507 bdev_iostat_ctx->cb = cb; 6508 bdev_iostat_ctx->cb_arg = cb_arg; 6509 6510 /* Start with the statistics from previously deleted channels. */ 6511 spdk_spin_lock(&bdev->internal.spinlock); 6512 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6513 spdk_spin_unlock(&bdev->internal.spinlock); 6514 6515 /* Then iterate and add the statistics from each existing channel. 
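	 * The caller's callback is invoked from bdev_get_device_stat_done() once every channel has been visited.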
*/ 6516 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6517 bdev_get_device_stat_done); 6518 } 6519 6520 struct bdev_iostat_reset_ctx { 6521 enum spdk_bdev_reset_stat_mode mode; 6522 bdev_reset_device_stat_cb cb; 6523 void *cb_arg; 6524 }; 6525 6526 static void 6527 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6528 { 6529 struct bdev_iostat_reset_ctx *ctx = _ctx; 6530 6531 ctx->cb(bdev, ctx->cb_arg, 0); 6532 6533 free(ctx); 6534 } 6535 6536 static void 6537 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6538 struct spdk_io_channel *ch, void *_ctx) 6539 { 6540 struct bdev_iostat_reset_ctx *ctx = _ctx; 6541 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6542 6543 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6544 6545 spdk_bdev_for_each_channel_continue(i, 0); 6546 } 6547 6548 void 6549 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6550 bdev_reset_device_stat_cb cb, void *cb_arg) 6551 { 6552 struct bdev_iostat_reset_ctx *ctx; 6553 6554 assert(bdev != NULL); 6555 assert(cb != NULL); 6556 6557 ctx = calloc(1, sizeof(*ctx)); 6558 if (ctx == NULL) { 6559 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6560 cb(bdev, cb_arg, -ENOMEM); 6561 return; 6562 } 6563 6564 ctx->mode = mode; 6565 ctx->cb = cb; 6566 ctx->cb_arg = cb_arg; 6567 6568 spdk_spin_lock(&bdev->internal.spinlock); 6569 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6570 spdk_spin_unlock(&bdev->internal.spinlock); 6571 6572 spdk_bdev_for_each_channel(bdev, 6573 bdev_reset_each_channel_stat, 6574 ctx, 6575 bdev_reset_device_stat_done); 6576 } 6577 6578 int 6579 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6580 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6581 spdk_bdev_io_completion_cb cb, void *cb_arg) 6582 { 6583 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6584 struct spdk_bdev_io *bdev_io; 6585 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6586 6587 if (!desc->write) { 6588 return -EBADF; 6589 } 6590 6591 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6592 return -ENOTSUP; 6593 } 6594 6595 bdev_io = bdev_channel_get_io(channel); 6596 if (!bdev_io) { 6597 return -ENOMEM; 6598 } 6599 6600 bdev_io->internal.ch = channel; 6601 bdev_io->internal.desc = desc; 6602 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6603 bdev_io->u.nvme_passthru.cmd = *cmd; 6604 bdev_io->u.nvme_passthru.buf = buf; 6605 bdev_io->u.nvme_passthru.nbytes = nbytes; 6606 bdev_io->u.nvme_passthru.md_buf = NULL; 6607 bdev_io->u.nvme_passthru.md_len = 0; 6608 6609 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6610 6611 bdev_io_submit(bdev_io); 6612 return 0; 6613 } 6614 6615 int 6616 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6617 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6618 spdk_bdev_io_completion_cb cb, void *cb_arg) 6619 { 6620 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6621 struct spdk_bdev_io *bdev_io; 6622 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6623 6624 if (!desc->write) { 6625 /* 6626 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6627 * to easily determine if the command is a read or write, but for now just 6628 * do not allow io_passthru with a read-only descriptor. 
6629 */ 6630 return -EBADF; 6631 } 6632 6633 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6634 return -ENOTSUP; 6635 } 6636 6637 bdev_io = bdev_channel_get_io(channel); 6638 if (!bdev_io) { 6639 return -ENOMEM; 6640 } 6641 6642 bdev_io->internal.ch = channel; 6643 bdev_io->internal.desc = desc; 6644 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6645 bdev_io->u.nvme_passthru.cmd = *cmd; 6646 bdev_io->u.nvme_passthru.buf = buf; 6647 bdev_io->u.nvme_passthru.nbytes = nbytes; 6648 bdev_io->u.nvme_passthru.md_buf = NULL; 6649 bdev_io->u.nvme_passthru.md_len = 0; 6650 6651 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6652 6653 bdev_io_submit(bdev_io); 6654 return 0; 6655 } 6656 6657 int 6658 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6659 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6660 spdk_bdev_io_completion_cb cb, void *cb_arg) 6661 { 6662 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6663 struct spdk_bdev_io *bdev_io; 6664 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6665 6666 if (!desc->write) { 6667 /* 6668 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6669 * to easily determine if the command is a read or write, but for now just 6670 * do not allow io_passthru with a read-only descriptor. 6671 */ 6672 return -EBADF; 6673 } 6674 6675 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6676 return -ENOTSUP; 6677 } 6678 6679 bdev_io = bdev_channel_get_io(channel); 6680 if (!bdev_io) { 6681 return -ENOMEM; 6682 } 6683 6684 bdev_io->internal.ch = channel; 6685 bdev_io->internal.desc = desc; 6686 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6687 bdev_io->u.nvme_passthru.cmd = *cmd; 6688 bdev_io->u.nvme_passthru.buf = buf; 6689 bdev_io->u.nvme_passthru.nbytes = nbytes; 6690 bdev_io->u.nvme_passthru.md_buf = md_buf; 6691 bdev_io->u.nvme_passthru.md_len = md_len; 6692 6693 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6694 6695 bdev_io_submit(bdev_io); 6696 return 0; 6697 } 6698 6699 int 6700 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 6701 struct spdk_io_channel *ch, 6702 const struct spdk_nvme_cmd *cmd, 6703 struct iovec *iov, int iovcnt, size_t nbytes, 6704 void *md_buf, size_t md_len, 6705 spdk_bdev_io_completion_cb cb, void *cb_arg) 6706 { 6707 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6708 struct spdk_bdev_io *bdev_io; 6709 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6710 6711 if (!desc->write) { 6712 /* 6713 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6714 * to easily determine if the command is a read or write, but for now just 6715 * do not allow io_passthru with a read-only descriptor. 
6716 */ 6717 return -EBADF; 6718 } 6719 6720 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6721 return -ENOTSUP; 6722 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6723 return -ENOTSUP; 6724 } 6725 6726 bdev_io = bdev_channel_get_io(channel); 6727 if (!bdev_io) { 6728 return -ENOMEM; 6729 } 6730 6731 bdev_io->internal.ch = channel; 6732 bdev_io->internal.desc = desc; 6733 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 6734 bdev_io->u.nvme_passthru.cmd = *cmd; 6735 bdev_io->u.nvme_passthru.iovs = iov; 6736 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 6737 bdev_io->u.nvme_passthru.nbytes = nbytes; 6738 bdev_io->u.nvme_passthru.md_buf = md_buf; 6739 bdev_io->u.nvme_passthru.md_len = md_len; 6740 6741 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6742 6743 bdev_io_submit(bdev_io); 6744 return 0; 6745 } 6746 6747 static void bdev_abort_retry(void *ctx); 6748 static void bdev_abort(struct spdk_bdev_io *parent_io); 6749 6750 static void 6751 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6752 { 6753 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6754 struct spdk_bdev_io *parent_io = cb_arg; 6755 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6756 6757 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6758 6759 spdk_bdev_free_io(bdev_io); 6760 6761 if (!success) { 6762 /* Check if the target I/O completed in the meantime. */ 6763 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6764 if (tmp_io == bio_to_abort) { 6765 break; 6766 } 6767 } 6768 6769 /* If the target I/O still exists, set the parent to failed. */ 6770 if (tmp_io != NULL) { 6771 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6772 } 6773 } 6774 6775 parent_io->u.bdev.split_outstanding--; 6776 if (parent_io->u.bdev.split_outstanding == 0) { 6777 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6778 bdev_abort_retry(parent_io); 6779 } else { 6780 bdev_io_complete(parent_io); 6781 } 6782 } 6783 } 6784 6785 static int 6786 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6787 struct spdk_bdev_io *bio_to_abort, 6788 spdk_bdev_io_completion_cb cb, void *cb_arg) 6789 { 6790 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6791 struct spdk_bdev_io *bdev_io; 6792 6793 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6794 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6795 /* TODO: Abort reset or abort request. */ 6796 return -ENOTSUP; 6797 } 6798 6799 bdev_io = bdev_channel_get_io(channel); 6800 if (bdev_io == NULL) { 6801 return -ENOMEM; 6802 } 6803 6804 bdev_io->internal.ch = channel; 6805 bdev_io->internal.desc = desc; 6806 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6807 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6808 6809 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6810 assert(bdev_io_should_split(bio_to_abort)); 6811 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6812 6813 /* Parent abort request is not submitted directly, but to manage its 6814 * execution add it to the submitted list here. 6815 */ 6816 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6817 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6818 6819 bdev_abort(bdev_io); 6820 6821 return 0; 6822 } 6823 6824 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6825 6826 /* Submit the abort request to the underlying bdev module. 
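	 * Completion is reported through the callback registered via bdev_io_init() above.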
*/ bdev_io_submit(bdev_io); 6828 6829 return 0; 6830 } 6831 6832 static bool 6833 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6834 { 6835 struct spdk_bdev_io *iter; 6836 6837 TAILQ_FOREACH(iter, tailq, internal.link) { 6838 if (iter == bdev_io) { 6839 return true; 6840 } 6841 } 6842 6843 return false; 6844 } 6845 6846 static uint32_t 6847 _bdev_abort(struct spdk_bdev_io *parent_io) 6848 { 6849 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6850 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6851 void *bio_cb_arg; 6852 struct spdk_bdev_io *bio_to_abort; 6853 uint32_t matched_ios; 6854 int rc; 6855 6856 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6857 6858 /* matched_ios is returned and will be kept by the caller. 6859 * 6860 * This function is used for two cases: 1) the same cb_arg is used for 6861 * multiple I/Os, and 2) a single large I/O is split into smaller ones. 6862 * Incrementing split_outstanding directly here may confuse readers, especially 6863 * for the 1st case. 6864 * 6865 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6866 * works as expected. 6867 */ 6868 matched_ios = 0; 6869 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6870 6871 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6872 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6873 continue; 6874 } 6875 6876 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6877 /* Any I/O which was submitted after this abort command should be excluded. */ 6878 continue; 6879 } 6880 6881 /* We can't abort a request that's being pushed/pulled or executed by accel */ 6882 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 6883 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 6884 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6885 break; 6886 } 6887 6888 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6889 if (rc != 0) { 6890 if (rc == -ENOMEM) { 6891 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6892 } else { 6893 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6894 } 6895 break; 6896 } 6897 matched_ios++; 6898 } 6899 6900 return matched_ios; 6901 } 6902 6903 static void 6904 bdev_abort_retry(void *ctx) 6905 { 6906 struct spdk_bdev_io *parent_io = ctx; 6907 uint32_t matched_ios; 6908 6909 matched_ios = _bdev_abort(parent_io); 6910 6911 if (matched_ios == 0) { 6912 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6913 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6914 } else { 6915 /* For retry, the case that no target I/O was found is success 6916 * because it means the target I/Os completed in the meantime. 6917 */ 6918 bdev_io_complete(parent_io); 6919 } 6920 return; 6921 } 6922 6923 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6924 parent_io->u.bdev.split_outstanding = matched_ios; 6925 } 6926 6927 static void 6928 bdev_abort(struct spdk_bdev_io *parent_io) 6929 { 6930 uint32_t matched_ios; 6931 6932 matched_ios = _bdev_abort(parent_io); 6933 6934 if (matched_ios == 0) { 6935 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6936 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6937 } else { 6938 /* The case where no target I/O was found is a failure.
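			 * Unlike the retry path in bdev_abort_retry(), nothing matched on this first attempt, so the abort is completed as failed.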
*/ 6939 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6940 bdev_io_complete(parent_io); 6941 } 6942 return; 6943 } 6944 6945 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6946 parent_io->u.bdev.split_outstanding = matched_ios; 6947 } 6948 6949 int 6950 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6951 void *bio_cb_arg, 6952 spdk_bdev_io_completion_cb cb, void *cb_arg) 6953 { 6954 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6955 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6956 struct spdk_bdev_io *bdev_io; 6957 6958 if (bio_cb_arg == NULL) { 6959 return -EINVAL; 6960 } 6961 6962 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6963 return -ENOTSUP; 6964 } 6965 6966 bdev_io = bdev_channel_get_io(channel); 6967 if (bdev_io == NULL) { 6968 return -ENOMEM; 6969 } 6970 6971 bdev_io->internal.ch = channel; 6972 bdev_io->internal.desc = desc; 6973 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6974 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6975 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6976 6977 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6978 6979 /* Parent abort request is not submitted directly, but to manage its execution, 6980 * add it to the submitted list here. 6981 */ 6982 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6983 6984 bdev_abort(bdev_io); 6985 6986 return 0; 6987 } 6988 6989 int 6990 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6991 struct spdk_bdev_io_wait_entry *entry) 6992 { 6993 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6994 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6995 6996 if (bdev != entry->bdev) { 6997 SPDK_ERRLOG("bdevs do not match\n"); 6998 return -EINVAL; 6999 } 7000 7001 if (mgmt_ch->per_thread_cache_count > 0) { 7002 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7003 return -EINVAL; 7004 } 7005 7006 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7007 return 0; 7008 } 7009 7010 static inline void 7011 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7012 { 7013 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7014 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7015 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7016 uint32_t blocklen = bdev_io->bdev->blocklen; 7017 7018 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7019 switch (bdev_io->type) { 7020 case SPDK_BDEV_IO_TYPE_READ: 7021 io_stat->bytes_read += num_blocks * blocklen; 7022 io_stat->num_read_ops++; 7023 io_stat->read_latency_ticks += tsc_diff; 7024 if (io_stat->max_read_latency_ticks < tsc_diff) { 7025 io_stat->max_read_latency_ticks = tsc_diff; 7026 } 7027 if (io_stat->min_read_latency_ticks > tsc_diff) { 7028 io_stat->min_read_latency_ticks = tsc_diff; 7029 } 7030 break; 7031 case SPDK_BDEV_IO_TYPE_WRITE: 7032 io_stat->bytes_written += num_blocks * blocklen; 7033 io_stat->num_write_ops++; 7034 io_stat->write_latency_ticks += tsc_diff; 7035 if (io_stat->max_write_latency_ticks < tsc_diff) { 7036 io_stat->max_write_latency_ticks = tsc_diff; 7037 } 7038 if (io_stat->min_write_latency_ticks > tsc_diff) { 7039 io_stat->min_write_latency_ticks = tsc_diff; 7040 } 7041 break; 7042 case SPDK_BDEV_IO_TYPE_UNMAP: 7043 io_stat->bytes_unmapped += num_blocks * blocklen; 7044 io_stat->num_unmap_ops++; 7045 io_stat->unmap_latency_ticks += tsc_diff; 7046 if 
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7047 io_stat->max_unmap_latency_ticks = tsc_diff; 7048 } 7049 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7050 io_stat->min_unmap_latency_ticks = tsc_diff; 7051 } 7052 break; 7053 case SPDK_BDEV_IO_TYPE_ZCOPY: 7054 /* Track the data in the start phase only */ 7055 if (bdev_io->u.bdev.zcopy.start) { 7056 if (bdev_io->u.bdev.zcopy.populate) { 7057 io_stat->bytes_read += num_blocks * blocklen; 7058 io_stat->num_read_ops++; 7059 io_stat->read_latency_ticks += tsc_diff; 7060 if (io_stat->max_read_latency_ticks < tsc_diff) { 7061 io_stat->max_read_latency_ticks = tsc_diff; 7062 } 7063 if (io_stat->min_read_latency_ticks > tsc_diff) { 7064 io_stat->min_read_latency_ticks = tsc_diff; 7065 } 7066 } else { 7067 io_stat->bytes_written += num_blocks * blocklen; 7068 io_stat->num_write_ops++; 7069 io_stat->write_latency_ticks += tsc_diff; 7070 if (io_stat->max_write_latency_ticks < tsc_diff) { 7071 io_stat->max_write_latency_ticks = tsc_diff; 7072 } 7073 if (io_stat->min_write_latency_ticks > tsc_diff) { 7074 io_stat->min_write_latency_ticks = tsc_diff; 7075 } 7076 } 7077 } 7078 break; 7079 case SPDK_BDEV_IO_TYPE_COPY: 7080 io_stat->bytes_copied += num_blocks * blocklen; 7081 io_stat->num_copy_ops++; 7082 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7083 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7084 io_stat->max_copy_latency_ticks = tsc_diff; 7085 } 7086 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7087 io_stat->min_copy_latency_ticks = tsc_diff; 7088 } 7089 break; 7090 default: 7091 break; 7092 } 7093 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7094 io_stat = bdev_io->bdev->internal.stat; 7095 assert(io_stat->io_error != NULL); 7096 7097 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7098 io_stat->io_error->error_status[-io_status - 1]++; 7099 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7100 } 7101 7102 #ifdef SPDK_CONFIG_VTUNE 7103 uint64_t now_tsc = spdk_get_ticks(); 7104 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7105 uint64_t data[5]; 7106 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7107 7108 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7109 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7110 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7111 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7112 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
7113 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7114 7115 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7116 __itt_metadata_u64, 5, data); 7117 7118 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7119 bdev_io->internal.ch->start_tsc = now_tsc; 7120 } 7121 #endif 7122 } 7123 7124 static inline void 7125 _bdev_io_complete(void *ctx) 7126 { 7127 struct spdk_bdev_io *bdev_io = ctx; 7128 7129 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 7130 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7131 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7132 } 7133 7134 assert(bdev_io->internal.cb != NULL); 7135 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7136 7137 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7138 bdev_io->internal.caller_ctx); 7139 } 7140 7141 static inline void 7142 bdev_io_complete(void *ctx) 7143 { 7144 struct spdk_bdev_io *bdev_io = ctx; 7145 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7146 uint64_t tsc, tsc_diff; 7147 7148 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 7149 /* 7150 * Defer completion to avoid potential infinite recursion if the 7151 * user's completion callback issues a new I/O. 7152 */ 7153 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7154 bdev_io_complete, bdev_io); 7155 return; 7156 } 7157 7158 tsc = spdk_get_ticks(); 7159 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7160 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 7161 bdev_io->internal.caller_ctx); 7162 7163 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 7164 7165 if (bdev_io->internal.ch->histogram) { 7166 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 7167 } 7168 7169 bdev_io_update_io_stat(bdev_io, tsc_diff); 7170 _bdev_io_complete(bdev_io); 7171 } 7172 7173 /* The difference between this function and bdev_io_complete() is that this should be called to 7174 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7175 * io_submitted list and don't have submit_tsc updated. 7176 */ 7177 static inline void 7178 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7179 { 7180 /* Since the IO hasn't been submitted it's bound to be failed */ 7181 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7182 7183 /* At this point we don't know if the IO is completed from submission context or not, but, 7184 * since this is an error path, we can always do an spdk_thread_send_msg(). 
*/ 7185 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7186 _bdev_io_complete, bdev_io); 7187 } 7188 7189 static void bdev_destroy_cb(void *io_device); 7190 7191 static void 7192 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7193 { 7194 struct spdk_bdev_io *bdev_io = _ctx; 7195 7196 if (bdev_io->u.reset.ch_ref != NULL) { 7197 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7198 bdev_io->u.reset.ch_ref = NULL; 7199 } 7200 7201 bdev_io_complete(bdev_io); 7202 7203 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7204 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7205 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7206 } 7207 } 7208 7209 static void 7210 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7211 struct spdk_io_channel *_ch, void *_ctx) 7212 { 7213 struct spdk_bdev_io *bdev_io = _ctx; 7214 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7215 struct spdk_bdev_io *queued_reset; 7216 7217 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7218 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7219 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7220 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7221 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7222 } 7223 7224 spdk_bdev_for_each_channel_continue(i, 0); 7225 } 7226 7227 static void 7228 bdev_io_complete_sequence_cb(void *ctx, int status) 7229 { 7230 struct spdk_bdev_io *bdev_io = ctx; 7231 7232 /* u.bdev.accel_sequence should have already been cleared at this point */ 7233 assert(bdev_io->u.bdev.accel_sequence == NULL); 7234 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7235 bdev_io->internal.accel_sequence = NULL; 7236 7237 if (spdk_unlikely(status != 0)) { 7238 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7239 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7240 } 7241 7242 bdev_io_complete(bdev_io); 7243 } 7244 7245 void 7246 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7247 { 7248 struct spdk_bdev *bdev = bdev_io->bdev; 7249 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7250 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7251 7252 if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) { 7253 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7254 spdk_bdev_get_module_name(bdev), 7255 bdev_io_status_get_string(bdev_io->internal.status)); 7256 assert(false); 7257 } 7258 bdev_io->internal.status = status; 7259 7260 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7261 bool unlock_channels = false; 7262 7263 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7264 SPDK_ERRLOG("NOMEM returned for reset\n"); 7265 } 7266 spdk_spin_lock(&bdev->internal.spinlock); 7267 if (bdev_io == bdev->internal.reset_in_progress) { 7268 bdev->internal.reset_in_progress = NULL; 7269 unlock_channels = true; 7270 } 7271 spdk_spin_unlock(&bdev->internal.spinlock); 7272 7273 if (unlock_channels) { 7274 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7275 bdev_reset_complete); 7276 return; 7277 } 7278 } else { 7279 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7280 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7281 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7282 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7283 return; 7284 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0 && 7285 
!bdev_io_use_accel_sequence(bdev_io))) { 7286 _bdev_io_push_bounce_data_buffer(bdev_io, 7287 _bdev_io_complete_push_bounce_done); 7288 /* bdev IO will be completed in the callback */ 7289 return; 7290 } 7291 } 7292 7293 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7294 return; 7295 } 7296 } 7297 7298 bdev_io_complete(bdev_io); 7299 } 7300 7301 void 7302 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7303 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7304 { 7305 enum spdk_bdev_io_status status; 7306 7307 if (sc == SPDK_SCSI_STATUS_GOOD) { 7308 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7309 } else { 7310 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7311 bdev_io->internal.error.scsi.sc = sc; 7312 bdev_io->internal.error.scsi.sk = sk; 7313 bdev_io->internal.error.scsi.asc = asc; 7314 bdev_io->internal.error.scsi.ascq = ascq; 7315 } 7316 7317 spdk_bdev_io_complete(bdev_io, status); 7318 } 7319 7320 void 7321 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7322 int *sc, int *sk, int *asc, int *ascq) 7323 { 7324 assert(sc != NULL); 7325 assert(sk != NULL); 7326 assert(asc != NULL); 7327 assert(ascq != NULL); 7328 7329 switch (bdev_io->internal.status) { 7330 case SPDK_BDEV_IO_STATUS_SUCCESS: 7331 *sc = SPDK_SCSI_STATUS_GOOD; 7332 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7333 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7334 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7335 break; 7336 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7337 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7338 break; 7339 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7340 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7341 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7342 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7343 *ascq = bdev_io->internal.error.scsi.ascq; 7344 break; 7345 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7346 *sc = bdev_io->internal.error.scsi.sc; 7347 *sk = bdev_io->internal.error.scsi.sk; 7348 *asc = bdev_io->internal.error.scsi.asc; 7349 *ascq = bdev_io->internal.error.scsi.ascq; 7350 break; 7351 default: 7352 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7353 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7354 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7355 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7356 break; 7357 } 7358 } 7359 7360 void 7361 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7362 { 7363 enum spdk_bdev_io_status status; 7364 7365 if (aio_result == 0) { 7366 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7367 } else { 7368 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7369 } 7370 7371 bdev_io->internal.error.aio_result = aio_result; 7372 7373 spdk_bdev_io_complete(bdev_io, status); 7374 } 7375 7376 void 7377 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7378 { 7379 assert(aio_result != NULL); 7380 7381 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7382 *aio_result = bdev_io->internal.error.aio_result; 7383 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7384 *aio_result = 0; 7385 } else { 7386 *aio_result = -EIO; 7387 } 7388 } 7389 7390 void 7391 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7392 { 7393 enum spdk_bdev_io_status status; 7394 7395 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 7396 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7397 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7398 status = SPDK_BDEV_IO_STATUS_ABORTED; 7399 } else { 
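		/* Any other NVMe status is reported as a generic NVMe error. */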
7400 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7401 } 7402 7403 bdev_io->internal.error.nvme.cdw0 = cdw0; 7404 bdev_io->internal.error.nvme.sct = sct; 7405 bdev_io->internal.error.nvme.sc = sc; 7406 7407 spdk_bdev_io_complete(bdev_io, status); 7408 } 7409 7410 void 7411 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7412 { 7413 assert(sct != NULL); 7414 assert(sc != NULL); 7415 assert(cdw0 != NULL); 7416 7417 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7418 *sct = SPDK_NVME_SCT_GENERIC; 7419 *sc = SPDK_NVME_SC_SUCCESS; 7420 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7421 *cdw0 = 0; 7422 } else { 7423 *cdw0 = 1U; 7424 } 7425 return; 7426 } 7427 7428 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7429 *sct = bdev_io->internal.error.nvme.sct; 7430 *sc = bdev_io->internal.error.nvme.sc; 7431 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7432 *sct = SPDK_NVME_SCT_GENERIC; 7433 *sc = SPDK_NVME_SC_SUCCESS; 7434 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7435 *sct = SPDK_NVME_SCT_GENERIC; 7436 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7437 } else { 7438 *sct = SPDK_NVME_SCT_GENERIC; 7439 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7440 } 7441 7442 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7443 } 7444 7445 void 7446 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7447 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7448 { 7449 assert(first_sct != NULL); 7450 assert(first_sc != NULL); 7451 assert(second_sct != NULL); 7452 assert(second_sc != NULL); 7453 assert(cdw0 != NULL); 7454 7455 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7456 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7457 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7458 *first_sct = bdev_io->internal.error.nvme.sct; 7459 *first_sc = bdev_io->internal.error.nvme.sc; 7460 *second_sct = SPDK_NVME_SCT_GENERIC; 7461 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7462 } else { 7463 *first_sct = SPDK_NVME_SCT_GENERIC; 7464 *first_sc = SPDK_NVME_SC_SUCCESS; 7465 *second_sct = bdev_io->internal.error.nvme.sct; 7466 *second_sc = bdev_io->internal.error.nvme.sc; 7467 } 7468 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7469 *first_sct = SPDK_NVME_SCT_GENERIC; 7470 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7471 *second_sct = SPDK_NVME_SCT_GENERIC; 7472 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7473 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7474 *first_sct = SPDK_NVME_SCT_GENERIC; 7475 *first_sc = SPDK_NVME_SC_SUCCESS; 7476 *second_sct = SPDK_NVME_SCT_GENERIC; 7477 *second_sc = SPDK_NVME_SC_SUCCESS; 7478 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7479 *first_sct = SPDK_NVME_SCT_GENERIC; 7480 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7481 *second_sct = SPDK_NVME_SCT_GENERIC; 7482 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7483 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7484 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7485 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7486 *second_sct = SPDK_NVME_SCT_GENERIC; 7487 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7488 } else { 7489 *first_sct = SPDK_NVME_SCT_GENERIC; 7490 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7491 *second_sct = SPDK_NVME_SCT_GENERIC; 7492 *second_sc = 
SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7493 } 7494 7495 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7496 } 7497 7498 struct spdk_thread * 7499 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7500 { 7501 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7502 } 7503 7504 struct spdk_io_channel * 7505 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7506 { 7507 return bdev_io->internal.ch->channel; 7508 } 7509 7510 static int 7511 bdev_register(struct spdk_bdev *bdev) 7512 { 7513 char *bdev_name; 7514 char uuid[SPDK_UUID_STRING_LEN]; 7515 struct spdk_iobuf_opts iobuf_opts; 7516 int ret, i; 7517 7518 assert(bdev->module != NULL); 7519 7520 if (!bdev->name) { 7521 SPDK_ERRLOG("Bdev name is NULL\n"); 7522 return -EINVAL; 7523 } 7524 7525 if (!strlen(bdev->name)) { 7526 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7527 return -EINVAL; 7528 } 7529 7530 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7531 if (bdev->fn_table->accel_sequence_supported == NULL) { 7532 continue; 7533 } 7534 if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7535 (enum spdk_bdev_io_type)i)) { 7536 continue; 7537 } 7538 7539 if (spdk_bdev_is_md_separate(bdev)) { 7540 SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with " 7541 "accel sequence support\n"); 7542 return -EINVAL; 7543 } 7544 } 7545 7546 /* Users often register their own I/O devices using the bdev name. In 7547 * order to avoid conflicts, prepend bdev_. */ 7548 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7549 if (!bdev_name) { 7550 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7551 return -ENOMEM; 7552 } 7553 7554 bdev->internal.stat = bdev_alloc_io_stat(true); 7555 if (!bdev->internal.stat) { 7556 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7557 free(bdev_name); 7558 return -ENOMEM; 7559 } 7560 7561 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7562 bdev->internal.measured_queue_depth = UINT64_MAX; 7563 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7564 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7565 bdev->internal.qd_poller = NULL; 7566 bdev->internal.qos = NULL; 7567 7568 TAILQ_INIT(&bdev->internal.open_descs); 7569 TAILQ_INIT(&bdev->internal.locked_ranges); 7570 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7571 TAILQ_INIT(&bdev->aliases); 7572 7573 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7574 if (ret != 0) { 7575 bdev_free_io_stat(bdev->internal.stat); 7576 free(bdev_name); 7577 return ret; 7578 } 7579 7580 /* UUID may be specified by the user or defined by bdev itself. 7581 * Otherwise it will be generated here, so this field will never be empty. */ 7582 if (spdk_uuid_is_null(&bdev->uuid)) { 7583 spdk_uuid_generate(&bdev->uuid); 7584 } 7585 7586 /* Add the UUID alias only if it's different than the name */ 7587 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7588 if (strcmp(bdev->name, uuid) != 0) { 7589 ret = spdk_bdev_alias_add(bdev, uuid); 7590 if (ret != 0) { 7591 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7592 bdev_name_del(&bdev->internal.bdev_name); 7593 bdev_free_io_stat(bdev->internal.stat); 7594 free(bdev_name); 7595 return ret; 7596 } 7597 } 7598 7599 spdk_iobuf_get_opts(&iobuf_opts); 7600 if (spdk_bdev_get_buf_align(bdev) > 1) { 7601 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? 
bdev->max_rw_size : UINT32_MAX, 7602 iobuf_opts.large_bufsize / bdev->blocklen); 7603 } 7604 7605 /* If the user didn't specify a write unit size, set it to one. */ 7606 if (bdev->write_unit_size == 0) { 7607 bdev->write_unit_size = 1; 7608 } 7609 7610 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7611 if (bdev->acwu == 0) { 7612 bdev->acwu = bdev->write_unit_size; 7613 } 7614 7615 if (bdev->phys_blocklen == 0) { 7616 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7617 } 7618 7619 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7620 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7621 } 7622 7623 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7624 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7625 } 7626 7627 bdev->internal.reset_in_progress = NULL; 7628 bdev->internal.qd_poll_in_progress = false; 7629 bdev->internal.period = 0; 7630 bdev->internal.new_period = 0; 7631 7632 spdk_io_device_register(__bdev_to_io_dev(bdev), 7633 bdev_channel_create, bdev_channel_destroy, 7634 sizeof(struct spdk_bdev_channel), 7635 bdev_name); 7636 7637 free(bdev_name); 7638 7639 spdk_spin_init(&bdev->internal.spinlock); 7640 7641 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7642 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7643 7644 return 0; 7645 } 7646 7647 static void 7648 bdev_destroy_cb(void *io_device) 7649 { 7650 int rc; 7651 struct spdk_bdev *bdev; 7652 spdk_bdev_unregister_cb cb_fn; 7653 void *cb_arg; 7654 7655 bdev = __bdev_from_io_dev(io_device); 7656 7657 if (bdev->internal.unregister_td != spdk_get_thread()) { 7658 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7659 return; 7660 } 7661 7662 cb_fn = bdev->internal.unregister_cb; 7663 cb_arg = bdev->internal.unregister_ctx; 7664 7665 spdk_spin_destroy(&bdev->internal.spinlock); 7666 free(bdev->internal.qos); 7667 bdev_free_io_stat(bdev->internal.stat); 7668 7669 rc = bdev->fn_table->destruct(bdev->ctxt); 7670 if (rc < 0) { 7671 SPDK_ERRLOG("destruct failed\n"); 7672 } 7673 if (rc <= 0 && cb_fn != NULL) { 7674 cb_fn(cb_arg, rc); 7675 } 7676 } 7677 7678 void 7679 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7680 { 7681 if (bdev->internal.unregister_cb != NULL) { 7682 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7683 } 7684 } 7685 7686 static void 7687 _remove_notify(void *arg) 7688 { 7689 struct spdk_bdev_desc *desc = arg; 7690 7691 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7692 } 7693 7694 /* returns: 0 - bdev removed and ready to be destructed. 7695 * -EBUSY - bdev can't be destructed yet. */ 7696 static int 7697 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7698 { 7699 struct spdk_bdev_desc *desc, *tmp; 7700 int rc = 0; 7701 char uuid[SPDK_UUID_STRING_LEN]; 7702 7703 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7704 assert(spdk_spin_held(&bdev->internal.spinlock)); 7705 7706 /* Notify each descriptor about hotremoval */ 7707 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7708 rc = -EBUSY; 7709 /* 7710 * Defer invocation of the event_cb to a separate message that will 7711 * run later on its thread. This ensures this context unwinds and 7712 * we don't recursively unregister this bdev again if the event_cb 7713 * immediately closes its descriptor. 
7714 */ 7715 event_notify(desc, _remove_notify); 7716 } 7717 7718 /* If there are no descriptors, proceed removing the bdev */ 7719 if (rc == 0) { 7720 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7721 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7722 7723 /* Delete the name and the UUID alias */ 7724 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7725 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7726 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7727 7728 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7729 7730 if (bdev->internal.reset_in_progress != NULL) { 7731 /* If reset is in progress, let the completion callback for reset 7732 * unregister the bdev. 7733 */ 7734 rc = -EBUSY; 7735 } 7736 } 7737 7738 return rc; 7739 } 7740 7741 static void 7742 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7743 struct spdk_io_channel *io_ch, void *_ctx) 7744 { 7745 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7746 7747 bdev_channel_abort_queued_ios(bdev_ch); 7748 spdk_bdev_for_each_channel_continue(i, 0); 7749 } 7750 7751 static void 7752 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7753 { 7754 int rc; 7755 7756 spdk_spin_lock(&g_bdev_mgr.spinlock); 7757 spdk_spin_lock(&bdev->internal.spinlock); 7758 /* 7759 * Set the status to REMOVING after completing to abort channels. Otherwise, 7760 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7761 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7762 * may fail. 7763 */ 7764 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7765 rc = bdev_unregister_unsafe(bdev); 7766 spdk_spin_unlock(&bdev->internal.spinlock); 7767 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7768 7769 if (rc == 0) { 7770 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7771 } 7772 } 7773 7774 void 7775 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7776 { 7777 struct spdk_thread *thread; 7778 7779 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7780 7781 thread = spdk_get_thread(); 7782 if (!thread) { 7783 /* The user called this from a non-SPDK thread. 
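		 * Unregistration has to message SPDK threads, so the best we can
		 * do here is report -ENOTSUP through the callback.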
*/ 7784 if (cb_fn != NULL) { 7785 cb_fn(cb_arg, -ENOTSUP); 7786 } 7787 return; 7788 } 7789 7790 spdk_spin_lock(&g_bdev_mgr.spinlock); 7791 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7792 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7793 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7794 if (cb_fn) { 7795 cb_fn(cb_arg, -EBUSY); 7796 } 7797 return; 7798 } 7799 7800 spdk_spin_lock(&bdev->internal.spinlock); 7801 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7802 bdev->internal.unregister_cb = cb_fn; 7803 bdev->internal.unregister_ctx = cb_arg; 7804 bdev->internal.unregister_td = thread; 7805 spdk_spin_unlock(&bdev->internal.spinlock); 7806 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7807 7808 spdk_bdev_set_qd_sampling_period(bdev, 0); 7809 7810 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7811 bdev_unregister); 7812 } 7813 7814 int 7815 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7816 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7817 { 7818 struct spdk_bdev_desc *desc; 7819 struct spdk_bdev *bdev; 7820 int rc; 7821 7822 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7823 if (rc != 0) { 7824 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7825 return rc; 7826 } 7827 7828 bdev = spdk_bdev_desc_get_bdev(desc); 7829 7830 if (bdev->module != module) { 7831 spdk_bdev_close(desc); 7832 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7833 bdev_name); 7834 return -ENODEV; 7835 } 7836 7837 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7838 7839 spdk_bdev_close(desc); 7840 7841 return 0; 7842 } 7843 7844 static int 7845 bdev_start_qos(struct spdk_bdev *bdev) 7846 { 7847 struct set_qos_limit_ctx *ctx; 7848 7849 /* Enable QoS */ 7850 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7851 ctx = calloc(1, sizeof(*ctx)); 7852 if (ctx == NULL) { 7853 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7854 return -ENOMEM; 7855 } 7856 ctx->bdev = bdev; 7857 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7858 } 7859 7860 return 0; 7861 } 7862 7863 static void 7864 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7865 struct spdk_bdev *bdev) 7866 { 7867 enum spdk_bdev_claim_type type; 7868 const char *typename, *modname; 7869 extern struct spdk_log_flag SPDK_LOG_bdev; 7870 7871 assert(spdk_spin_held(&bdev->internal.spinlock)); 7872 7873 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7874 return; 7875 } 7876 7877 type = bdev->internal.claim_type; 7878 typename = spdk_bdev_claim_get_name(type); 7879 7880 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7881 modname = bdev->internal.claim.v1.module->name; 7882 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7883 bdev->name, detail, typename, modname); 7884 return; 7885 } 7886 7887 if (claim_type_is_v2(type)) { 7888 struct spdk_bdev_module_claim *claim; 7889 7890 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7891 modname = claim->module->name; 7892 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7893 bdev->name, detail, typename, modname); 7894 } 7895 return; 7896 } 7897 7898 assert(false); 7899 } 7900 7901 static int 7902 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7903 { 7904 struct spdk_thread *thread; 7905 int rc = 0; 7906 7907 thread = spdk_get_thread(); 7908 if (!thread) { 7909 
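		/* Descriptors are bound to the SPDK thread that opens them (stored
		 * in desc->thread below and checked again in spdk_bdev_close()), so
		 * an open attempt without an SPDK thread context is rejected.
		 */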
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7910 return -ENOTSUP; 7911 } 7912 7913 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7914 spdk_get_thread()); 7915 7916 desc->bdev = bdev; 7917 desc->thread = thread; 7918 desc->write = write; 7919 7920 spdk_spin_lock(&bdev->internal.spinlock); 7921 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7922 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7923 spdk_spin_unlock(&bdev->internal.spinlock); 7924 return -ENODEV; 7925 } 7926 7927 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7928 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7929 spdk_spin_unlock(&bdev->internal.spinlock); 7930 return -EPERM; 7931 } 7932 7933 rc = bdev_start_qos(bdev); 7934 if (rc != 0) { 7935 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7936 spdk_spin_unlock(&bdev->internal.spinlock); 7937 return rc; 7938 } 7939 7940 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7941 7942 spdk_spin_unlock(&bdev->internal.spinlock); 7943 7944 return 0; 7945 } 7946 7947 static int 7948 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7949 struct spdk_bdev_desc **_desc) 7950 { 7951 struct spdk_bdev_desc *desc; 7952 unsigned int i; 7953 7954 desc = calloc(1, sizeof(*desc)); 7955 if (desc == NULL) { 7956 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7957 return -ENOMEM; 7958 } 7959 7960 TAILQ_INIT(&desc->pending_media_events); 7961 TAILQ_INIT(&desc->free_media_events); 7962 7963 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7964 desc->callback.event_fn = event_cb; 7965 desc->callback.ctx = event_ctx; 7966 spdk_spin_init(&desc->spinlock); 7967 7968 if (bdev->media_events) { 7969 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7970 sizeof(*desc->media_events_buffer)); 7971 if (desc->media_events_buffer == NULL) { 7972 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7973 bdev_desc_free(desc); 7974 return -ENOMEM; 7975 } 7976 7977 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 7978 TAILQ_INSERT_TAIL(&desc->free_media_events, 7979 &desc->media_events_buffer[i], tailq); 7980 } 7981 } 7982 7983 if (bdev->fn_table->accel_sequence_supported != NULL) { 7984 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7985 desc->accel_sequence_supported[i] = 7986 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7987 (enum spdk_bdev_io_type)i); 7988 } 7989 } 7990 7991 *_desc = desc; 7992 7993 return 0; 7994 } 7995 7996 static int 7997 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7998 void *event_ctx, struct spdk_bdev_desc **_desc) 7999 { 8000 struct spdk_bdev_desc *desc; 8001 struct spdk_bdev *bdev; 8002 int rc; 8003 8004 bdev = bdev_get_by_name(bdev_name); 8005 8006 if (bdev == NULL) { 8007 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8008 return -ENODEV; 8009 } 8010 8011 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 8012 if (rc != 0) { 8013 return rc; 8014 } 8015 8016 rc = bdev_open(bdev, write, desc); 8017 if (rc != 0) { 8018 bdev_desc_free(desc); 8019 desc = NULL; 8020 } 8021 8022 *_desc = desc; 8023 8024 return rc; 8025 } 8026 8027 int 8028 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8029 void *event_ctx, struct spdk_bdev_desc **_desc) 8030 { 8031 int rc; 8032 8033 if (event_cb == NULL) { 8034 SPDK_ERRLOG("Missing event callback function\n"); 8035 return 
-EINVAL; 8036 } 8037 8038 spdk_spin_lock(&g_bdev_mgr.spinlock); 8039 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 8040 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8041 8042 return rc; 8043 } 8044 8045 struct spdk_bdev_open_async_ctx { 8046 char *bdev_name; 8047 spdk_bdev_event_cb_t event_cb; 8048 void *event_ctx; 8049 bool write; 8050 int rc; 8051 spdk_bdev_open_async_cb_t cb_fn; 8052 void *cb_arg; 8053 struct spdk_bdev_desc *desc; 8054 struct spdk_bdev_open_async_opts opts; 8055 uint64_t start_ticks; 8056 struct spdk_thread *orig_thread; 8057 struct spdk_poller *poller; 8058 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8059 }; 8060 8061 static void 8062 bdev_open_async_done(void *arg) 8063 { 8064 struct spdk_bdev_open_async_ctx *ctx = arg; 8065 8066 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8067 8068 free(ctx->bdev_name); 8069 free(ctx); 8070 } 8071 8072 static void 8073 bdev_open_async_cancel(void *arg) 8074 { 8075 struct spdk_bdev_open_async_ctx *ctx = arg; 8076 8077 assert(ctx->rc == -ESHUTDOWN); 8078 8079 spdk_poller_unregister(&ctx->poller); 8080 8081 bdev_open_async_done(ctx); 8082 } 8083 8084 /* This is called when the bdev library finishes at shutdown. */ 8085 static void 8086 bdev_open_async_fini(void) 8087 { 8088 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8089 8090 spdk_spin_lock(&g_bdev_mgr.spinlock); 8091 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8092 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8093 /* 8094 * We have to move to ctx->orig_thread to unregister ctx->poller. 8095 * However, there is a chance that ctx->poller is executed before 8096 * message is executed, which could result in bdev_open_async_done() 8097 * being called twice. To avoid such race condition, set ctx->rc to 8098 * -ESHUTDOWN. 8099 */ 8100 ctx->rc = -ESHUTDOWN; 8101 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8102 } 8103 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8104 } 8105 8106 static int bdev_open_async(void *arg); 8107 8108 static void 8109 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8110 { 8111 uint64_t timeout_ticks; 8112 8113 if (ctx->rc == -ESHUTDOWN) { 8114 /* This context is being canceled. Do nothing. */ 8115 return; 8116 } 8117 8118 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8119 &ctx->desc); 8120 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8121 goto exit; 8122 } 8123 8124 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8125 if (spdk_get_ticks() >= timeout_ticks) { 8126 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8127 ctx->rc = -ETIMEDOUT; 8128 goto exit; 8129 } 8130 8131 return; 8132 8133 exit: 8134 spdk_poller_unregister(&ctx->poller); 8135 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8136 8137 /* Completion callback is processed after stack unwinding. 
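	 * A message is sent back to the thread that originally called
	 * spdk_bdev_open_async(), so the user callback never runs while
	 * g_bdev_mgr.spinlock is still held by this path.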
*/ 8138 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8139 } 8140 8141 static int 8142 bdev_open_async(void *arg) 8143 { 8144 struct spdk_bdev_open_async_ctx *ctx = arg; 8145 8146 spdk_spin_lock(&g_bdev_mgr.spinlock); 8147 8148 _bdev_open_async(ctx); 8149 8150 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8151 8152 return SPDK_POLLER_BUSY; 8153 } 8154 8155 static void 8156 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8157 struct spdk_bdev_open_async_opts *opts_src, 8158 size_t size) 8159 { 8160 assert(opts); 8161 assert(opts_src); 8162 8163 opts->size = size; 8164 8165 #define SET_FIELD(field) \ 8166 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8167 opts->field = opts_src->field; \ 8168 } \ 8169 8170 SET_FIELD(timeout_ms); 8171 8172 /* Do not remove this statement, you should always update this statement when you adding a new field, 8173 * and do not forget to add the SET_FIELD statement for your added field. */ 8174 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8175 8176 #undef SET_FIELD 8177 } 8178 8179 static void 8180 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8181 { 8182 assert(opts); 8183 8184 opts->size = size; 8185 8186 #define SET_FIELD(field, value) \ 8187 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8188 opts->field = value; \ 8189 } \ 8190 8191 SET_FIELD(timeout_ms, 0); 8192 8193 #undef SET_FIELD 8194 } 8195 8196 int 8197 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8198 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8199 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8200 { 8201 struct spdk_bdev_open_async_ctx *ctx; 8202 8203 if (event_cb == NULL) { 8204 SPDK_ERRLOG("Missing event callback function\n"); 8205 return -EINVAL; 8206 } 8207 8208 if (open_cb == NULL) { 8209 SPDK_ERRLOG("Missing open callback function\n"); 8210 return -EINVAL; 8211 } 8212 8213 if (opts != NULL && opts->size == 0) { 8214 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8215 return -EINVAL; 8216 } 8217 8218 ctx = calloc(1, sizeof(*ctx)); 8219 if (ctx == NULL) { 8220 SPDK_ERRLOG("Failed to allocate open context\n"); 8221 return -ENOMEM; 8222 } 8223 8224 ctx->bdev_name = strdup(bdev_name); 8225 if (ctx->bdev_name == NULL) { 8226 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8227 free(ctx); 8228 return -ENOMEM; 8229 } 8230 8231 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8232 if (ctx->poller == NULL) { 8233 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8234 free(ctx->bdev_name); 8235 free(ctx); 8236 return -ENOMEM; 8237 } 8238 8239 ctx->cb_fn = open_cb; 8240 ctx->cb_arg = open_cb_arg; 8241 ctx->write = write; 8242 ctx->event_cb = event_cb; 8243 ctx->event_ctx = event_ctx; 8244 ctx->orig_thread = spdk_get_thread(); 8245 ctx->start_ticks = spdk_get_ticks(); 8246 8247 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8248 if (opts != NULL) { 8249 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8250 } 8251 8252 spdk_spin_lock(&g_bdev_mgr.spinlock); 8253 8254 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8255 _bdev_open_async(ctx); 8256 8257 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8258 8259 return 0; 8260 } 8261 8262 static void 8263 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8264 { 8265 int rc; 8266 8267 
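	/* For reference, the close path below is the counterpart of the open
	 * sequence an application typically uses. Illustrative sketch only; the
	 * bdev name and app_event_cb callback are hypothetical, not taken from
	 * this file:
	 *
	 *	struct spdk_bdev_desc *desc;
	 *
	 *	if (spdk_bdev_open_ext("Malloc0", false, app_event_cb, NULL, &desc) == 0) {
	 *		... submit I/O via a channel from spdk_bdev_get_io_channel(desc) ...
	 *		spdk_bdev_close(desc);
	 *	}
	 */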
spdk_spin_lock(&bdev->internal.spinlock); 8268 spdk_spin_lock(&desc->spinlock); 8269 8270 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8271 8272 desc->closed = true; 8273 8274 if (desc->claim != NULL) { 8275 bdev_desc_release_claims(desc); 8276 } 8277 8278 if (0 == desc->refs) { 8279 spdk_spin_unlock(&desc->spinlock); 8280 bdev_desc_free(desc); 8281 } else { 8282 spdk_spin_unlock(&desc->spinlock); 8283 } 8284 8285 /* If no more descriptors, kill QoS channel */ 8286 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8287 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8288 bdev->name, spdk_get_thread()); 8289 8290 if (bdev_qos_destroy(bdev)) { 8291 /* There isn't anything we can do to recover here. Just let the 8292 * old QoS poller keep running. The QoS handling won't change 8293 * cores when the user allocates a new channel, but it won't break. */ 8294 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8295 } 8296 } 8297 8298 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8299 rc = bdev_unregister_unsafe(bdev); 8300 spdk_spin_unlock(&bdev->internal.spinlock); 8301 8302 if (rc == 0) { 8303 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8304 } 8305 } else { 8306 spdk_spin_unlock(&bdev->internal.spinlock); 8307 } 8308 } 8309 8310 void 8311 spdk_bdev_close(struct spdk_bdev_desc *desc) 8312 { 8313 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8314 8315 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8316 spdk_get_thread()); 8317 8318 assert(desc->thread == spdk_get_thread()); 8319 8320 spdk_poller_unregister(&desc->io_timeout_poller); 8321 8322 spdk_spin_lock(&g_bdev_mgr.spinlock); 8323 8324 bdev_close(bdev, desc); 8325 8326 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8327 } 8328 8329 static void 8330 bdev_register_finished(void *arg) 8331 { 8332 struct spdk_bdev_desc *desc = arg; 8333 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8334 8335 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8336 8337 spdk_spin_lock(&g_bdev_mgr.spinlock); 8338 8339 bdev_close(bdev, desc); 8340 8341 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8342 } 8343 8344 int 8345 spdk_bdev_register(struct spdk_bdev *bdev) 8346 { 8347 struct spdk_bdev_desc *desc; 8348 struct spdk_thread *thread = spdk_get_thread(); 8349 int rc; 8350 8351 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8352 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 8353 thread ? 
spdk_thread_get_name(thread) : "null"); 8354 return -EINVAL; 8355 } 8356 8357 rc = bdev_register(bdev); 8358 if (rc != 0) { 8359 return rc; 8360 } 8361 8362 /* A descriptor is opened to prevent bdev deletion during examination */ 8363 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8364 if (rc != 0) { 8365 spdk_bdev_unregister(bdev, NULL, NULL); 8366 return rc; 8367 } 8368 8369 rc = bdev_open(bdev, false, desc); 8370 if (rc != 0) { 8371 bdev_desc_free(desc); 8372 spdk_bdev_unregister(bdev, NULL, NULL); 8373 return rc; 8374 } 8375 8376 /* Examine configuration before initializing I/O */ 8377 bdev_examine(bdev); 8378 8379 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8380 if (rc != 0) { 8381 bdev_close(bdev, desc); 8382 spdk_bdev_unregister(bdev, NULL, NULL); 8383 } 8384 8385 return rc; 8386 } 8387 8388 int 8389 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8390 struct spdk_bdev_module *module) 8391 { 8392 spdk_spin_lock(&bdev->internal.spinlock); 8393 8394 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8395 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8396 spdk_spin_unlock(&bdev->internal.spinlock); 8397 return -EPERM; 8398 } 8399 8400 if (desc && !desc->write) { 8401 desc->write = true; 8402 } 8403 8404 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8405 bdev->internal.claim.v1.module = module; 8406 8407 spdk_spin_unlock(&bdev->internal.spinlock); 8408 return 0; 8409 } 8410 8411 void 8412 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8413 { 8414 spdk_spin_lock(&bdev->internal.spinlock); 8415 8416 assert(bdev->internal.claim.v1.module != NULL); 8417 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8418 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8419 bdev->internal.claim.v1.module = NULL; 8420 8421 spdk_spin_unlock(&bdev->internal.spinlock); 8422 } 8423 8424 /* 8425 * Start claims v2 8426 */ 8427 8428 const char * 8429 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8430 { 8431 switch (type) { 8432 case SPDK_BDEV_CLAIM_NONE: 8433 return "not_claimed"; 8434 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8435 return "exclusive_write"; 8436 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8437 return "read_many_write_one"; 8438 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8439 return "read_many_write_none"; 8440 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8441 return "read_many_write_many"; 8442 default: 8443 break; 8444 } 8445 return "invalid_claim"; 8446 } 8447 8448 static bool 8449 claim_type_is_v2(enum spdk_bdev_claim_type type) 8450 { 8451 switch (type) { 8452 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8453 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8454 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8455 return true; 8456 default: 8457 break; 8458 } 8459 return false; 8460 } 8461 8462 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
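 * Write-one and write-shared claims imply the holder intends to write through
 * this descriptor, so it gets promoted; read-many-write-none does not.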
*/ 8463 static bool 8464 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8465 { 8466 switch (type) { 8467 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8468 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8469 return true; 8470 default: 8471 break; 8472 } 8473 return false; 8474 } 8475 8476 void 8477 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8478 { 8479 if (opts == NULL) { 8480 SPDK_ERRLOG("opts should not be NULL\n"); 8481 assert(opts != NULL); 8482 return; 8483 } 8484 if (size == 0) { 8485 SPDK_ERRLOG("size should not be zero\n"); 8486 assert(size != 0); 8487 return; 8488 } 8489 8490 memset(opts, 0, size); 8491 opts->opts_size = size; 8492 8493 #define FIELD_OK(field) \ 8494 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8495 8496 #define SET_FIELD(field, value) \ 8497 if (FIELD_OK(field)) { \ 8498 opts->field = value; \ 8499 } \ 8500 8501 SET_FIELD(shared_claim_key, 0); 8502 8503 #undef FIELD_OK 8504 #undef SET_FIELD 8505 } 8506 8507 static int 8508 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8509 { 8510 if (src->opts_size == 0) { 8511 SPDK_ERRLOG("size should not be zero\n"); 8512 return -1; 8513 } 8514 8515 memset(dst, 0, sizeof(*dst)); 8516 dst->opts_size = src->opts_size; 8517 8518 #define FIELD_OK(field) \ 8519 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8520 8521 #define SET_FIELD(field) \ 8522 if (FIELD_OK(field)) { \ 8523 dst->field = src->field; \ 8524 } \ 8525 8526 if (FIELD_OK(name)) { 8527 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8528 } 8529 8530 SET_FIELD(shared_claim_key); 8531 8532 /* You should not remove this statement, but need to update the assert statement 8533 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8534 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8535 8536 #undef FIELD_OK 8537 #undef SET_FIELD 8538 return 0; 8539 } 8540 8541 /* Returns 0 if a read-write-once claim can be taken. */ 8542 static int 8543 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8544 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8545 { 8546 struct spdk_bdev *bdev = desc->bdev; 8547 struct spdk_bdev_desc *open_desc; 8548 8549 assert(spdk_spin_held(&bdev->internal.spinlock)); 8550 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8551 8552 if (opts->shared_claim_key != 0) { 8553 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8554 bdev->name); 8555 return -EINVAL; 8556 } 8557 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8558 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8559 return -EPERM; 8560 } 8561 if (desc->claim != NULL) { 8562 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8563 bdev->name, desc->claim->module->name); 8564 return -EPERM; 8565 } 8566 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8567 if (desc != open_desc && open_desc->write) { 8568 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8569 "another descriptor is open for writing\n", 8570 bdev->name); 8571 return -EPERM; 8572 } 8573 } 8574 8575 return 0; 8576 } 8577 8578 /* Returns 0 if a read-only-many claim can be taken. 
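 * A module would typically request this type with something like
 * spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE,
 * NULL, module) on a descriptor it opened read-only.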
 */
static int
claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
	assert(desc->claim == NULL);

	if (desc->write) {
		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
			    bdev->name);
		return -EINVAL;
	}
	if (opts->shared_claim_key != 0) {
		SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
		return -EINVAL;
	}
	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
					       "another descriptor is open for writing\n",
					       bdev->name);
				return -EPERM;
			}
		}
	}

	return 0;
}

/* Returns 0 if a read-write-many claim can be taken. */
static int
claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
	assert(desc->claim == NULL);

	if (opts->shared_claim_key == 0) {
		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
			    bdev->name);
		return -EINVAL;
	}
	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc == desc) {
				continue;
			}
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
					       "another descriptor is open for writing without a "
					       "claim\n", bdev->name);
				return -EPERM;
			}
		}
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
			return -EPERM;
		}
		break;
	default:
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		return -EBUSY;
	}

	return 0;
}

/* Updates desc and its bdev with a v2 claim.
*/ 8659 static int 8660 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8661 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8662 { 8663 struct spdk_bdev *bdev = desc->bdev; 8664 struct spdk_bdev_module_claim *claim; 8665 8666 assert(spdk_spin_held(&bdev->internal.spinlock)); 8667 assert(claim_type_is_v2(type)); 8668 assert(desc->claim == NULL); 8669 8670 claim = calloc(1, sizeof(*desc->claim)); 8671 if (claim == NULL) { 8672 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8673 return -ENOMEM; 8674 } 8675 claim->module = module; 8676 claim->desc = desc; 8677 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8678 memcpy(claim->name, opts->name, sizeof(claim->name)); 8679 desc->claim = claim; 8680 8681 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8682 bdev->internal.claim_type = type; 8683 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8684 bdev->internal.claim.v2.key = opts->shared_claim_key; 8685 } 8686 assert(type == bdev->internal.claim_type); 8687 8688 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8689 8690 if (!desc->write && claim_type_promotes_to_write(type)) { 8691 desc->write = true; 8692 } 8693 8694 return 0; 8695 } 8696 8697 int 8698 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8699 struct spdk_bdev_claim_opts *_opts, 8700 struct spdk_bdev_module *module) 8701 { 8702 struct spdk_bdev *bdev; 8703 struct spdk_bdev_claim_opts opts; 8704 int rc = 0; 8705 8706 if (desc == NULL) { 8707 SPDK_ERRLOG("descriptor must not be NULL\n"); 8708 return -EINVAL; 8709 } 8710 8711 bdev = desc->bdev; 8712 8713 if (_opts == NULL) { 8714 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8715 } else if (claim_opts_copy(_opts, &opts) != 0) { 8716 return -EINVAL; 8717 } 8718 8719 spdk_spin_lock(&bdev->internal.spinlock); 8720 8721 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8722 bdev->internal.claim_type != type) { 8723 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8724 spdk_spin_unlock(&bdev->internal.spinlock); 8725 return -EPERM; 8726 } 8727 8728 if (claim_type_is_v2(type) && desc->claim != NULL) { 8729 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8730 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8731 spdk_spin_unlock(&bdev->internal.spinlock); 8732 return -EPERM; 8733 } 8734 8735 switch (type) { 8736 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8737 spdk_spin_unlock(&bdev->internal.spinlock); 8738 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8739 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8740 rc = claim_verify_rwo(desc, type, &opts, module); 8741 break; 8742 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8743 rc = claim_verify_rom(desc, type, &opts, module); 8744 break; 8745 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8746 rc = claim_verify_rwm(desc, type, &opts, module); 8747 break; 8748 default: 8749 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8750 rc = -ENOTSUP; 8751 } 8752 8753 if (rc == 0) { 8754 rc = claim_bdev(desc, type, &opts, module); 8755 } 8756 8757 spdk_spin_unlock(&bdev->internal.spinlock); 8758 return rc; 8759 } 8760 8761 static void 8762 claim_reset(struct spdk_bdev *bdev) 8763 { 8764 assert(spdk_spin_held(&bdev->internal.spinlock)); 8765 assert(claim_type_is_v2(bdev->internal.claim_type)); 8766 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8767 8768 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8769 
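	/* The claim union, including the v2 claim list head and shared key, is
	 * zeroed above; dropping claim_type back to NONE below makes the bdev
	 * claimable again.
	 */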
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8770 } 8771 8772 static void 8773 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8774 { 8775 struct spdk_bdev *bdev = desc->bdev; 8776 8777 assert(spdk_spin_held(&bdev->internal.spinlock)); 8778 assert(claim_type_is_v2(bdev->internal.claim_type)); 8779 8780 if (bdev->internal.examine_in_progress == 0) { 8781 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8782 free(desc->claim); 8783 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8784 claim_reset(bdev); 8785 } 8786 } else { 8787 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8788 desc->claim->module = NULL; 8789 desc->claim->desc = NULL; 8790 } 8791 desc->claim = NULL; 8792 } 8793 8794 /* 8795 * End claims v2 8796 */ 8797 8798 struct spdk_bdev * 8799 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8800 { 8801 assert(desc != NULL); 8802 return desc->bdev; 8803 } 8804 8805 int 8806 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8807 { 8808 struct spdk_bdev *bdev, *tmp; 8809 struct spdk_bdev_desc *desc; 8810 int rc = 0; 8811 8812 assert(fn != NULL); 8813 8814 spdk_spin_lock(&g_bdev_mgr.spinlock); 8815 bdev = spdk_bdev_first(); 8816 while (bdev != NULL) { 8817 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8818 if (rc != 0) { 8819 break; 8820 } 8821 rc = bdev_open(bdev, false, desc); 8822 if (rc != 0) { 8823 bdev_desc_free(desc); 8824 if (rc == -ENODEV) { 8825 /* Ignore the error and move to the next bdev. */ 8826 rc = 0; 8827 bdev = spdk_bdev_next(bdev); 8828 continue; 8829 } 8830 break; 8831 } 8832 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8833 8834 rc = fn(ctx, bdev); 8835 8836 spdk_spin_lock(&g_bdev_mgr.spinlock); 8837 tmp = spdk_bdev_next(bdev); 8838 bdev_close(bdev, desc); 8839 if (rc != 0) { 8840 break; 8841 } 8842 bdev = tmp; 8843 } 8844 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8845 8846 return rc; 8847 } 8848 8849 int 8850 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8851 { 8852 struct spdk_bdev *bdev, *tmp; 8853 struct spdk_bdev_desc *desc; 8854 int rc = 0; 8855 8856 assert(fn != NULL); 8857 8858 spdk_spin_lock(&g_bdev_mgr.spinlock); 8859 bdev = spdk_bdev_first_leaf(); 8860 while (bdev != NULL) { 8861 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8862 if (rc != 0) { 8863 break; 8864 } 8865 rc = bdev_open(bdev, false, desc); 8866 if (rc != 0) { 8867 bdev_desc_free(desc); 8868 if (rc == -ENODEV) { 8869 /* Ignore the error and move to the next bdev. 
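				 * A concurrent unregister can change the bdev's
				 * status between the iterator returning it and
				 * bdev_open() checking it, which shows up here
				 * as -ENODEV.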
*/ 8870 rc = 0; 8871 bdev = spdk_bdev_next_leaf(bdev); 8872 continue; 8873 } 8874 break; 8875 } 8876 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8877 8878 rc = fn(ctx, bdev); 8879 8880 spdk_spin_lock(&g_bdev_mgr.spinlock); 8881 tmp = spdk_bdev_next_leaf(bdev); 8882 bdev_close(bdev, desc); 8883 if (rc != 0) { 8884 break; 8885 } 8886 bdev = tmp; 8887 } 8888 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8889 8890 return rc; 8891 } 8892 8893 void 8894 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8895 { 8896 struct iovec *iovs; 8897 int iovcnt; 8898 8899 if (bdev_io == NULL) { 8900 return; 8901 } 8902 8903 switch (bdev_io->type) { 8904 case SPDK_BDEV_IO_TYPE_READ: 8905 case SPDK_BDEV_IO_TYPE_WRITE: 8906 case SPDK_BDEV_IO_TYPE_ZCOPY: 8907 iovs = bdev_io->u.bdev.iovs; 8908 iovcnt = bdev_io->u.bdev.iovcnt; 8909 break; 8910 default: 8911 iovs = NULL; 8912 iovcnt = 0; 8913 break; 8914 } 8915 8916 if (iovp) { 8917 *iovp = iovs; 8918 } 8919 if (iovcntp) { 8920 *iovcntp = iovcnt; 8921 } 8922 } 8923 8924 void * 8925 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8926 { 8927 if (bdev_io == NULL) { 8928 return NULL; 8929 } 8930 8931 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8932 return NULL; 8933 } 8934 8935 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8936 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8937 return bdev_io->u.bdev.md_buf; 8938 } 8939 8940 return NULL; 8941 } 8942 8943 void * 8944 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 8945 { 8946 if (bdev_io == NULL) { 8947 assert(false); 8948 return NULL; 8949 } 8950 8951 return bdev_io->internal.caller_ctx; 8952 } 8953 8954 void 8955 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 8956 { 8957 8958 if (spdk_bdev_module_list_find(bdev_module->name)) { 8959 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 8960 assert(false); 8961 } 8962 8963 spdk_spin_init(&bdev_module->internal.spinlock); 8964 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 8965 8966 /* 8967 * Modules with examine callbacks must be initialized first, so they are 8968 * ready to handle examine callbacks from later modules that will 8969 * register physical bdevs. 
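	 * For example, a virtual bdev module (e.g. a RAID or logical volume
	 * module) placed at the head of the list here can examine the bdevs
	 * that a later-registered module creates for physical devices.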
8970 */ 8971 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8972 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8973 } else { 8974 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8975 } 8976 } 8977 8978 struct spdk_bdev_module * 8979 spdk_bdev_module_list_find(const char *name) 8980 { 8981 struct spdk_bdev_module *bdev_module; 8982 8983 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8984 if (strcmp(name, bdev_module->name) == 0) { 8985 break; 8986 } 8987 } 8988 8989 return bdev_module; 8990 } 8991 8992 static int 8993 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 8994 { 8995 uint64_t num_blocks; 8996 void *md_buf = NULL; 8997 8998 num_blocks = bdev_io->u.bdev.num_blocks; 8999 9000 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9001 md_buf = (char *)g_bdev_mgr.zero_buffer + 9002 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9003 } 9004 9005 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9006 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9007 g_bdev_mgr.zero_buffer, md_buf, 9008 bdev_io->u.bdev.offset_blocks, num_blocks, 9009 bdev_write_zero_buffer_done, bdev_io); 9010 } 9011 9012 static void 9013 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9014 { 9015 struct spdk_bdev_io *parent_io = cb_arg; 9016 9017 spdk_bdev_free_io(bdev_io); 9018 9019 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9020 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9021 } 9022 9023 static void 9024 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9025 { 9026 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9027 ctx->bdev->internal.qos_mod_in_progress = false; 9028 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9029 9030 if (ctx->cb_fn) { 9031 ctx->cb_fn(ctx->cb_arg, status); 9032 } 9033 free(ctx); 9034 } 9035 9036 static void 9037 bdev_disable_qos_done(void *cb_arg) 9038 { 9039 struct set_qos_limit_ctx *ctx = cb_arg; 9040 struct spdk_bdev *bdev = ctx->bdev; 9041 struct spdk_bdev_qos *qos; 9042 9043 spdk_spin_lock(&bdev->internal.spinlock); 9044 qos = bdev->internal.qos; 9045 bdev->internal.qos = NULL; 9046 spdk_spin_unlock(&bdev->internal.spinlock); 9047 9048 if (qos->thread != NULL) { 9049 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9050 spdk_poller_unregister(&qos->poller); 9051 } 9052 9053 free(qos); 9054 9055 bdev_set_qos_limit_done(ctx, 0); 9056 } 9057 9058 static void 9059 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9060 { 9061 struct set_qos_limit_ctx *ctx = _ctx; 9062 struct spdk_thread *thread; 9063 9064 spdk_spin_lock(&bdev->internal.spinlock); 9065 thread = bdev->internal.qos->thread; 9066 spdk_spin_unlock(&bdev->internal.spinlock); 9067 9068 if (thread != NULL) { 9069 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9070 } else { 9071 bdev_disable_qos_done(ctx); 9072 } 9073 } 9074 9075 static void 9076 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9077 struct spdk_io_channel *ch, void *_ctx) 9078 { 9079 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9080 struct spdk_bdev_io *bdev_io; 9081 9082 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9083 9084 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9085 /* Re-submit the queued I/O. 
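		 * directly through _bdev_io_submit(). BDEV_CH_QOS_ENABLED was
		 * cleared above, so these and all future I/O on this channel
		 * bypass the rate limiter.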
*/ 9086 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9087 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9088 _bdev_io_submit(bdev_io); 9089 } 9090 9091 spdk_bdev_for_each_channel_continue(i, 0); 9092 } 9093 9094 static void 9095 bdev_update_qos_rate_limit_msg(void *cb_arg) 9096 { 9097 struct set_qos_limit_ctx *ctx = cb_arg; 9098 struct spdk_bdev *bdev = ctx->bdev; 9099 9100 spdk_spin_lock(&bdev->internal.spinlock); 9101 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9102 spdk_spin_unlock(&bdev->internal.spinlock); 9103 9104 bdev_set_qos_limit_done(ctx, 0); 9105 } 9106 9107 static void 9108 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9109 struct spdk_io_channel *ch, void *_ctx) 9110 { 9111 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9112 9113 spdk_spin_lock(&bdev->internal.spinlock); 9114 bdev_enable_qos(bdev, bdev_ch); 9115 spdk_spin_unlock(&bdev->internal.spinlock); 9116 spdk_bdev_for_each_channel_continue(i, 0); 9117 } 9118 9119 static void 9120 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9121 { 9122 struct set_qos_limit_ctx *ctx = _ctx; 9123 9124 bdev_set_qos_limit_done(ctx, status); 9125 } 9126 9127 static void 9128 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9129 { 9130 int i; 9131 9132 assert(bdev->internal.qos != NULL); 9133 9134 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9135 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9136 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9137 9138 if (limits[i] == 0) { 9139 bdev->internal.qos->rate_limits[i].limit = 9140 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9141 } 9142 } 9143 } 9144 } 9145 9146 void 9147 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9148 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9149 { 9150 struct set_qos_limit_ctx *ctx; 9151 uint32_t limit_set_complement; 9152 uint64_t min_limit_per_sec; 9153 int i; 9154 bool disable_rate_limit = true; 9155 9156 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9157 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9158 continue; 9159 } 9160 9161 if (limits[i] > 0) { 9162 disable_rate_limit = false; 9163 } 9164 9165 if (bdev_qos_is_iops_rate_limit(i) == true) { 9166 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9167 } else { 9168 /* Change from megabyte to byte rate limit */ 9169 limits[i] = limits[i] * 1024 * 1024; 9170 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9171 } 9172 9173 limit_set_complement = limits[i] % min_limit_per_sec; 9174 if (limit_set_complement) { 9175 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9176 limits[i], min_limit_per_sec); 9177 limits[i] += min_limit_per_sec - limit_set_complement; 9178 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9179 } 9180 } 9181 9182 ctx = calloc(1, sizeof(*ctx)); 9183 if (ctx == NULL) { 9184 cb_fn(cb_arg, -ENOMEM); 9185 return; 9186 } 9187 9188 ctx->cb_fn = cb_fn; 9189 ctx->cb_arg = cb_arg; 9190 ctx->bdev = bdev; 9191 9192 spdk_spin_lock(&bdev->internal.spinlock); 9193 if (bdev->internal.qos_mod_in_progress) { 9194 spdk_spin_unlock(&bdev->internal.spinlock); 9195 free(ctx); 9196 cb_fn(cb_arg, -EAGAIN); 9197 return; 9198 } 9199 bdev->internal.qos_mod_in_progress = true; 9200 9201 if (disable_rate_limit == true && bdev->internal.qos) { 9202 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9203 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9204 
(bdev->internal.qos->rate_limits[i].limit > 0 && 9205 bdev->internal.qos->rate_limits[i].limit != 9206 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9207 disable_rate_limit = false; 9208 break; 9209 } 9210 } 9211 } 9212 9213 if (disable_rate_limit == false) { 9214 if (bdev->internal.qos == NULL) { 9215 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9216 if (!bdev->internal.qos) { 9217 spdk_spin_unlock(&bdev->internal.spinlock); 9218 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9219 bdev_set_qos_limit_done(ctx, -ENOMEM); 9220 return; 9221 } 9222 } 9223 9224 if (bdev->internal.qos->thread == NULL) { 9225 /* Enabling */ 9226 bdev_set_qos_rate_limits(bdev, limits); 9227 9228 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9229 bdev_enable_qos_done); 9230 } else { 9231 /* Updating */ 9232 bdev_set_qos_rate_limits(bdev, limits); 9233 9234 spdk_thread_send_msg(bdev->internal.qos->thread, 9235 bdev_update_qos_rate_limit_msg, ctx); 9236 } 9237 } else { 9238 if (bdev->internal.qos != NULL) { 9239 bdev_set_qos_rate_limits(bdev, limits); 9240 9241 /* Disabling */ 9242 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9243 bdev_disable_qos_msg_done); 9244 } else { 9245 spdk_spin_unlock(&bdev->internal.spinlock); 9246 bdev_set_qos_limit_done(ctx, 0); 9247 return; 9248 } 9249 } 9250 9251 spdk_spin_unlock(&bdev->internal.spinlock); 9252 } 9253 9254 struct spdk_bdev_histogram_ctx { 9255 spdk_bdev_histogram_status_cb cb_fn; 9256 void *cb_arg; 9257 struct spdk_bdev *bdev; 9258 int status; 9259 }; 9260 9261 static void 9262 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9263 { 9264 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9265 9266 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9267 ctx->bdev->internal.histogram_in_progress = false; 9268 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9269 ctx->cb_fn(ctx->cb_arg, ctx->status); 9270 free(ctx); 9271 } 9272 9273 static void 9274 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9275 struct spdk_io_channel *_ch, void *_ctx) 9276 { 9277 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9278 9279 if (ch->histogram != NULL) { 9280 spdk_histogram_data_free(ch->histogram); 9281 ch->histogram = NULL; 9282 } 9283 spdk_bdev_for_each_channel_continue(i, 0); 9284 } 9285 9286 static void 9287 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9288 { 9289 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9290 9291 if (status != 0) { 9292 ctx->status = status; 9293 ctx->bdev->internal.histogram_enabled = false; 9294 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9295 bdev_histogram_disable_channel_cb); 9296 } else { 9297 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9298 ctx->bdev->internal.histogram_in_progress = false; 9299 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9300 ctx->cb_fn(ctx->cb_arg, ctx->status); 9301 free(ctx); 9302 } 9303 } 9304 9305 static void 9306 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9307 struct spdk_io_channel *_ch, void *_ctx) 9308 { 9309 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9310 int status = 0; 9311 9312 if (ch->histogram == NULL) { 9313 ch->histogram = spdk_histogram_data_alloc(); 9314 if (ch->histogram == NULL) { 9315 status = -ENOMEM; 9316 } 9317 } 9318 9319 spdk_bdev_for_each_channel_continue(i, status); 9320 } 9321 9322 void 9323 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, 
spdk_bdev_histogram_status_cb cb_fn, 9324 void *cb_arg, bool enable) 9325 { 9326 struct spdk_bdev_histogram_ctx *ctx; 9327 9328 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9329 if (ctx == NULL) { 9330 cb_fn(cb_arg, -ENOMEM); 9331 return; 9332 } 9333 9334 ctx->bdev = bdev; 9335 ctx->status = 0; 9336 ctx->cb_fn = cb_fn; 9337 ctx->cb_arg = cb_arg; 9338 9339 spdk_spin_lock(&bdev->internal.spinlock); 9340 if (bdev->internal.histogram_in_progress) { 9341 spdk_spin_unlock(&bdev->internal.spinlock); 9342 free(ctx); 9343 cb_fn(cb_arg, -EAGAIN); 9344 return; 9345 } 9346 9347 bdev->internal.histogram_in_progress = true; 9348 spdk_spin_unlock(&bdev->internal.spinlock); 9349 9350 bdev->internal.histogram_enabled = enable; 9351 9352 if (enable) { 9353 /* Allocate histogram for each channel */ 9354 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9355 bdev_histogram_enable_channel_cb); 9356 } else { 9357 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9358 bdev_histogram_disable_channel_cb); 9359 } 9360 } 9361 9362 struct spdk_bdev_histogram_data_ctx { 9363 spdk_bdev_histogram_data_cb cb_fn; 9364 void *cb_arg; 9365 struct spdk_bdev *bdev; 9366 /** merged histogram data from all channels */ 9367 struct spdk_histogram_data *histogram; 9368 }; 9369 9370 static void 9371 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9372 { 9373 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9374 9375 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9376 free(ctx); 9377 } 9378 9379 static void 9380 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9381 struct spdk_io_channel *_ch, void *_ctx) 9382 { 9383 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9384 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9385 int status = 0; 9386 9387 if (ch->histogram == NULL) { 9388 status = -EFAULT; 9389 } else { 9390 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9391 } 9392 9393 spdk_bdev_for_each_channel_continue(i, status); 9394 } 9395 9396 void 9397 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9398 spdk_bdev_histogram_data_cb cb_fn, 9399 void *cb_arg) 9400 { 9401 struct spdk_bdev_histogram_data_ctx *ctx; 9402 9403 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9404 if (ctx == NULL) { 9405 cb_fn(cb_arg, -ENOMEM, NULL); 9406 return; 9407 } 9408 9409 ctx->bdev = bdev; 9410 ctx->cb_fn = cb_fn; 9411 ctx->cb_arg = cb_arg; 9412 9413 ctx->histogram = histogram; 9414 9415 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9416 bdev_histogram_get_channel_cb); 9417 } 9418 9419 void 9420 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9421 void *cb_arg) 9422 { 9423 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9424 int status = 0; 9425 9426 assert(cb_fn != NULL); 9427 9428 if (bdev_ch->histogram == NULL) { 9429 status = -EFAULT; 9430 } 9431 cb_fn(cb_arg, status, bdev_ch->histogram); 9432 } 9433 9434 size_t 9435 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9436 size_t max_events) 9437 { 9438 struct media_event_entry *entry; 9439 size_t num_events = 0; 9440 9441 for (; num_events < max_events; ++num_events) { 9442 entry = TAILQ_FIRST(&desc->pending_media_events); 9443 if (entry == NULL) { 9444 break; 9445 } 9446 9447 events[num_events] = entry->event; 9448 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9449 
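		/* Return the consumed entry to the free list so a later
		 * spdk_bdev_push_media_events() call can reuse it.
		 */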
size_t
spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events,
			   size_t max_events)
{
	struct media_event_entry *entry;
	size_t num_events = 0;

	for (; num_events < max_events; ++num_events) {
		entry = TAILQ_FIRST(&desc->pending_media_events);
		if (entry == NULL) {
			break;
		}

		events[num_events] = entry->event;
		TAILQ_REMOVE(&desc->pending_media_events, entry, tailq);
		TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq);
	}

	return num_events;
}

int
spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events,
			    size_t num_events)
{
	struct spdk_bdev_desc *desc;
	struct media_event_entry *entry;
	size_t event_id;
	int rc = 0;

	assert(bdev->media_events);

	spdk_spin_lock(&bdev->internal.spinlock);
	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
		if (desc->write) {
			break;
		}
	}

	if (desc == NULL || desc->media_events_buffer == NULL) {
		rc = -ENODEV;
		goto out;
	}

	for (event_id = 0; event_id < num_events; ++event_id) {
		entry = TAILQ_FIRST(&desc->free_media_events);
		if (entry == NULL) {
			break;
		}

		TAILQ_REMOVE(&desc->free_media_events, entry, tailq);
		TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq);
		entry->event = events[event_id];
	}

	rc = event_id;
out:
	spdk_spin_unlock(&bdev->internal.spinlock);
	return rc;
}

static void
_media_management_notify(void *arg)
{
	struct spdk_bdev_desc *desc = arg;

	_event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT);
}

void
spdk_bdev_notify_media_management(struct spdk_bdev *bdev)
{
	struct spdk_bdev_desc *desc;

	spdk_spin_lock(&bdev->internal.spinlock);
	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
		if (!TAILQ_EMPTY(&desc->pending_media_events)) {
			event_notify(desc, _media_management_notify);
		}
	}
	spdk_spin_unlock(&bdev->internal.spinlock);
}

struct locked_lba_range_ctx {
	struct lba_range range;
	struct lba_range *current_range;
	struct lba_range *owner_range;
	struct spdk_poller *poller;
	lock_range_cb cb_fn;
	void *cb_arg;
};

static void
bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM);
	free(ctx);
}

static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i,
		struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx);

static void
bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	if (status == -ENOMEM) {
		/* One of the channels could not allocate a range object.
		 * So we have to go back and clean up any ranges that were
		 * allocated successfully before we return error status to
		 * the caller. We can reuse the unlock function to do that
		 * cleanup.
		 */
		spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
					   bdev_lock_error_cleanup_cb);
		return;
	}

	/* All channels have locked this range and no I/O overlapping the range
	 * is outstanding! Set the owner_ch for the range object for the
	 * locking channel, so that this channel will know that it is allowed
	 * to write to this range.
	 */
	if (ctx->owner_range != NULL) {
		ctx->owner_range->owner_ch = ctx->range.owner_ch;
	}

	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);

	/* Don't free the ctx here. Its range is in the bdev's global list of
	 * locked ranges still, and will be removed and freed when this range
	 * is later unlocked.
	 */
}
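/*
 * LBA range locking proceeds in three stages: the range is first put on the
 * bdev's internal.locked_ranges list under the spinlock (so channels created
 * later pick it up), a copy of the range is then added to every existing
 * channel via spdk_bdev_for_each_channel(), and finally each channel polls
 * (see bdev_lock_lba_range_check_io() below) until none of its submitted I/O
 * overlaps the range. Only after every channel has finished does
 * bdev_lock_lba_range_cb() above invoke the caller's lock_range_cb.
 */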
static int
bdev_lock_lba_range_check_io(void *_i)
{
	struct spdk_bdev_channel_iter *i = _i;
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = i->ctx;
	struct lba_range *range = ctx->current_range;
	struct spdk_bdev_io *bdev_io;

	spdk_poller_unregister(&ctx->poller);

	/* The range is now in the locked_ranges, so no new IO can be submitted to this
	 * range. But we need to wait until all outstanding IO overlapping with this range
	 * has completed.
	 */
	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
		if (bdev_io_range_is_locked(bdev_io, range)) {
			ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
			return SPDK_POLLER_BUSY;
		}
	}

	spdk_bdev_for_each_channel_continue(i, 0);
	return SPDK_POLLER_BUSY;
}

static void
bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *_ch, void *_ctx)
{
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = _ctx;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->length == ctx->range.length &&
		    range->offset == ctx->range.offset &&
		    range->locked_ctx == ctx->range.locked_ctx) {
			/* This range already exists on this channel, so don't add
			 * it again. This can happen when a new channel is created
			 * while the for_each_channel operation is in progress.
			 * Do not check for outstanding I/O in that case, since the
			 * range was locked before any I/O could be submitted to the
			 * new channel.
			 */
			spdk_bdev_for_each_channel_continue(i, 0);
			return;
		}
	}

	range = calloc(1, sizeof(*range));
	if (range == NULL) {
		spdk_bdev_for_each_channel_continue(i, -ENOMEM);
		return;
	}

	range->length = ctx->range.length;
	range->offset = ctx->range.offset;
	range->locked_ctx = ctx->range.locked_ctx;
	range->quiesce = ctx->range.quiesce;
	ctx->current_range = range;
	if (ctx->range.owner_ch == ch) {
		/* This is the range object for the channel that will hold
		 * the lock. Store it in the ctx object so that we can easily
		 * set its owner_ch after the lock is finally acquired.
		 */
		ctx->owner_range = range;
	}
	TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
	bdev_lock_lba_range_check_io(i);
}

static void
bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
{
	assert(spdk_get_thread() == ctx->range.owner_thread);
	assert(ctx->range.owner_ch == NULL ||
	       spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread);

	/* We will add a copy of this range to each channel now. */
	spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx,
				   bdev_lock_lba_range_cb);
}
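/*
 * Worked example of the overlap rule used below, assuming bdev_lba_range_overlapped()
 * (defined earlier in this file) treats a range as the half-open block interval
 * [offset, offset + length): with a lock held on offset 100, length 100 (blocks
 * 100-199), a new request for offset 200, length 50 does not overlap and is locked
 * immediately, while offset 150, length 100 does overlap and is queued on
 * internal.pending_locked_ranges by _bdev_lock_lba_range() below.
 */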
static bool
bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
{
	struct lba_range *r;

	TAILQ_FOREACH(r, tailq, tailq) {
		if (bdev_lba_range_overlapped(range, r)) {
			return true;
		}
	}
	return false;
}

static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status);

static int
_bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch,
		     uint64_t offset, uint64_t length,
		     lock_range_cb cb_fn, void *cb_arg)
{
	struct locked_lba_range_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}

	ctx->range.offset = offset;
	ctx->range.length = length;
	ctx->range.owner_thread = spdk_get_thread();
	ctx->range.owner_ch = ch;
	ctx->range.locked_ctx = cb_arg;
	ctx->range.bdev = bdev;
	ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked);
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_spin_lock(&bdev->internal.spinlock);
	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
		/* There is an active lock overlapping with this range.
		 * Put it on the pending list until this range no
		 * longer overlaps with another.
		 */
		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
	} else {
		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
		bdev_lock_lba_range_ctx(bdev, ctx);
	}
	spdk_spin_unlock(&bdev->internal.spinlock);
	return 0;
}

static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		    uint64_t offset, uint64_t length,
		    lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);

	if (cb_arg == NULL) {
		SPDK_ERRLOG("cb_arg must not be NULL\n");
		return -EINVAL;
	}

	return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg);
}

static void
bdev_lock_lba_range_ctx_msg(void *_ctx)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	bdev_lock_lba_range_ctx(ctx->range.bdev, ctx);
}
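/*
 * Minimal sketch of how the internal bdev_lock_lba_range()/bdev_unlock_lba_range()
 * pair is driven (the unlock entry point is defined further below); the callback
 * and my_ctx names are placeholders. cb_arg must be non-NULL, and the unlock must
 * come from the same channel with the exact same offset, length and cb_arg,
 * otherwise -EINVAL is returned.
 *
 *	static void
 *	range_locked(struct lba_range *range, void *ctx, int status)
 *	{
 *		// status == 0: the range is held and no overlapping I/O remains outstanding
 *	}
 *
 *	rc = bdev_lock_lba_range(desc, io_ch, offset_blocks, num_blocks, range_locked, my_ctx);
 *	...
 *	rc = bdev_unlock_lba_range(desc, io_ch, offset_blocks, num_blocks, range_unlocked, my_ctx);
 */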
9745 */ 9746 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 9747 if (bdev_lba_range_overlapped(range, &ctx->range) && 9748 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 9749 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 9750 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9751 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 9752 spdk_thread_send_msg(pending_ctx->range.owner_thread, 9753 bdev_lock_lba_range_ctx_msg, pending_ctx); 9754 } 9755 } 9756 spdk_spin_unlock(&bdev->internal.spinlock); 9757 9758 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9759 free(ctx); 9760 } 9761 9762 static void 9763 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9764 struct spdk_io_channel *_ch, void *_ctx) 9765 { 9766 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9767 struct locked_lba_range_ctx *ctx = _ctx; 9768 TAILQ_HEAD(, spdk_bdev_io) io_locked; 9769 struct spdk_bdev_io *bdev_io; 9770 struct lba_range *range; 9771 9772 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9773 if (ctx->range.offset == range->offset && 9774 ctx->range.length == range->length && 9775 ctx->range.locked_ctx == range->locked_ctx) { 9776 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 9777 free(range); 9778 break; 9779 } 9780 } 9781 9782 /* Note: we should almost always be able to assert that the range specified 9783 * was found. But there are some very rare corner cases where a new channel 9784 * gets created simultaneously with a range unlock, where this function 9785 * would execute on that new channel and wouldn't have the range. 9786 * We also use this to clean up range allocations when a later allocation 9787 * fails in the locking path. 9788 * So we can't actually assert() here. 9789 */ 9790 9791 /* Swap the locked IO into a temporary list, and then try to submit them again. 9792 * We could hyper-optimize this to only resubmit locked I/O that overlap 9793 * with the range that was just unlocked, but this isn't a performance path so 9794 * we go for simplicity here. 9795 */ 9796 TAILQ_INIT(&io_locked); 9797 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 9798 while (!TAILQ_EMPTY(&io_locked)) { 9799 bdev_io = TAILQ_FIRST(&io_locked); 9800 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 9801 bdev_io_submit(bdev_io); 9802 } 9803 9804 spdk_bdev_for_each_channel_continue(i, 0); 9805 } 9806 9807 static int 9808 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 9809 lock_range_cb cb_fn, void *cb_arg) 9810 { 9811 struct locked_lba_range_ctx *ctx; 9812 struct lba_range *range; 9813 9814 spdk_spin_lock(&bdev->internal.spinlock); 9815 /* To start the unlock the process, we find the range in the bdev's locked_ranges 9816 * and remove it. This ensures new channels don't inherit the locked range. 9817 * Then we will send a message to each channel to remove the range from its 9818 * per-channel list. 
9819 */ 9820 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 9821 if (range->offset == offset && range->length == length && 9822 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 9823 break; 9824 } 9825 } 9826 if (range == NULL) { 9827 assert(false); 9828 spdk_spin_unlock(&bdev->internal.spinlock); 9829 return -EINVAL; 9830 } 9831 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 9832 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9833 spdk_spin_unlock(&bdev->internal.spinlock); 9834 9835 ctx->cb_fn = cb_fn; 9836 ctx->cb_arg = cb_arg; 9837 9838 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9839 bdev_unlock_lba_range_cb); 9840 return 0; 9841 } 9842 9843 static int 9844 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9845 uint64_t offset, uint64_t length, 9846 lock_range_cb cb_fn, void *cb_arg) 9847 { 9848 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9849 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9850 struct lba_range *range; 9851 bool range_found = false; 9852 9853 /* Let's make sure the specified channel actually has a lock on 9854 * the specified range. Note that the range must match exactly. 9855 */ 9856 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9857 if (range->offset == offset && range->length == length && 9858 range->owner_ch == ch && range->locked_ctx == cb_arg) { 9859 range_found = true; 9860 break; 9861 } 9862 } 9863 9864 if (!range_found) { 9865 return -EINVAL; 9866 } 9867 9868 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 9869 } 9870 9871 struct bdev_quiesce_ctx { 9872 spdk_bdev_quiesce_cb cb_fn; 9873 void *cb_arg; 9874 }; 9875 9876 static void 9877 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 9878 { 9879 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9880 9881 if (quiesce_ctx->cb_fn != NULL) { 9882 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9883 } 9884 9885 free(quiesce_ctx); 9886 } 9887 9888 static void 9889 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 9890 { 9891 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9892 struct spdk_bdev_module *module = range->bdev->module; 9893 9894 if (status != 0) { 9895 if (quiesce_ctx->cb_fn != NULL) { 9896 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9897 } 9898 free(quiesce_ctx); 9899 return; 9900 } 9901 9902 spdk_spin_lock(&module->internal.spinlock); 9903 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 9904 spdk_spin_unlock(&module->internal.spinlock); 9905 9906 if (quiesce_ctx->cb_fn != NULL) { 9907 /* copy the context in case the range is unlocked by the callback */ 9908 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 9909 9910 quiesce_ctx->cb_fn = NULL; 9911 quiesce_ctx->cb_arg = NULL; 9912 9913 tmp.cb_fn(tmp.cb_arg, status); 9914 } 9915 /* quiesce_ctx will be freed on unquiesce */ 9916 } 9917 9918 static int 9919 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9920 uint64_t offset, uint64_t length, 9921 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 9922 bool unquiesce) 9923 { 9924 struct bdev_quiesce_ctx *quiesce_ctx; 9925 int rc; 9926 9927 if (module != bdev->module) { 9928 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 9929 return -EINVAL; 9930 } 9931 9932 if (!bdev_io_valid_blocks(bdev, offset, length)) { 9933 return -EINVAL; 9934 } 9935 9936 if (unquiesce) { 9937 struct lba_range *range; 9938 9939 /* Make sure the specified range is actually 
static int
_spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
		   uint64_t offset, uint64_t length,
		   spdk_bdev_quiesce_cb cb_fn, void *cb_arg,
		   bool unquiesce)
{
	struct bdev_quiesce_ctx *quiesce_ctx;
	int rc;

	if (module != bdev->module) {
		SPDK_ERRLOG("Bdev does not belong to specified module.\n");
		return -EINVAL;
	}

	if (!bdev_io_valid_blocks(bdev, offset, length)) {
		return -EINVAL;
	}

	if (unquiesce) {
		struct lba_range *range;

		/* Make sure the specified range is actually quiesced in the specified module and
		 * then remove it from the list. Note that the range must match exactly.
		 */
		spdk_spin_lock(&module->internal.spinlock);
		TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) {
			if (range->bdev == bdev && range->offset == offset && range->length == length) {
				TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module);
				break;
			}
		}
		spdk_spin_unlock(&module->internal.spinlock);

		if (range == NULL) {
			SPDK_ERRLOG("The range to unquiesce was not found.\n");
			return -EINVAL;
		}

		quiesce_ctx = range->locked_ctx;
		quiesce_ctx->cb_fn = cb_fn;
		quiesce_ctx->cb_arg = cb_arg;

		rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx);
	} else {
		quiesce_ctx = malloc(sizeof(*quiesce_ctx));
		if (quiesce_ctx == NULL) {
			return -ENOMEM;
		}

		quiesce_ctx->cb_fn = cb_fn;
		quiesce_ctx->cb_arg = cb_arg;

		rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx);
		if (rc != 0) {
			free(quiesce_ctx);
		}
	}

	return rc;
}

int
spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
		  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
{
	return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false);
}

int
spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
		    spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
{
	return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true);
}

int
spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
			uint64_t offset, uint64_t length,
			spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
{
	return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false);
}

int
spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
			  uint64_t offset, uint64_t length,
			  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
{
	return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true);
}

int
spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
			     int array_size)
{
	if (!bdev) {
		return -EINVAL;
	}

	if (bdev->fn_table->get_memory_domains) {
		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
	}

	return 0;
}

struct spdk_bdev_for_each_io_ctx {
	void *ctx;
	spdk_bdev_io_fn fn;
	spdk_bdev_for_each_io_cb cb;
};

static void
bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
			 struct spdk_io_channel *io_ch, void *_ctx)
{
	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
	struct spdk_bdev_io *bdev_io;
	int rc = 0;

	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
		rc = ctx->fn(ctx->ctx, bdev_io);
		if (rc != 0) {
			break;
		}
	}

	spdk_bdev_for_each_channel_continue(i, rc);
}

static void
bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;

	ctx->cb(ctx->ctx, status);
	free(ctx);
}
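/*
 * Minimal sketch of spdk_bdev_for_each_bdev_io() defined below; count_io,
 * count_done and counter are placeholders. Channels are visited one at a time,
 * a nonzero return from the per-I/O function ends the walk and is reported to
 * the completion callback, and the context must stay valid until that callback
 * runs.
 *
 *	static int
 *	count_io(void *ctx, struct spdk_bdev_io *bdev_io)
 *	{
 *		(*(uint64_t *)ctx)++;
 *		return 0;
 *	}
 *
 *	static void
 *	count_done(void *ctx, int status)
 *	{
 *		// *(uint64_t *)ctx now holds the number of submitted I/Os seen
 *	}
 *
 *	spdk_bdev_for_each_bdev_io(bdev, &counter, count_io, count_done);
 */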
void
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
			   spdk_bdev_for_each_io_cb cb)
{
	struct spdk_bdev_for_each_io_ctx *ctx;

	assert(fn != NULL && cb != NULL);

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		SPDK_ERRLOG("Failed to allocate context.\n");
		cb(_ctx, -ENOMEM);
		return;
	}

	ctx->ctx = _ctx;
	ctx->fn = fn;
	ctx->cb = cb;

	spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
				   bdev_for_each_io_done);
}

void
spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
{
	spdk_for_each_channel_continue(iter->i, status);
}

static struct spdk_bdev *
io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
{
	void *io_device = spdk_io_channel_iter_get_io_device(i);

	return __bdev_from_io_dev(io_device);
}

static void
bdev_each_channel_msg(struct spdk_io_channel_iter *i)
{
	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);

	iter->i = i;
	iter->fn(iter, bdev, ch, iter->ctx);
}

static void
bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);

	iter->i = i;
	iter->cpl(bdev, iter->ctx, status);

	free(iter);
}

void
spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn,
			   void *ctx, spdk_bdev_for_each_channel_done cpl)
{
	struct spdk_bdev_channel_iter *iter;

	assert(bdev != NULL && fn != NULL && ctx != NULL);

	iter = calloc(1, sizeof(struct spdk_bdev_channel_iter));
	if (iter == NULL) {
		SPDK_ERRLOG("Unable to allocate iterator\n");
		assert(false);
		return;
	}

	iter->fn = fn;
	iter->cpl = cpl;
	iter->ctx = ctx;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg,
			      iter, bdev_each_channel_cpl);
}
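/*
 * Usage sketch for spdk_bdev_for_each_channel() above (placeholder names): the
 * per-channel function runs on the thread that owns each channel and must
 * eventually call spdk_bdev_for_each_channel_continue(); a nonzero status aborts
 * the iteration and is passed to the completion callback. Note that ctx must not
 * be NULL.
 *
 *	static void
 *	visit_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
 *		      struct spdk_io_channel *ch, void *ctx)
 *	{
 *		// per-channel work here
 *		spdk_bdev_for_each_channel_continue(i, 0);
 *	}
 *
 *	static void
 *	visit_done(struct spdk_bdev *bdev, void *ctx, int status)
 *	{
 *		// all channels visited (or iteration aborted with a nonzero status)
 *	}
 *
 *	spdk_bdev_for_each_channel(bdev, visit_channel, my_ctx, visit_done);
 */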
static void
bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	/* Check return status of write */
	parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
}

static void
bdev_copy_do_write(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	int rc;

	/* Write blocks */
	rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc,
					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
					    bdev_io->u.bdev.iovs[0].iov_base,
					    bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks,
					    bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io);

	if (rc == -ENOMEM) {
		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write);
	} else if (rc != 0) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static void
bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	/* Check return status of read */
	if (!success) {
		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
		return;
	}

	/* Do write */
	bdev_copy_do_write(parent_io);
}

static void
bdev_copy_do_read(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	int rc;

	/* Read blocks */
	rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc,
					   spdk_io_channel_from_ctx(bdev_io->internal.ch),
					   bdev_io->u.bdev.iovs[0].iov_base,
					   bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks,
					   bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io);

	if (rc == -ENOMEM) {
		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read);
	} else if (rc != 0) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static void
bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
{
	if (!success) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
		return;
	}

	bdev_copy_do_read(bdev_io);
}
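/*
 * Illustrative call of spdk_bdev_copy_blocks() defined below (placeholder names).
 * The descriptor must be open for writing; the completion callback owns the
 * spdk_bdev_io and must release it with spdk_bdev_free_io().
 *
 *	static void
 *	copy_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		spdk_bdev_free_io(bdev_io);
 *		// success is false if the copy (or its read/write emulation) failed
 *	}
 *
 *	rc = spdk_bdev_copy_blocks(desc, io_ch, dst_offset_blocks, src_offset_blocks,
 *				   num_blocks, copy_complete, my_ctx);
 *	if (rc == -ENOMEM) {
 *		// no spdk_bdev_io was available; retry later, e.g. via spdk_bdev_queue_io_wait()
 *	}
 */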
int
spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		      uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
		      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (num_blocks == 0) {
		SPDK_ERRLOG("Can't copy 0 blocks\n");
		return -EINVAL;
	}

	if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
	    !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
		SPDK_DEBUGLOG(bdev,
			      "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n",
			      dst_offset_blocks, src_offset_blocks, num_blocks);
		return -EINVAL;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;

	bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
	bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.memory_domain = NULL;
	bdev_io->u.bdev.memory_domain_ctx = NULL;
	bdev_io->u.bdev.iovs = NULL;
	bdev_io->u.bdev.iovcnt = 0;
	bdev_io->u.bdev.md_buf = NULL;
	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	if (dst_offset_blocks == src_offset_blocks) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
		bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);

		return 0;
	}

	/* If the copy size is large and should be split, use the generic split logic
	 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not.
	 *
	 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or
	 * emulate it using regular read and write requests otherwise.
	 */
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) ||
	    bdev_io->internal.split) {
		bdev_io_submit(bdev_io);
		return 0;
	}

	spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev));

	return 0;
}

SPDK_LOG_REGISTER_COMPONENT(bdev)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_IO_START", TRACE_BDEV_IO_START,
			OWNER_BDEV, OBJECT_BDEV_IO, 1,
			{
				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 }
			}
		},
		{
			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
			OWNER_BDEV, OBJECT_BDEV_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
			OWNER_BDEV, OBJECT_NONE, 1,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
		{
			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
			OWNER_BDEV, OBJECT_NONE, 0,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
	};

	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
}