/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	struct spdk_bdev *bdev;
	uint64_t offset;
	uint64_t length;
	void *locked_ctx;
	struct spdk_thread *owner_thread;
	struct spdk_bdev_channel *owner_ch;
	TAILQ_ENTRY(lba_range) tailq;
	TAILQ_ENTRY(lba_range) tailq_module;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
};

static spdk_bdev_init_cb g_init_cb_fn = NULL;
static void *g_init_cb_arg = NULL;

static spdk_bdev_fini_cb g_fini_cb_fn = NULL;
static void *g_fini_cb_arg = NULL;
static struct spdk_thread *g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms).
	 */
	uint32_t max_per_timeslice;
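	/*
	 * Added commentary (illustrative, not part of the original file): limits
	 * are specified per second but accounted per
	 * SPDK_BDEV_QOS_TIMESLICE_IN_USEC (1000 us) timeslice. An IOPS limit of
	 * 10000, for example, works out to 10 I/Os per 1 ms timeslice; values
	 * that would fall below min_per_timeslice are presumably rounded up so
	 * that at least some I/O can be issued in every timeslice.
	 */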

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Types of structure of rate limits. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache. Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * will queue here their IO that awaits retry. It makes it possible to retry sending
 * IO to one bdev after IO from other bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	struct spdk_poller *nomem_poller;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Accel channel */
	struct spdk_io_channel *accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t io_locked;

	/* List of I/Os with accel sequence being currently executed */
	bdev_io_tailq_t io_accel_exec;

	/* List of I/Os doing memory domain pull/push */
	bdev_io_tailq_t io_memory_domain;

	uint32_t flags;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t start_tsc;
	uint64_t interval_tsc;
	__itt_string_handle *handle;
	struct spdk_bdev_io_stat *prev_stat;
#endif

	bdev_io_tailq_t queued_resets;

	lba_range_tailq_t locked_ranges;
};

struct media_event_entry {
	struct spdk_bdev_media_event event;
	TAILQ_ENTRY(media_event_entry) tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev *bdev;
	struct spdk_thread *thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	} callback;
	bool closed;
	bool write;
	bool memory_domains_supported;
	bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
	struct spdk_spinlock spinlock;
	uint32_t refs;
	TAILQ_HEAD(, media_event_entry) pending_media_events;
	TAILQ_HEAD(, media_event_entry) free_media_events;
	struct media_event_entry *media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc) link;

	uint64_t timeout_in_sec;
	spdk_bdev_io_timeout_cb cb_fn;
	void *cb_arg;
	struct spdk_poller *io_timeout_poller;
	struct spdk_bdev_module_claim *claim;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

struct spdk_bdev_channel_iter {
	spdk_bdev_for_each_channel_msg fn;
	spdk_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

struct spdk_bdev_io_error_stat {
	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
};

enum bdev_io_retry_state {
	BDEV_IO_RETRY_STATE_INVALID,
	BDEV_IO_RETRY_STATE_PULL,
	BDEV_IO_RETRY_STATE_PULL_MD,
	BDEV_IO_RETRY_STATE_SUBMIT,
	BDEV_IO_RETRY_STATE_PUSH,
	BDEV_IO_RETRY_STATE_PUSH_MD,
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
#define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
#define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))
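
/*
 * Added commentary: the +1/-1 offsets above give each bdev a second, unique
 * address to use as its io_device handle, so registering the bdev layer's own
 * channels cannot collide with any other io_device registration that uses the
 * bare struct spdk_bdev pointer. The two macros are exact inverses of each
 * other and do not allocate anything.
 */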

static inline void bdev_io_complete(void *ctx);
static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);

static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

#define bdev_get_ext_io_opt(opts, field, defval) \
	(((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \
	  sizeof((opts)->field) <= (opts)->size) ? (opts)->field : (defval))

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);

	/* Do not remove this statement; you should always update this statement when you add a new field,
	 * and do not forget to add the SET_FIELD statement for your added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}
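
/*
 * Usage sketch (added commentary, not part of the original file): callers are
 * expected to size-stamp the options structure so that older and newer
 * definitions of spdk_bdev_opts can interoperate; only fields that fit within
 * the provided size are copied in either direction. For example:
 *
 *	struct spdk_bdev_opts opts;
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_pool_size = 128 * 1024 - 1;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		// handle the error
 *	}
 */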

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}

static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = bdev_get_by_name(bdev_name);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return bdev;
}

struct bdev_io_status_string {
	enum spdk_bdev_io_status status;
	const char *str;
};

static const struct bdev_io_status_string bdev_io_status_strings[] = {
	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
};

static const char *
bdev_io_status_get_string(enum spdk_bdev_io_status status)
{
	uint32_t i;

	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
		if (bdev_io_status_strings[i].status == status) {
			return bdev_io_status_strings[i].str;
		}
	}

	return "reserved";
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller *poller;
	spdk_bdev_wait_for_examine_cb cb_fn;
	void *cb_arg;
};

static bool bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}
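
/*
 * Usage sketch (added commentary, not part of the original file): callers that
 * must not proceed until every registered bdev module has finished examining
 * existing bdevs can do something like:
 *
 *	static void examine_done(void *ctx) { ... resume startup ... }
 *
 *	if (spdk_bdev_wait_for_examine(examine_done, ctx) != 0) {
 *		... the only failure is -ENOMEM ...
 *	}
 *
 * The callback runs on the calling thread once bdev_module_all_actions_completed()
 * reports that no module actions are outstanding.
 */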

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	struct spdk_bdev_module_claim *claim, *tmpclaim;
	uint32_t action;

	if (!bdev_ok_to_examine(bdev)) {
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config) {
			spdk_spin_lock(&module->internal.spinlock);
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call "
					    "spdk_bdev_module_examine_done()\n", module->name);
			}
		}
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* Examine by all bdev modules */
		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (module->examine_disk) {
				spdk_spin_lock(&module->internal.spinlock);
				module->internal.action_in_progress++;
				spdk_spin_unlock(&module->internal.spinlock);
				spdk_spin_unlock(&bdev->internal.spinlock);
				module->examine_disk(bdev);
				spdk_spin_lock(&bdev->internal.spinlock);
			}
		}
		break;
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Examine by the one bdev module with a v1 claim */
		module = bdev->internal.claim.v1.module;
		if (module->examine_disk) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			return;
		}
		break;
	default:
		/* Examine by all bdev modules with a v2 claim */
		assert(claim_type_is_v2(bdev->internal.claim_type));
		/*
		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
		 * list, perhaps accessing freed memory. Without protection, this could happen
		 * while the lock is dropped during the examine callback.
		 */
		bdev->internal.examine_in_progress++;

		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
			module = claim->module;

			if (module == NULL) {
				/* This is a vestigial claim, held by examine_count */
				continue;
			}

			if (module->examine_disk == NULL) {
				continue;
			}

			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);

			/* Call examine_disk without holding internal.spinlock. */
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			spdk_spin_lock(&bdev->internal.spinlock);
		}

		assert(bdev->internal.examine_in_progress > 0);
		bdev->internal.examine_in_progress--;
		if (bdev->internal.examine_in_progress == 0) {
			/* Remove any claims that were released during examine_disk */
			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
				if (claim->desc != NULL) {
					continue;
				}

				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
				free(claim);
			}
			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
				claim_reset(bdev);
			}
		}
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;
	struct spdk_thread *thread = spdk_get_thread();

	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
			    thread ? spdk_thread_get_name(thread) : "null");
		return -EINVAL;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}
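
/*
 * Added commentary: a "leaf" in the iterators below is a bdev that no module
 * has claimed (claim_type == SPDK_BDEV_CLAIM_NONE), typically a bdev that is
 * not being consumed as the base of another virtual bdev.
 */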

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static inline bool
bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.memory_domain;
}

static inline bool
bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.has_accel_sequence;
}

static inline void
bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
	 * channels we will instead wait for half to complete.
	 */
	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
}

static inline void
bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}
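
/*
 * Added commentary: the mask check above assumes the alignment is a power of
 * two (spdk_bdev_get_buf_align() is expected to return 1 << required_alignment).
 * For example, with alignment == 4096 a base address of 0x201000 passes while
 * 0x201200 fails, since 0x201200 & 0xfff != 0.
 */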

static inline bool
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io->internal.accel_sequence) {
		return false;
	}

	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
	 * bdev module didn't support accel sequences */
	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split;
}

static inline void
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
}

static inline void
bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static void
bdev_io_submit_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;

	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io->internal.accel_sequence = NULL;

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	bdev_io_submit(bdev_io);
}

static void
bdev_io_exec_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);

	/* Since the operations are appended during submission, they're in the opposite order than
	 * how we want to execute them for reads (i.e. we need to execute the most recently added
	 * operation first), so reverse the sequence before executing it.
	 */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
	}

	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_increment_outstanding(ch, ch->shared_resource);
	bdev_io->internal.data_transfer_cpl = cb_fn;

	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
				   bdev_io_exec_sequence_cb, bdev_io);
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf;
		bdev_io->internal.buf = NULL;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_pull_buffer_cpl(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	bdev_io_get_buf_complete(bdev_io, !rc);
}

static void
bdev_io_pull_md_buf_done(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  &bdev_io->internal.orig_md_iov, 1,
							  &bdev_io->internal.bounce_md_iov, 1,
							  bdev_io_pull_md_buf_done, bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain), rc);
			}
		} else {
			memcpy(bdev_io->internal.bounce_md_iov.iov_base,
			       bdev_io->internal.orig_md_iov.iov_base,
			       bdev_io->internal.orig_md_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}
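
/*
 * Added commentary: the "pull" helpers (above and below) run before submission
 * and copy data from the caller's buffers into the bounce buffer for writes;
 * the "push" helpers run at completion and copy data back to the caller's
 * buffers for reads. When a memory domain is involved, both directions go
 * through the asynchronous spdk_memory_domain_pull_data()/
 * spdk_memory_domain_push_data() calls; otherwise a plain memcpy is used.
 */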

static void
_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	/* save original md_buf */
	bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.orig_md_iov.iov_len = len;
	bdev_io->internal.bounce_md_iov.iov_base = md_buf;
	bdev_io->internal.bounce_md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	bdev_io_pull_md_buf(bdev_io);
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		assert(!bdev_io_use_accel_sequence(bdev_io));

		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static inline void
bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	if (rc) {
		SPDK_ERRLOG("Failed to get data buffer\n");
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	_bdev_io_set_md_buf(bdev_io);
}

static void
bdev_io_pull_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_pull_data_done(bdev_io, status);
}

static void
bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	/* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
	 * sequence, append a copy operation making accel change the src/dst buffers of the previous
	 * operation */
	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) ||
	    (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io->internal.memory_domain,
						    bdev_io->internal.memory_domain_ctx,
						    0, NULL, NULL);
		} else {
			/* We need to reverse the src/dst for reads */
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io->internal.memory_domain,
						    bdev_io->internal.memory_domain_ctx,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL, 0, NULL, NULL);
		}

		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		/* if this is write path, copy data from original buffer to bounce buffer */
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t) bdev_io->internal.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  bdev_io_pull_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			assert(bdev_io->u.bdev.iovcnt == 1);
			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
					      bdev_io->u.bdev.iovs[0].iov_len,
					      bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data_done(bdev_io, rc);
	}
}

static void
_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
			      bdev_copy_bounce_buffer_cpl cpl_cb)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;

	bdev_io->internal.data_transfer_cpl = cpl_cb;
	/* save original iovec */
	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data(bdev_io);
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf = buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
		/* Continue in completion callback */
		return;
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}
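
/*
 * Added commentary for the helper below (illustrative numbers): the buffer
 * taken from the iobuf pool must cover the data itself, worst-case padding for
 * alignment, and any separate metadata. For example, a 4096-byte read on a
 * bdev with 512-byte buffer alignment and 8 blocks of 8-byte separate metadata
 * would request 4096 + 511 + 64 = 4671 bytes.
 */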

static inline uint64_t
bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	/* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */
	alignment = spdk_bdev_get_buf_align(bdev) - 1;

	return len + alignment + md_len;
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.buf != NULL);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
	bdev_io->internal.buf = NULL;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
		    struct spdk_bdev_io *bdev_io)
{
	/* After a request is submitted to a bdev module, the ownership of an accel sequence
	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
	 * sequence pointer to make sure we won't touch it anymore. */
	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
		bdev_io->internal.accel_sequence = NULL;
	}

	bdev->fn_table->submit_request(ioch, bdev_io);
}

static inline void
bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource);
	bdev_io->internal.error.nvme.cdw0 = 0;
	bdev_io->num_retries++;
	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
}

static void
bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource)
{
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);

		switch (bdev_io->internal.retry_state) {
		case BDEV_IO_RETRY_STATE_SUBMIT:
			bdev_ch_resubmit_io(shared_resource, bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL:
			bdev_io_pull_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL_MD:
			bdev_io_pull_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH:
			bdev_io_push_bounce_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH_MD:
			bdev_io_push_bounce_md_buf(bdev_io);
			break;
		default:
			assert(0 && "invalid retry state");
			break;
		}

		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore. Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
}

static int
bdev_no_mem_poller(void *ctx)
{
	struct spdk_bdev_shared_resource *shared_resource = ctx;

	spdk_poller_unregister(&shared_resource->nomem_poller);

	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_shared_ch_retry_io(shared_resource);
	}
	if (!TAILQ_EMPTY(&shared_resource->nomem_io) && shared_resource->io_outstanding == 0) {
		/* No IOs were submitted, try again */
		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
						SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
	}

	return SPDK_POLLER_BUSY;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);

		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
			/* Special case when we have nomem IOs and no outstanding IOs which completions
			 * could trigger retry of queued IOs.
			 * Any IOs submitted may trigger retry of queued IOs. This poller handles a case when no
			 * new IOs submitted, e.g. qd==1 */
			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
							SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
		}
		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted.
		 */
		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
			assert(bdev_io->internal.accel_sequence == NULL);
			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
		}

		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	/* Continue with IO completion flow */
	bdev_io_complete(bdev_io);
}

static void
bdev_io_push_bounce_md_buf_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
}

static inline void
bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
			if (bdev_io_use_memory_domain(bdev_io)) {
				TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_increment_outstanding(ch, ch->shared_resource);
				/* If memory domain is used then we need to call async push function */
				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
								  bdev_io->internal.memory_domain_ctx,
								  &bdev_io->internal.orig_md_iov,
								  (uint32_t)bdev_io->internal.orig_iovcnt,
								  &bdev_io->internal.bounce_md_iov, 1,
								  bdev_io_push_bounce_md_buf_done,
								  bdev_io);
				if (rc == 0) {
					/* Continue IO completion in async callback */
					return;
				}
				TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_decrement_outstanding(ch, ch->shared_resource);
				if (rc != -ENOMEM) {
					SPDK_ERRLOG("Failed to push md to memory domain %s\n",
						    spdk_memory_domain_get_dma_device_id(
							    bdev_io->internal.memory_domain));
				}
			} else {
				memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
				       bdev_io->internal.orig_md_iov.iov_len);
			}
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}

static inline void
bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	assert(bdev_io->internal.data_transfer_cpl);
	if (rc) {
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
	/* disable bouncing buffer for this io */
	bdev_io->internal.orig_iovcnt = 0;
	bdev_io->internal.orig_iovs = NULL;

	bdev_io_push_bounce_md_buf(bdev_io);
}

static void
bdev_io_push_bounce_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_push_bounce_data_done(bdev_io, status);
}

static inline void
bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	assert(!bdev_io_use_accel_sequence(bdev_io));

	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			/* If memory domain is used then we need to call async push function */
			rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t)bdev_io->internal.orig_iovcnt,
							  &bdev_io->internal.bounce_iov, 1,
							  bdev_io_push_bounce_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue IO completion in async callback */
				return;
			}

			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to push data to memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt,
					      bdev_io->internal.bounce_iov.iov_base,
					      bdev_io->internal.bounce_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH);
	} else {
		bdev_io_push_bounce_data_done(bdev_io, rc);
	}
}

static inline void
_bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
{
	bdev_io->internal.data_transfer_cpl = cpl_cb;
	bdev_io_push_bounce_data(bdev_io);
}

static void
bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
{
	struct spdk_bdev_io *bdev_io;

	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len);
}
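
/*
 * Added commentary: spdk_iobuf_get() in the helper below either returns a
 * buffer synchronously or, when the pool is exhausted, queues internal.iobuf
 * and invokes bdev_io_get_iobuf_cb() later once a buffer is freed; both paths
 * end up in _bdev_io_set_buf().
 */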

static void
bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	uint64_t max_len;
	void *buf;

	assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	max_len = bdev_io_get_max_buf_len(bdev_io, len);

	if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) {
		SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
		bdev_io_get_buf_complete(bdev_io, false);
		return;
	}

	bdev_io->internal.buf_len = len;
	buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
			     bdev_io_get_iobuf_cb);
	if (buf != NULL) {
		_bdev_io_set_buf(bdev_io, buf, len);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t alignment;

	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	alignment = spdk_bdev_get_buf_align(bdev);

	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
		/* Buffer already present and aligned */
		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
		return;
	}

	bdev_io_get_buf(bdev_io, len);
}

static void
_bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
			      bool success)
{
	if (!success) {
		SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
			return;
		}
		/* For reads we'll execute the sequence after the data is read, so, for now, only
		 * clear out accel_sequence pointer and submit the IO */
		assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
		bdev_io->u.bdev.accel_sequence = NULL;
	}

	bdev_io_submit(bdev_io);
}

static void
_bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
			       uint64_t len)
{
	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	bdev_io_get_buf(bdev_io, len);
}

void
spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(cb != NULL);
	assert(bdev_io->internal.get_aux_buf_cb == NULL);
	bdev_io->internal.get_aux_buf_cb = cb;
	bdev_io_get_buf(bdev_io, len);
}

static int
bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}
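
/*
 * Added commentary: the helper below emits one "bdev_set_qos_limit" RPC object
 * per bdev that has QoS configured, e.g. (illustrative values, hypothetical
 * bdev name):
 *
 *	{ "method": "bdev_set_qos_limit",
 *	  "params": { "name": "Malloc0", "rw_ios_per_sec": 10000 } }
 */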
spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1859 1860 spdk_json_write_named_object_begin(w, "params"); 1861 spdk_json_write_named_string(w, "name", bdev->name); 1862 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1863 if (limits[i] > 0) { 1864 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1865 } 1866 } 1867 spdk_json_write_object_end(w); 1868 1869 spdk_json_write_object_end(w); 1870 } 1871 1872 void 1873 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1874 { 1875 struct spdk_bdev_module *bdev_module; 1876 struct spdk_bdev *bdev; 1877 1878 assert(w != NULL); 1879 1880 spdk_json_write_array_begin(w); 1881 1882 spdk_json_write_object_begin(w); 1883 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1884 spdk_json_write_named_object_begin(w, "params"); 1885 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1886 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1887 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1888 spdk_json_write_object_end(w); 1889 spdk_json_write_object_end(w); 1890 1891 bdev_examine_allowlist_config_json(w); 1892 1893 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1894 if (bdev_module->config_json) { 1895 bdev_module->config_json(w); 1896 } 1897 } 1898 1899 spdk_spin_lock(&g_bdev_mgr.spinlock); 1900 1901 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1902 if (bdev->fn_table->write_config_json) { 1903 bdev->fn_table->write_config_json(bdev, w); 1904 } 1905 1906 bdev_qos_config_json(bdev, w); 1907 } 1908 1909 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1910 1911 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1912 spdk_json_write_object_begin(w); 1913 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1914 spdk_json_write_object_end(w); 1915 1916 spdk_json_write_array_end(w); 1917 } 1918 1919 static void 1920 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1921 { 1922 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1923 struct spdk_bdev_io *bdev_io; 1924 1925 spdk_iobuf_channel_fini(&ch->iobuf); 1926 1927 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1928 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1929 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1930 ch->per_thread_cache_count--; 1931 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1932 } 1933 1934 assert(ch->per_thread_cache_count == 0); 1935 } 1936 1937 static int 1938 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1939 { 1940 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1941 struct spdk_bdev_io *bdev_io; 1942 uint32_t i; 1943 int rc; 1944 1945 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE); 1946 if (rc != 0) { 1947 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1948 return -1; 1949 } 1950 1951 STAILQ_INIT(&ch->per_thread_cache); 1952 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1953 1954 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. 
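 * A thread can then always make forward progress from its own cache, even if the global
 * bdev_io_pool is temporarily drained by other threads.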
*/ 1955 ch->per_thread_cache_count = 0; 1956 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1957 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1958 if (bdev_io == NULL) { 1959 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1960 assert(false); 1961 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1962 return -1; 1963 } 1964 ch->per_thread_cache_count++; 1965 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1966 } 1967 1968 TAILQ_INIT(&ch->shared_resources); 1969 TAILQ_INIT(&ch->io_wait_queue); 1970 1971 return 0; 1972 } 1973 1974 static void 1975 bdev_init_complete(int rc) 1976 { 1977 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1978 void *cb_arg = g_init_cb_arg; 1979 struct spdk_bdev_module *m; 1980 1981 g_bdev_mgr.init_complete = true; 1982 g_init_cb_fn = NULL; 1983 g_init_cb_arg = NULL; 1984 1985 /* 1986 * For modules that need to know when subsystem init is complete, 1987 * inform them now. 1988 */ 1989 if (rc == 0) { 1990 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1991 if (m->init_complete) { 1992 m->init_complete(); 1993 } 1994 } 1995 } 1996 1997 cb_fn(cb_arg, rc); 1998 } 1999 2000 static bool 2001 bdev_module_all_actions_completed(void) 2002 { 2003 struct spdk_bdev_module *m; 2004 2005 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2006 if (m->internal.action_in_progress > 0) { 2007 return false; 2008 } 2009 } 2010 return true; 2011 } 2012 2013 static void 2014 bdev_module_action_complete(void) 2015 { 2016 /* 2017 * Don't finish bdev subsystem initialization if 2018 * module pre-initialization is still in progress, or 2019 * the subsystem has already been initialized. 2020 */ 2021 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2022 return; 2023 } 2024 2025 /* 2026 * Check all bdev modules for inits/examinations in progress. If any 2027 * exist, return immediately since we cannot finish bdev subsystem 2028 * initialization until all are completed. 2029 */ 2030 if (!bdev_module_all_actions_completed()) { 2031 return; 2032 } 2033 2034 /* 2035 * Modules already finished initialization - now that all 2036 * the bdev modules have finished their asynchronous I/O 2037 * processing, the entire bdev layer can be marked as complete.
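 * bdev_init_complete(0) then runs each module's init_complete() hook and invokes the
 * callback that was passed to spdk_bdev_initialize().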
2038 */ 2039 bdev_init_complete(0); 2040 } 2041 2042 static void 2043 bdev_module_action_done(struct spdk_bdev_module *module) 2044 { 2045 spdk_spin_lock(&module->internal.spinlock); 2046 assert(module->internal.action_in_progress > 0); 2047 module->internal.action_in_progress--; 2048 spdk_spin_unlock(&module->internal.spinlock); 2049 bdev_module_action_complete(); 2050 } 2051 2052 void 2053 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2054 { 2055 assert(module->async_init); 2056 bdev_module_action_done(module); 2057 } 2058 2059 void 2060 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2061 { 2062 bdev_module_action_done(module); 2063 } 2064 2065 /** The last initialized bdev module */ 2066 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2067 2068 static void 2069 bdev_init_failed(void *cb_arg) 2070 { 2071 struct spdk_bdev_module *module = cb_arg; 2072 2073 spdk_spin_lock(&module->internal.spinlock); 2074 assert(module->internal.action_in_progress > 0); 2075 module->internal.action_in_progress--; 2076 spdk_spin_unlock(&module->internal.spinlock); 2077 bdev_init_complete(-1); 2078 } 2079 2080 static int 2081 bdev_modules_init(void) 2082 { 2083 struct spdk_bdev_module *module; 2084 int rc = 0; 2085 2086 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2087 g_resume_bdev_module = module; 2088 if (module->async_init) { 2089 spdk_spin_lock(&module->internal.spinlock); 2090 module->internal.action_in_progress = 1; 2091 spdk_spin_unlock(&module->internal.spinlock); 2092 } 2093 rc = module->module_init(); 2094 if (rc != 0) { 2095 /* Bump action_in_progress to prevent other modules from completion of modules_init 2096 * Send message to defer application shutdown until resources are cleaned up */ 2097 spdk_spin_lock(&module->internal.spinlock); 2098 module->internal.action_in_progress = 1; 2099 spdk_spin_unlock(&module->internal.spinlock); 2100 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2101 return rc; 2102 } 2103 } 2104 2105 g_resume_bdev_module = NULL; 2106 return 0; 2107 } 2108 2109 void 2110 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2111 { 2112 int rc = 0; 2113 char mempool_name[32]; 2114 2115 assert(cb_fn != NULL); 2116 2117 g_init_cb_fn = cb_fn; 2118 g_init_cb_arg = cb_arg; 2119 2120 spdk_notify_type_register("bdev_register"); 2121 spdk_notify_type_register("bdev_unregister"); 2122 2123 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2124 2125 rc = spdk_iobuf_register_module("bdev"); 2126 if (rc != 0) { 2127 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2128 bdev_init_complete(-1); 2129 return; 2130 } 2131 2132 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2133 g_bdev_opts.bdev_io_pool_size, 2134 sizeof(struct spdk_bdev_io) + 2135 bdev_module_get_max_ctx_size(), 2136 0, 2137 SPDK_ENV_SOCKET_ID_ANY); 2138 2139 if (g_bdev_mgr.bdev_io_pool == NULL) { 2140 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2141 bdev_init_complete(-1); 2142 return; 2143 } 2144 2145 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2146 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2147 if (!g_bdev_mgr.zero_buffer) { 2148 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2149 bdev_init_complete(-1); 2150 return; 2151 } 2152 2153 #ifdef SPDK_CONFIG_VTUNE 2154 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2155 #endif 2156 2157 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2158 
bdev_mgmt_channel_destroy, 2159 sizeof(struct spdk_bdev_mgmt_channel), 2160 "bdev_mgr"); 2161 2162 rc = bdev_modules_init(); 2163 g_bdev_mgr.module_init_complete = true; 2164 if (rc != 0) { 2165 SPDK_ERRLOG("bdev modules init failed\n"); 2166 return; 2167 } 2168 2169 bdev_module_action_complete(); 2170 } 2171 2172 static void 2173 bdev_mgr_unregister_cb(void *io_device) 2174 { 2175 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2176 2177 if (g_bdev_mgr.bdev_io_pool) { 2178 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2179 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2180 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2181 g_bdev_opts.bdev_io_pool_size); 2182 } 2183 2184 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2185 } 2186 2187 spdk_free(g_bdev_mgr.zero_buffer); 2188 2189 bdev_examine_allowlist_free(); 2190 2191 cb_fn(g_fini_cb_arg); 2192 g_fini_cb_fn = NULL; 2193 g_fini_cb_arg = NULL; 2194 g_bdev_mgr.init_complete = false; 2195 g_bdev_mgr.module_init_complete = false; 2196 } 2197 2198 static void 2199 bdev_module_fini_iter(void *arg) 2200 { 2201 struct spdk_bdev_module *bdev_module; 2202 2203 /* FIXME: Handling initialization failures is broken now, 2204 * so we won't even try cleaning up after successfully 2205 * initialized modules. if module_init_complete is false, 2206 * just call spdk_bdev_mgr_unregister_cb 2207 */ 2208 if (!g_bdev_mgr.module_init_complete) { 2209 bdev_mgr_unregister_cb(NULL); 2210 return; 2211 } 2212 2213 /* Start iterating from the last touched module */ 2214 if (!g_resume_bdev_module) { 2215 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2216 } else { 2217 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2218 internal.tailq); 2219 } 2220 2221 while (bdev_module) { 2222 if (bdev_module->async_fini) { 2223 /* Save our place so we can resume later. We must 2224 * save the variable here, before calling module_fini() 2225 * below, because in some cases the module may immediately 2226 * call spdk_bdev_module_fini_done() and re-enter 2227 * this function to continue iterating. */ 2228 g_resume_bdev_module = bdev_module; 2229 } 2230 2231 if (bdev_module->module_fini) { 2232 bdev_module->module_fini(); 2233 } 2234 2235 if (bdev_module->async_fini) { 2236 return; 2237 } 2238 2239 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2240 internal.tailq); 2241 } 2242 2243 g_resume_bdev_module = NULL; 2244 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2245 } 2246 2247 void 2248 spdk_bdev_module_fini_done(void) 2249 { 2250 if (spdk_get_thread() != g_fini_thread) { 2251 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2252 } else { 2253 bdev_module_fini_iter(NULL); 2254 } 2255 } 2256 2257 static void 2258 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2259 { 2260 struct spdk_bdev *bdev = cb_arg; 2261 2262 if (bdeverrno && bdev) { 2263 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2264 bdev->name); 2265 2266 /* 2267 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2268 * bdev; try to continue by manually removing this bdev from the list and continue 2269 * with the next bdev in the list. 
2270 */ 2271 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2272 } 2273 2274 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2275 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2276 /* 2277 * Bdev module finish needs to be deferred as we might be in the middle of some context 2278 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2279 * after returning. 2280 */ 2281 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2282 return; 2283 } 2284 2285 /* 2286 * Unregister the last unclaimed bdev in the list, to ensure that bdev subsystem 2287 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2288 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2289 * base bdevs. 2290 * 2291 * Also, walk the list in reverse order. 2292 */ 2293 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2294 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2295 spdk_spin_lock(&bdev->internal.spinlock); 2296 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2297 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2298 spdk_spin_unlock(&bdev->internal.spinlock); 2299 continue; 2300 } 2301 spdk_spin_unlock(&bdev->internal.spinlock); 2302 2303 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2304 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2305 return; 2306 } 2307 2308 /* 2309 * If any bdev fails to unclaim its underlying bdev properly, we may face the 2310 * case of a bdev list consisting of claimed bdevs only (if claims are managed 2311 * correctly, this would mean there's a loop in the claims graph, which is 2312 * clearly impossible). In that case, warn and unregister the last bdev on the list. 2313 */ 2314 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2315 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2316 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2317 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2318 return; 2319 } 2320 } 2321 2322 static void 2323 bdev_module_fini_start_iter(void *arg) 2324 { 2325 struct spdk_bdev_module *bdev_module; 2326 2327 if (!g_resume_bdev_module) { 2328 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2329 } else { 2330 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2331 } 2332 2333 while (bdev_module) { 2334 if (bdev_module->async_fini_start) { 2335 /* Save our place so we can resume later. We must 2336 * save the variable here, before calling fini_start() 2337 * below, because in some cases the module may immediately 2338 * call spdk_bdev_module_fini_start_done() and re-enter 2339 * this function to continue iterating.
*/ 2340 g_resume_bdev_module = bdev_module; 2341 } 2342 2343 if (bdev_module->fini_start) { 2344 bdev_module->fini_start(); 2345 } 2346 2347 if (bdev_module->async_fini_start) { 2348 return; 2349 } 2350 2351 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2352 } 2353 2354 g_resume_bdev_module = NULL; 2355 2356 bdev_finish_unregister_bdevs_iter(NULL, 0); 2357 } 2358 2359 void 2360 spdk_bdev_module_fini_start_done(void) 2361 { 2362 if (spdk_get_thread() != g_fini_thread) { 2363 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2364 } else { 2365 bdev_module_fini_start_iter(NULL); 2366 } 2367 } 2368 2369 static void 2370 bdev_finish_wait_for_examine_done(void *cb_arg) 2371 { 2372 bdev_module_fini_start_iter(NULL); 2373 } 2374 2375 static void bdev_open_async_fini(void); 2376 2377 void 2378 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2379 { 2380 int rc; 2381 2382 assert(cb_fn != NULL); 2383 2384 g_fini_thread = spdk_get_thread(); 2385 2386 g_fini_cb_fn = cb_fn; 2387 g_fini_cb_arg = cb_arg; 2388 2389 bdev_open_async_fini(); 2390 2391 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2392 if (rc != 0) { 2393 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2394 bdev_finish_wait_for_examine_done(NULL); 2395 } 2396 } 2397 2398 struct spdk_bdev_io * 2399 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2400 { 2401 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2402 struct spdk_bdev_io *bdev_io; 2403 2404 if (ch->per_thread_cache_count > 0) { 2405 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2406 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2407 ch->per_thread_cache_count--; 2408 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2409 /* 2410 * Don't try to look for bdev_ios in the global pool if there are 2411 * waiters on bdev_ios - we don't want this caller to jump the line. 2412 */ 2413 bdev_io = NULL; 2414 } else { 2415 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2416 } 2417 2418 return bdev_io; 2419 } 2420 2421 void 2422 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2423 { 2424 struct spdk_bdev_mgmt_channel *ch; 2425 2426 assert(bdev_io != NULL); 2427 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2428 2429 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2430 2431 if (bdev_io->internal.buf != NULL) { 2432 bdev_io_put_buf(bdev_io); 2433 } 2434 2435 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2436 ch->per_thread_cache_count++; 2437 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2438 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2439 struct spdk_bdev_io_wait_entry *entry; 2440 2441 entry = TAILQ_FIRST(&ch->io_wait_queue); 2442 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2443 entry->cb_fn(entry->cb_arg); 2444 } 2445 } else { 2446 /* We should never have a full cache with entries on the io wait queue. 
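 * Waiters are only queued when no bdev_io could be allocated at all, and they are serviced
 * above as soon as an entry is returned to a cache that is not yet full.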
*/ 2447 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2448 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2449 } 2450 } 2451 2452 static bool 2453 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2454 { 2455 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2456 2457 switch (limit) { 2458 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2459 return true; 2460 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2461 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2462 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2463 return false; 2464 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2465 default: 2466 return false; 2467 } 2468 } 2469 2470 static bool 2471 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2472 { 2473 switch (bdev_io->type) { 2474 case SPDK_BDEV_IO_TYPE_NVME_IO: 2475 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2476 case SPDK_BDEV_IO_TYPE_READ: 2477 case SPDK_BDEV_IO_TYPE_WRITE: 2478 return true; 2479 case SPDK_BDEV_IO_TYPE_ZCOPY: 2480 if (bdev_io->u.bdev.zcopy.start) { 2481 return true; 2482 } else { 2483 return false; 2484 } 2485 default: 2486 return false; 2487 } 2488 } 2489 2490 static bool 2491 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2492 { 2493 switch (bdev_io->type) { 2494 case SPDK_BDEV_IO_TYPE_NVME_IO: 2495 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2496 /* Bit 1 (0x2) set for read operation */ 2497 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2498 return true; 2499 } else { 2500 return false; 2501 } 2502 case SPDK_BDEV_IO_TYPE_READ: 2503 return true; 2504 case SPDK_BDEV_IO_TYPE_ZCOPY: 2505 /* Populate to read from disk */ 2506 if (bdev_io->u.bdev.zcopy.populate) { 2507 return true; 2508 } else { 2509 return false; 2510 } 2511 default: 2512 return false; 2513 } 2514 } 2515 2516 static uint64_t 2517 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2518 { 2519 struct spdk_bdev *bdev = bdev_io->bdev; 2520 2521 switch (bdev_io->type) { 2522 case SPDK_BDEV_IO_TYPE_NVME_IO: 2523 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2524 return bdev_io->u.nvme_passthru.nbytes; 2525 case SPDK_BDEV_IO_TYPE_READ: 2526 case SPDK_BDEV_IO_TYPE_WRITE: 2527 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2528 case SPDK_BDEV_IO_TYPE_ZCOPY: 2529 /* Track the data in the start phase only */ 2530 if (bdev_io->u.bdev.zcopy.start) { 2531 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2532 } else { 2533 return 0; 2534 } 2535 default: 2536 return 0; 2537 } 2538 } 2539 2540 static bool 2541 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2542 { 2543 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2544 return true; 2545 } else { 2546 return false; 2547 } 2548 } 2549 2550 static bool 2551 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2552 { 2553 if (bdev_is_read_io(io) == false) { 2554 return false; 2555 } 2556 2557 return bdev_qos_rw_queue_io(limit, io); 2558 } 2559 2560 static bool 2561 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2562 { 2563 if (bdev_is_read_io(io) == true) { 2564 return false; 2565 } 2566 2567 return bdev_qos_rw_queue_io(limit, io); 2568 } 2569 2570 static void 2571 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2572 { 2573 limit->remaining_this_timeslice--; 2574 } 2575 2576 static void 2577 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2578 { 2579 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2580 } 2581 2582 static void 2583 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2584 { 2585 if (bdev_is_read_io(io) == false) { 2586 return; 2587 } 2588 2589 return bdev_qos_rw_bps_update_quota(limit, io); 2590 } 2591 2592 static void 2593 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2594 { 2595 if (bdev_is_read_io(io) == true) { 2596 return; 2597 } 2598 2599 return bdev_qos_rw_bps_update_quota(limit, io); 2600 } 2601 2602 static void 2603 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2604 { 2605 int i; 2606 2607 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2608 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2609 qos->rate_limits[i].queue_io = NULL; 2610 qos->rate_limits[i].update_quota = NULL; 2611 continue; 2612 } 2613 2614 switch (i) { 2615 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2616 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2617 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2618 break; 2619 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2620 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2621 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2622 break; 2623 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2624 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2625 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2626 break; 2627 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2628 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2629 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2630 break; 2631 default: 2632 break; 2633 } 2634 } 2635 } 2636 2637 static void 2638 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2639 struct spdk_bdev_io *bdev_io, 2640 enum spdk_bdev_io_status status) 2641 { 2642 bdev_io->internal.in_submit_request = true; 2643 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2644 spdk_bdev_io_complete(bdev_io, status); 2645 bdev_io->internal.in_submit_request = false; 2646 } 2647 2648 static inline void 2649 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2650 { 2651 struct spdk_bdev *bdev = bdev_io->bdev; 2652 struct spdk_io_channel *ch = bdev_ch->channel; 2653 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2654 2655 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2656 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2657 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2658 2659 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2660 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2661 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2662 SPDK_BDEV_IO_STATUS_SUCCESS); 2663 return; 2664 } 2665 } 2666 2667 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2668 bdev_io->bdev->split_on_write_unit && 2669 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2670 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2671 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2672 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2673 return; 2674 } 2675 2676 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2677 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2678 bdev_io->internal.in_submit_request = true; 2679 bdev_submit_request(bdev, ch, bdev_io); 2680 bdev_io->internal.in_submit_request = false; 2681 } else { 2682 bdev_queue_nomem_io_tail(shared_resource, bdev_io, 
BDEV_IO_RETRY_STATE_SUBMIT); 2683 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2684 /* Special case when we have nomem IOs and no outstanding IOs which completions 2685 * could trigger retry of queued IOs */ 2686 bdev_shared_ch_retry_io(shared_resource); 2687 } 2688 } 2689 } 2690 2691 static bool 2692 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2693 { 2694 int i; 2695 2696 if (bdev_qos_io_to_limit(bdev_io) == true) { 2697 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2698 if (!qos->rate_limits[i].queue_io) { 2699 continue; 2700 } 2701 2702 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2703 bdev_io) == true) { 2704 return true; 2705 } 2706 } 2707 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2708 if (!qos->rate_limits[i].update_quota) { 2709 continue; 2710 } 2711 2712 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2713 } 2714 } 2715 2716 return false; 2717 } 2718 2719 static inline void 2720 _bdev_io_do_submit(void *ctx) 2721 { 2722 struct spdk_bdev_io *bdev_io = ctx; 2723 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2724 2725 bdev_io_do_submit(ch, bdev_io); 2726 } 2727 2728 static int 2729 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2730 { 2731 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2732 int submitted_ios = 0; 2733 2734 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2735 if (!bdev_qos_queue_io(qos, bdev_io)) { 2736 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2737 2738 if (bdev_io->internal.io_submit_ch) { 2739 /* Send back the IO to the original thread for the actual processing. */ 2740 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2741 bdev_io->internal.io_submit_ch = NULL; 2742 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2743 _bdev_io_do_submit, bdev_io); 2744 } else { 2745 bdev_io_do_submit(ch, bdev_io); 2746 } 2747 2748 submitted_ios++; 2749 } 2750 } 2751 2752 return submitted_ios; 2753 } 2754 2755 static void 2756 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2757 { 2758 int rc; 2759 2760 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2761 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2762 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2763 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2764 &bdev_io->internal.waitq_entry); 2765 if (rc != 0) { 2766 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2767 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2768 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2769 } 2770 } 2771 2772 static bool 2773 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2774 { 2775 uint32_t io_boundary; 2776 struct spdk_bdev *bdev = bdev_io->bdev; 2777 uint32_t max_size = bdev->max_segment_size; 2778 int max_segs = bdev->max_num_segments; 2779 2780 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2781 io_boundary = bdev->write_unit_size; 2782 } else if (bdev->split_on_optimal_io_boundary) { 2783 io_boundary = bdev->optimal_io_boundary; 2784 } else { 2785 io_boundary = 0; 2786 } 2787 2788 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2789 return false; 2790 } 2791 2792 if (io_boundary) { 2793 uint64_t start_stripe, end_stripe; 2794 2795 start_stripe = bdev_io->u.bdev.offset_blocks; 2796 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2797 /* Avoid expensive div operations if possible. 
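For example, with an optimal_io_boundary of 128 blocks (a power of two), the stripe numbers below are computed with a right shift by spdk_u32log2(128) = 7 instead of two divisions.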
These spdk_u32 functions are very cheap. */ 2798 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2799 start_stripe >>= spdk_u32log2(io_boundary); 2800 end_stripe >>= spdk_u32log2(io_boundary); 2801 } else { 2802 start_stripe /= io_boundary; 2803 end_stripe /= io_boundary; 2804 } 2805 2806 if (start_stripe != end_stripe) { 2807 return true; 2808 } 2809 } 2810 2811 if (max_segs) { 2812 if (bdev_io->u.bdev.iovcnt > max_segs) { 2813 return true; 2814 } 2815 } 2816 2817 if (max_size) { 2818 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2819 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2820 return true; 2821 } 2822 } 2823 } 2824 2825 return false; 2826 } 2827 2828 static bool 2829 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2830 { 2831 uint32_t num_unmap_segments; 2832 2833 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2834 return false; 2835 } 2836 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2837 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2838 return true; 2839 } 2840 2841 return false; 2842 } 2843 2844 static bool 2845 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2846 { 2847 if (!bdev_io->bdev->max_write_zeroes) { 2848 return false; 2849 } 2850 2851 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2852 return true; 2853 } 2854 2855 return false; 2856 } 2857 2858 static bool 2859 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2860 { 2861 if (bdev_io->bdev->max_copy != 0 && 2862 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2863 return true; 2864 } 2865 2866 return false; 2867 } 2868 2869 static bool 2870 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2871 { 2872 switch (bdev_io->type) { 2873 case SPDK_BDEV_IO_TYPE_READ: 2874 case SPDK_BDEV_IO_TYPE_WRITE: 2875 return bdev_rw_should_split(bdev_io); 2876 case SPDK_BDEV_IO_TYPE_UNMAP: 2877 return bdev_unmap_should_split(bdev_io); 2878 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2879 return bdev_write_zeroes_should_split(bdev_io); 2880 case SPDK_BDEV_IO_TYPE_COPY: 2881 return bdev_copy_should_split(bdev_io); 2882 default: 2883 return false; 2884 } 2885 } 2886 2887 static uint32_t 2888 _to_next_boundary(uint64_t offset, uint32_t boundary) 2889 { 2890 return (boundary - (offset % boundary)); 2891 } 2892 2893 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2894 2895 static void _bdev_rw_split(void *_bdev_io); 2896 2897 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2898 2899 static void 2900 _bdev_unmap_split(void *_bdev_io) 2901 { 2902 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2903 } 2904 2905 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2906 2907 static void 2908 _bdev_write_zeroes_split(void *_bdev_io) 2909 { 2910 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2911 } 2912 2913 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2914 2915 static void 2916 _bdev_copy_split(void *_bdev_io) 2917 { 2918 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2919 } 2920 2921 static int 2922 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2923 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2924 { 2925 int rc; 2926 uint64_t current_offset, current_remaining, current_src_offset; 2927 spdk_bdev_io_wait_cb io_wait_fn; 2928 2929 current_offset = *offset; 2930 current_remaining = *remaining; 2931 2932 
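	/* Account for this child before submitting it; bdev_io_split_done() decrements
	 * split_outstanding and only completes or continues the parent once it drops back to zero. */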
bdev_io->u.bdev.split_outstanding++; 2933 2934 io_wait_fn = _bdev_rw_split; 2935 switch (bdev_io->type) { 2936 case SPDK_BDEV_IO_TYPE_READ: 2937 assert(bdev_io->u.bdev.accel_sequence == NULL); 2938 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2939 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2940 iov, iovcnt, md_buf, current_offset, 2941 num_blocks, bdev_io->internal.memory_domain, 2942 bdev_io->internal.memory_domain_ctx, NULL, 2943 bdev_io_split_done, bdev_io); 2944 break; 2945 case SPDK_BDEV_IO_TYPE_WRITE: 2946 assert(bdev_io->u.bdev.accel_sequence == NULL); 2947 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2948 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2949 iov, iovcnt, md_buf, current_offset, 2950 num_blocks, bdev_io->internal.memory_domain, 2951 bdev_io->internal.memory_domain_ctx, NULL, 2952 bdev_io_split_done, bdev_io); 2953 break; 2954 case SPDK_BDEV_IO_TYPE_UNMAP: 2955 io_wait_fn = _bdev_unmap_split; 2956 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2957 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2958 current_offset, num_blocks, 2959 bdev_io_split_done, bdev_io); 2960 break; 2961 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2962 io_wait_fn = _bdev_write_zeroes_split; 2963 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2964 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2965 current_offset, num_blocks, 2966 bdev_io_split_done, bdev_io); 2967 break; 2968 case SPDK_BDEV_IO_TYPE_COPY: 2969 io_wait_fn = _bdev_copy_split; 2970 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2971 (current_offset - bdev_io->u.bdev.offset_blocks); 2972 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2973 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2974 current_offset, current_src_offset, num_blocks, 2975 bdev_io_split_done, bdev_io); 2976 break; 2977 default: 2978 assert(false); 2979 rc = -EINVAL; 2980 break; 2981 } 2982 2983 if (rc == 0) { 2984 current_offset += num_blocks; 2985 current_remaining -= num_blocks; 2986 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2987 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2988 *offset = current_offset; 2989 *remaining = current_remaining; 2990 } else { 2991 bdev_io->u.bdev.split_outstanding--; 2992 if (rc == -ENOMEM) { 2993 if (bdev_io->u.bdev.split_outstanding == 0) { 2994 /* No I/O is outstanding. Hence we should wait here. 
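 * bdev_queue_io_wait_with_cb() re-invokes io_wait_fn once a bdev_io becomes available,
 * restarting the split from the saved split_current_offset_blocks.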
*/ 2995 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2996 } 2997 } else { 2998 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2999 if (bdev_io->u.bdev.split_outstanding == 0) { 3000 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 3001 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 3002 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3003 } 3004 } 3005 } 3006 3007 return rc; 3008 } 3009 3010 static void 3011 _bdev_rw_split(void *_bdev_io) 3012 { 3013 struct iovec *parent_iov, *iov; 3014 struct spdk_bdev_io *bdev_io = _bdev_io; 3015 struct spdk_bdev *bdev = bdev_io->bdev; 3016 uint64_t parent_offset, current_offset, remaining; 3017 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3018 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3019 uint32_t iovcnt, iov_len, child_iovsize; 3020 uint32_t blocklen = bdev->blocklen; 3021 uint32_t io_boundary; 3022 uint32_t max_segment_size = bdev->max_segment_size; 3023 uint32_t max_child_iovcnt = bdev->max_num_segments; 3024 void *md_buf = NULL; 3025 int rc; 3026 3027 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3028 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3029 SPDK_BDEV_IO_NUM_CHILD_IOV; 3030 3031 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3032 io_boundary = bdev->write_unit_size; 3033 } else if (bdev->split_on_optimal_io_boundary) { 3034 io_boundary = bdev->optimal_io_boundary; 3035 } else { 3036 io_boundary = UINT32_MAX; 3037 } 3038 3039 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3040 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 3041 parent_offset = bdev_io->u.bdev.offset_blocks; 3042 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3043 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3044 3045 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3046 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3047 if (parent_iov_offset < parent_iov->iov_len) { 3048 break; 3049 } 3050 parent_iov_offset -= parent_iov->iov_len; 3051 } 3052 3053 child_iovcnt = 0; 3054 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3055 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3056 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3057 to_next_boundary = spdk_min(remaining, to_next_boundary); 3058 to_next_boundary_bytes = to_next_boundary * blocklen; 3059 3060 iov = &bdev_io->child_iov[child_iovcnt]; 3061 iovcnt = 0; 3062 3063 if (bdev_io->u.bdev.md_buf) { 3064 md_buf = (char *)bdev_io->u.bdev.md_buf + 3065 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3066 } 3067 3068 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3069 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3070 iovcnt < child_iovsize) { 3071 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3072 iov_len = parent_iov->iov_len - parent_iov_offset; 3073 3074 iov_len = spdk_min(iov_len, max_segment_size); 3075 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3076 to_next_boundary_bytes -= iov_len; 3077 3078 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3079 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3080 3081 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3082 parent_iov_offset += iov_len; 3083 } else { 3084 parent_iovpos++; 3085 
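	/* the current parent iov is fully consumed, reset the offset for the next one */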
parent_iov_offset = 0; 3086 } 3087 child_iovcnt++; 3088 iovcnt++; 3089 } 3090 3091 if (to_next_boundary_bytes > 0) { 3092 /* We had to stop this child I/O early because we ran out of 3093 * child_iov space or were limited by max_num_segments. 3094 * Ensure the iovs are aligned to the block size and 3095 * then adjust to_next_boundary before starting the 3096 * child I/O. 3097 */ 3098 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 3099 iovcnt == child_iovsize); 3100 to_last_block_bytes = to_next_boundary_bytes % blocklen; 3101 if (to_last_block_bytes != 0) { 3102 uint32_t child_iovpos = child_iovcnt - 1; 3103 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV, 3104 * so the loop will naturally end 3105 */ 3106 3107 to_last_block_bytes = blocklen - to_last_block_bytes; 3108 to_next_boundary_bytes += to_last_block_bytes; 3109 while (to_last_block_bytes > 0 && iovcnt > 0) { 3110 iov_len = spdk_min(to_last_block_bytes, 3111 bdev_io->child_iov[child_iovpos].iov_len); 3112 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 3113 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 3114 child_iovpos--; 3115 if (--iovcnt == 0) { 3116 /* If the child IO is less than a block size, just return. 3117 * If the first child IO of any split round is less than 3118 * a block size, exit with an error. 3119 */ 3120 if (bdev_io->u.bdev.split_outstanding == 0) { 3121 SPDK_ERRLOG("The first child io was less than a block size\n"); 3122 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3123 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 3124 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 3125 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3126 } 3127 3128 return; 3129 } 3130 } 3131 3132 to_last_block_bytes -= iov_len; 3133 3134 if (parent_iov_offset == 0) { 3135 parent_iovpos--; 3136 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 3137 } 3138 parent_iov_offset -= iov_len; 3139 } 3140 3141 assert(to_last_block_bytes == 0); 3142 } 3143 to_next_boundary -= to_next_boundary_bytes / blocklen; 3144 } 3145 3146 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 3147 &current_offset, &remaining); 3148 if (spdk_unlikely(rc)) { 3149 return; 3150 } 3151 } 3152 } 3153 3154 static void 3155 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 3156 { 3157 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 3158 uint32_t num_children_reqs = 0; 3159 int rc; 3160 3161 offset = bdev_io->u.bdev.split_current_offset_blocks; 3162 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3163 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 3164 3165 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3166 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 3167 3168 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 3169 &offset, &remaining); 3170 if (spdk_likely(rc == 0)) { 3171 num_children_reqs++; 3172 } else { 3173 return; 3174 } 3175 } 3176 } 3177 3178 static void 3179 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3180 { 3181 uint64_t offset, write_zeroes_blocks, remaining; 3182 uint32_t num_children_reqs = 0; 3183 int rc; 3184 3185 offset = bdev_io->u.bdev.split_current_offset_blocks; 3186 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3187 3188 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3189 write_zeroes_blocks =
spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3190 3191 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3192 &offset, &remaining); 3193 if (spdk_likely(rc == 0)) { 3194 num_children_reqs++; 3195 } else { 3196 return; 3197 } 3198 } 3199 } 3200 3201 static void 3202 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3203 { 3204 uint64_t offset, copy_blocks, remaining; 3205 uint32_t num_children_reqs = 0; 3206 int rc; 3207 3208 offset = bdev_io->u.bdev.split_current_offset_blocks; 3209 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3210 3211 assert(bdev_io->bdev->max_copy != 0); 3212 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3213 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3214 3215 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3216 &offset, &remaining); 3217 if (spdk_likely(rc == 0)) { 3218 num_children_reqs++; 3219 } else { 3220 return; 3221 } 3222 } 3223 } 3224 3225 static void 3226 parent_bdev_io_complete(void *ctx, int rc) 3227 { 3228 struct spdk_bdev_io *parent_io = ctx; 3229 3230 if (rc) { 3231 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3232 } 3233 3234 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3235 parent_io->internal.caller_ctx); 3236 } 3237 3238 static void 3239 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3240 { 3241 struct spdk_bdev_io *bdev_io = ctx; 3242 3243 /* u.bdev.accel_sequence should have already been cleared at this point */ 3244 assert(bdev_io->u.bdev.accel_sequence == NULL); 3245 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3246 bdev_io->internal.accel_sequence = NULL; 3247 3248 if (spdk_unlikely(status != 0)) { 3249 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3250 } 3251 3252 parent_bdev_io_complete(bdev_io, status); 3253 } 3254 3255 static void 3256 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3257 { 3258 struct spdk_bdev_io *parent_io = cb_arg; 3259 3260 spdk_bdev_free_io(bdev_io); 3261 3262 if (!success) { 3263 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3264 /* If any child I/O failed, stop further splitting process. */ 3265 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3266 parent_io->u.bdev.split_remaining_num_blocks = 0; 3267 } 3268 parent_io->u.bdev.split_outstanding--; 3269 if (parent_io->u.bdev.split_outstanding != 0) { 3270 return; 3271 } 3272 3273 /* 3274 * Parent I/O finishes when all blocks are consumed. 
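 * That is, split_remaining_num_blocks has reached zero and no child I/O remains outstanding.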
3275 */ 3276 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3277 assert(parent_io->internal.cb != bdev_io_split_done); 3278 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3279 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3280 3281 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3282 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3283 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3284 return; 3285 } else if (parent_io->internal.orig_iovcnt != 0 && 3286 !bdev_io_use_accel_sequence(bdev_io)) { 3287 /* bdev IO will be completed in the callback */ 3288 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3289 return; 3290 } 3291 } 3292 3293 parent_bdev_io_complete(parent_io, 0); 3294 return; 3295 } 3296 3297 /* 3298 * Continue with the splitting process. This function will complete the parent I/O if the 3299 * splitting is done. 3300 */ 3301 switch (parent_io->type) { 3302 case SPDK_BDEV_IO_TYPE_READ: 3303 case SPDK_BDEV_IO_TYPE_WRITE: 3304 _bdev_rw_split(parent_io); 3305 break; 3306 case SPDK_BDEV_IO_TYPE_UNMAP: 3307 bdev_unmap_split(parent_io); 3308 break; 3309 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3310 bdev_write_zeroes_split(parent_io); 3311 break; 3312 case SPDK_BDEV_IO_TYPE_COPY: 3313 bdev_copy_split(parent_io); 3314 break; 3315 default: 3316 assert(false); 3317 break; 3318 } 3319 } 3320 3321 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3322 bool success); 3323 3324 static void 3325 bdev_io_split(struct spdk_bdev_io *bdev_io) 3326 { 3327 assert(bdev_io_should_split(bdev_io)); 3328 3329 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3330 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3331 bdev_io->u.bdev.split_outstanding = 0; 3332 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3333 3334 switch (bdev_io->type) { 3335 case SPDK_BDEV_IO_TYPE_READ: 3336 case SPDK_BDEV_IO_TYPE_WRITE: 3337 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3338 _bdev_rw_split(bdev_io); 3339 } else { 3340 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3341 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3342 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3343 } 3344 break; 3345 case SPDK_BDEV_IO_TYPE_UNMAP: 3346 bdev_unmap_split(bdev_io); 3347 break; 3348 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3349 bdev_write_zeroes_split(bdev_io); 3350 break; 3351 case SPDK_BDEV_IO_TYPE_COPY: 3352 bdev_copy_split(bdev_io); 3353 break; 3354 default: 3355 assert(false); 3356 break; 3357 } 3358 } 3359 3360 static void 3361 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3362 { 3363 if (!success) { 3364 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3365 return; 3366 } 3367 3368 _bdev_rw_split(bdev_io); 3369 } 3370 3371 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3372 * be inlined, at least on some compilers. 
3373 */ 3374 static inline void 3375 _bdev_io_submit(void *ctx) 3376 { 3377 struct spdk_bdev_io *bdev_io = ctx; 3378 struct spdk_bdev *bdev = bdev_io->bdev; 3379 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3380 3381 if (spdk_likely(bdev_ch->flags == 0)) { 3382 bdev_io_do_submit(bdev_ch, bdev_io); 3383 return; 3384 } 3385 3386 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3387 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3388 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3389 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3390 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 3391 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3392 } else { 3393 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 3394 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3395 } 3396 } else { 3397 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3398 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3399 } 3400 } 3401 3402 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3403 3404 bool 3405 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3406 { 3407 if (range1->length == 0 || range2->length == 0) { 3408 return false; 3409 } 3410 3411 if (range1->offset + range1->length <= range2->offset) { 3412 return false; 3413 } 3414 3415 if (range2->offset + range2->length <= range1->offset) { 3416 return false; 3417 } 3418 3419 return true; 3420 } 3421 3422 static bool 3423 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3424 { 3425 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3426 struct lba_range r; 3427 3428 switch (bdev_io->type) { 3429 case SPDK_BDEV_IO_TYPE_NVME_IO: 3430 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3431 /* Don't try to decode the NVMe command - just assume worst-case and that 3432 * it overlaps a locked range. 3433 */ 3434 return true; 3435 case SPDK_BDEV_IO_TYPE_WRITE: 3436 case SPDK_BDEV_IO_TYPE_UNMAP: 3437 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3438 case SPDK_BDEV_IO_TYPE_ZCOPY: 3439 case SPDK_BDEV_IO_TYPE_COPY: 3440 r.offset = bdev_io->u.bdev.offset_blocks; 3441 r.length = bdev_io->u.bdev.num_blocks; 3442 if (!bdev_lba_range_overlapped(range, &r)) { 3443 /* This I/O doesn't overlap the specified LBA range. */ 3444 return false; 3445 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3446 /* This I/O overlaps, but the I/O is on the same channel that locked this 3447 * range, and the caller_ctx is the same as the locked_ctx. This means 3448 * that this I/O is associated with the lock, and is allowed to execute. 
3449 */ 3450 return false; 3451 } else { 3452 return true; 3453 } 3454 default: 3455 return false; 3456 } 3457 } 3458 3459 void 3460 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3461 { 3462 struct spdk_bdev *bdev = bdev_io->bdev; 3463 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3464 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3465 3466 assert(thread != NULL); 3467 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3468 3469 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3470 struct lba_range *range; 3471 3472 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3473 if (bdev_io_range_is_locked(bdev_io, range)) { 3474 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3475 return; 3476 } 3477 } 3478 } 3479 3480 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3481 3482 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3483 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3484 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3485 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3486 spdk_bdev_get_name(bdev)); 3487 3488 if (bdev_io->internal.split) { 3489 bdev_io_split(bdev_io); 3490 return; 3491 } 3492 3493 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3494 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3495 _bdev_io_submit(bdev_io); 3496 } else { 3497 bdev_io->internal.io_submit_ch = ch; 3498 bdev_io->internal.ch = bdev->internal.qos->ch; 3499 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3500 } 3501 } else { 3502 _bdev_io_submit(bdev_io); 3503 } 3504 } 3505 3506 static inline void 3507 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3508 { 3509 /* The bdev doesn't support memory domains, so the buffers in this IO request can't 3510 * be accessed directly. We need to allocate local buffers before issuing the IO operation. 3511 * For a write operation, the data must be pulled from the memory domain before submitting the IO. 3512 * Once a read operation completes, the memory domain push functionality is used to 3513 * update the data in the original memory domain IO buffers. 3514 * This IO request will then go through a regular IO flow, so clear the memory domain pointers. */ 3515 bdev_io->u.bdev.memory_domain = NULL; 3516 bdev_io->u.bdev.memory_domain_ctx = NULL; 3517 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3518 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3519 } 3520 3521 static inline void 3522 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3523 { 3524 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3525 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3526 3527 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3528 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3529 bdev_io_complete_unsubmitted(bdev_io); 3530 return; 3531 } 3532 3533 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3534 * support them, but we need to execute an accel sequence and the data buffer is from accel 3535 * memory domain (to avoid doing a push/pull from that domain).
3536 */ 3537 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3538 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3539 _bdev_io_ext_use_bounce_buffer(bdev_io); 3540 return; 3541 } 3542 3543 if (needs_exec) { 3544 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3545 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3546 return; 3547 } 3548 /* For reads we'll execute the sequence after the data is read, so, for now, only 3549 * clear out accel_sequence pointer and submit the IO */ 3550 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3551 bdev_io->u.bdev.accel_sequence = NULL; 3552 } 3553 3554 bdev_io_submit(bdev_io); 3555 } 3556 3557 static void 3558 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3559 { 3560 struct spdk_bdev *bdev = bdev_io->bdev; 3561 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3562 struct spdk_io_channel *ch = bdev_ch->channel; 3563 3564 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3565 3566 bdev_io->internal.in_submit_request = true; 3567 bdev_submit_request(bdev, ch, bdev_io); 3568 bdev_io->internal.in_submit_request = false; 3569 } 3570 3571 void 3572 bdev_io_init(struct spdk_bdev_io *bdev_io, 3573 struct spdk_bdev *bdev, void *cb_arg, 3574 spdk_bdev_io_completion_cb cb) 3575 { 3576 bdev_io->bdev = bdev; 3577 bdev_io->internal.caller_ctx = cb_arg; 3578 bdev_io->internal.cb = cb; 3579 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3580 bdev_io->internal.in_submit_request = false; 3581 bdev_io->internal.buf = NULL; 3582 bdev_io->internal.io_submit_ch = NULL; 3583 bdev_io->internal.orig_iovs = NULL; 3584 bdev_io->internal.orig_iovcnt = 0; 3585 bdev_io->internal.orig_md_iov.iov_base = NULL; 3586 bdev_io->internal.error.nvme.cdw0 = 0; 3587 bdev_io->num_retries = 0; 3588 bdev_io->internal.get_buf_cb = NULL; 3589 bdev_io->internal.get_aux_buf_cb = NULL; 3590 bdev_io->internal.memory_domain = NULL; 3591 bdev_io->internal.memory_domain_ctx = NULL; 3592 bdev_io->internal.data_transfer_cpl = NULL; 3593 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3594 bdev_io->internal.accel_sequence = NULL; 3595 bdev_io->internal.has_accel_sequence = false; 3596 } 3597 3598 static bool 3599 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3600 { 3601 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3602 } 3603 3604 bool 3605 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3606 { 3607 bool supported; 3608 3609 supported = bdev_io_type_supported(bdev, io_type); 3610 3611 if (!supported) { 3612 switch (io_type) { 3613 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3614 /* The bdev layer will emulate write zeroes as long as write is supported. 
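 * The emulation path is believed to write out g_bdev_mgr.zero_buffer in chunks, so only plain
 * write support is required from the module.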
*/ 3615 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3616 break; 3617 default: 3618 break; 3619 } 3620 } 3621 3622 return supported; 3623 } 3624 3625 uint64_t 3626 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3627 { 3628 return bdev_io->internal.submit_tsc; 3629 } 3630 3631 int 3632 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3633 { 3634 if (bdev->fn_table->dump_info_json) { 3635 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3636 } 3637 3638 return 0; 3639 } 3640 3641 static void 3642 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3643 { 3644 uint32_t max_per_timeslice = 0; 3645 int i; 3646 3647 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3648 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3649 qos->rate_limits[i].max_per_timeslice = 0; 3650 continue; 3651 } 3652 3653 max_per_timeslice = qos->rate_limits[i].limit * 3654 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3655 3656 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3657 qos->rate_limits[i].min_per_timeslice); 3658 3659 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3660 } 3661 3662 bdev_qos_set_ops(qos); 3663 } 3664 3665 static int 3666 bdev_channel_poll_qos(void *arg) 3667 { 3668 struct spdk_bdev_qos *qos = arg; 3669 uint64_t now = spdk_get_ticks(); 3670 int i; 3671 3672 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3673 /* We received our callback earlier than expected - return 3674 * immediately and wait to do accounting until at least one 3675 * timeslice has actually expired. This should never happen 3676 * with a well-behaved timer implementation. 3677 */ 3678 return SPDK_POLLER_IDLE; 3679 } 3680 3681 /* Reset for next round of rate limiting */ 3682 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3683 /* We may have allowed the IOs or bytes to slightly overrun in the last 3684 * timeslice. remaining_this_timeslice is signed, so if it's negative 3685 * here, we'll account for the overrun so that the next timeslice will 3686 * be appropriately reduced. 
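 * For example, if a 1 MiB write is admitted when only 4 KiB of byte budget is left,
 * remaining_this_timeslice ends up around -1020 KiB and the refill below starts the next
 * timeslice with that much less budget.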
3687 */ 3688 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3689 qos->rate_limits[i].remaining_this_timeslice = 0; 3690 } 3691 } 3692 3693 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3694 qos->last_timeslice += qos->timeslice_size; 3695 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3696 qos->rate_limits[i].remaining_this_timeslice += 3697 qos->rate_limits[i].max_per_timeslice; 3698 } 3699 } 3700 3701 return bdev_qos_io_submit(qos->ch, qos); 3702 } 3703 3704 static void 3705 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3706 { 3707 struct spdk_bdev_shared_resource *shared_resource; 3708 struct lba_range *range; 3709 3710 bdev_free_io_stat(ch->stat); 3711 #ifdef SPDK_CONFIG_VTUNE 3712 bdev_free_io_stat(ch->prev_stat); 3713 #endif 3714 3715 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3716 range = TAILQ_FIRST(&ch->locked_ranges); 3717 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3718 free(range); 3719 } 3720 3721 spdk_put_io_channel(ch->channel); 3722 spdk_put_io_channel(ch->accel_channel); 3723 3724 shared_resource = ch->shared_resource; 3725 3726 assert(TAILQ_EMPTY(&ch->io_locked)); 3727 assert(TAILQ_EMPTY(&ch->io_submitted)); 3728 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3729 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3730 assert(ch->io_outstanding == 0); 3731 assert(shared_resource->ref > 0); 3732 shared_resource->ref--; 3733 if (shared_resource->ref == 0) { 3734 assert(shared_resource->io_outstanding == 0); 3735 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3736 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3737 spdk_poller_unregister(&shared_resource->nomem_poller); 3738 free(shared_resource); 3739 } 3740 } 3741 3742 static void 3743 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3744 { 3745 struct spdk_bdev_qos *qos = bdev->internal.qos; 3746 int i; 3747 3748 assert(spdk_spin_held(&bdev->internal.spinlock)); 3749 3750 /* Rate limiting on this bdev enabled */ 3751 if (qos) { 3752 if (qos->ch == NULL) { 3753 struct spdk_io_channel *io_ch; 3754 3755 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3756 bdev->name, spdk_get_thread()); 3757 3758 /* No qos channel has been selected, so set one up */ 3759 3760 /* Take another reference to ch */ 3761 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3762 assert(io_ch != NULL); 3763 qos->ch = ch; 3764 3765 qos->thread = spdk_io_channel_get_thread(io_ch); 3766 3767 TAILQ_INIT(&qos->queued); 3768 3769 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3770 if (bdev_qos_is_iops_rate_limit(i) == true) { 3771 qos->rate_limits[i].min_per_timeslice = 3772 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3773 } else { 3774 qos->rate_limits[i].min_per_timeslice = 3775 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3776 } 3777 3778 if (qos->rate_limits[i].limit == 0) { 3779 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3780 } 3781 } 3782 bdev_qos_update_max_quota_per_timeslice(qos); 3783 qos->timeslice_size = 3784 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3785 qos->last_timeslice = spdk_get_ticks(); 3786 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3787 qos, 3788 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3789 } 3790 3791 ch->flags |= BDEV_CH_QOS_ENABLED; 3792 } 3793 } 3794 3795 struct poll_timeout_ctx { 3796 struct spdk_bdev_desc *desc; 3797 uint64_t timeout_in_sec; 3798 spdk_bdev_io_timeout_cb cb_fn; 3799 void *cb_arg; 3800 }; 3801 
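/*
 * Illustrative arithmetic for the quota computed in
 * bdev_qos_update_max_quota_per_timeslice() above (the example limits are
 * hypothetical, not defaults):
 *
 *	rw_ios_per_sec limit = 10000, timeslice = SPDK_BDEV_QOS_TIMESLICE_IN_USEC (1000 usec)
 *	max_per_timeslice = 10000 * 1000 / 1000000 = 10 I/Os per timeslice
 *
 *	rw_mbytes_per_sec limit of 10 MB/s = 10485760 bytes/s
 *	max_per_timeslice = 10485760 * 1000 / 1000000 = 10485 bytes per timeslice
 *
 * Each result is clamped to at least min_per_timeslice, and
 * bdev_channel_poll_qos() adds max_per_timeslice back to
 * remaining_this_timeslice for every timeslice that has elapsed.
 */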
3802 static void 3803 bdev_desc_free(struct spdk_bdev_desc *desc) 3804 { 3805 spdk_spin_destroy(&desc->spinlock); 3806 free(desc->media_events_buffer); 3807 free(desc); 3808 } 3809 3810 static void 3811 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3812 { 3813 struct poll_timeout_ctx *ctx = _ctx; 3814 struct spdk_bdev_desc *desc = ctx->desc; 3815 3816 free(ctx); 3817 3818 spdk_spin_lock(&desc->spinlock); 3819 desc->refs--; 3820 if (desc->closed == true && desc->refs == 0) { 3821 spdk_spin_unlock(&desc->spinlock); 3822 bdev_desc_free(desc); 3823 return; 3824 } 3825 spdk_spin_unlock(&desc->spinlock); 3826 } 3827 3828 static void 3829 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3830 struct spdk_io_channel *io_ch, void *_ctx) 3831 { 3832 struct poll_timeout_ctx *ctx = _ctx; 3833 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3834 struct spdk_bdev_desc *desc = ctx->desc; 3835 struct spdk_bdev_io *bdev_io; 3836 uint64_t now; 3837 3838 spdk_spin_lock(&desc->spinlock); 3839 if (desc->closed == true) { 3840 spdk_spin_unlock(&desc->spinlock); 3841 spdk_bdev_for_each_channel_continue(i, -1); 3842 return; 3843 } 3844 spdk_spin_unlock(&desc->spinlock); 3845 3846 now = spdk_get_ticks(); 3847 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3848 /* Exclude any I/O that are generated via splitting. */ 3849 if (bdev_io->internal.cb == bdev_io_split_done) { 3850 continue; 3851 } 3852 3853 /* Once we find an I/O that has not timed out, we can immediately 3854 * exit the loop. 3855 */ 3856 if (now < (bdev_io->internal.submit_tsc + 3857 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3858 goto end; 3859 } 3860 3861 if (bdev_io->internal.desc == desc) { 3862 ctx->cb_fn(ctx->cb_arg, bdev_io); 3863 } 3864 } 3865 3866 end: 3867 spdk_bdev_for_each_channel_continue(i, 0); 3868 } 3869 3870 static int 3871 bdev_poll_timeout_io(void *arg) 3872 { 3873 struct spdk_bdev_desc *desc = arg; 3874 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3875 struct poll_timeout_ctx *ctx; 3876 3877 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3878 if (!ctx) { 3879 SPDK_ERRLOG("failed to allocate memory\n"); 3880 return SPDK_POLLER_BUSY; 3881 } 3882 ctx->desc = desc; 3883 ctx->cb_arg = desc->cb_arg; 3884 ctx->cb_fn = desc->cb_fn; 3885 ctx->timeout_in_sec = desc->timeout_in_sec; 3886 3887 /* Take a ref on the descriptor in case it gets closed while we are checking 3888 * all of the channels. 
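	 * The matching deref happens in bdev_channel_poll_timeout_io_done(), which
	 * also frees the descriptor if it was closed while the iteration was running.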
3889 */ 3890 spdk_spin_lock(&desc->spinlock); 3891 desc->refs++; 3892 spdk_spin_unlock(&desc->spinlock); 3893 3894 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3895 bdev_channel_poll_timeout_io_done); 3896 3897 return SPDK_POLLER_BUSY; 3898 } 3899 3900 int 3901 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3902 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3903 { 3904 assert(desc->thread == spdk_get_thread()); 3905 3906 spdk_poller_unregister(&desc->io_timeout_poller); 3907 3908 if (timeout_in_sec) { 3909 assert(cb_fn != NULL); 3910 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3911 desc, 3912 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3913 1000); 3914 if (desc->io_timeout_poller == NULL) { 3915 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3916 return -1; 3917 } 3918 } 3919 3920 desc->cb_fn = cb_fn; 3921 desc->cb_arg = cb_arg; 3922 desc->timeout_in_sec = timeout_in_sec; 3923 3924 return 0; 3925 } 3926 3927 static int 3928 bdev_channel_create(void *io_device, void *ctx_buf) 3929 { 3930 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3931 struct spdk_bdev_channel *ch = ctx_buf; 3932 struct spdk_io_channel *mgmt_io_ch; 3933 struct spdk_bdev_mgmt_channel *mgmt_ch; 3934 struct spdk_bdev_shared_resource *shared_resource; 3935 struct lba_range *range; 3936 3937 ch->bdev = bdev; 3938 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3939 if (!ch->channel) { 3940 return -1; 3941 } 3942 3943 ch->accel_channel = spdk_accel_get_io_channel(); 3944 if (!ch->accel_channel) { 3945 spdk_put_io_channel(ch->channel); 3946 return -1; 3947 } 3948 3949 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3950 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3951 3952 assert(ch->histogram == NULL); 3953 if (bdev->internal.histogram_enabled) { 3954 ch->histogram = spdk_histogram_data_alloc(); 3955 if (ch->histogram == NULL) { 3956 SPDK_ERRLOG("Could not allocate histogram\n"); 3957 } 3958 } 3959 3960 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3961 if (!mgmt_io_ch) { 3962 spdk_put_io_channel(ch->channel); 3963 spdk_put_io_channel(ch->accel_channel); 3964 return -1; 3965 } 3966 3967 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3968 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3969 if (shared_resource->shared_ch == ch->channel) { 3970 spdk_put_io_channel(mgmt_io_ch); 3971 shared_resource->ref++; 3972 break; 3973 } 3974 } 3975 3976 if (shared_resource == NULL) { 3977 shared_resource = calloc(1, sizeof(*shared_resource)); 3978 if (shared_resource == NULL) { 3979 spdk_put_io_channel(ch->channel); 3980 spdk_put_io_channel(ch->accel_channel); 3981 spdk_put_io_channel(mgmt_io_ch); 3982 return -1; 3983 } 3984 3985 shared_resource->mgmt_ch = mgmt_ch; 3986 shared_resource->io_outstanding = 0; 3987 TAILQ_INIT(&shared_resource->nomem_io); 3988 shared_resource->nomem_threshold = 0; 3989 shared_resource->shared_ch = ch->channel; 3990 shared_resource->ref = 1; 3991 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3992 } 3993 3994 ch->io_outstanding = 0; 3995 TAILQ_INIT(&ch->queued_resets); 3996 TAILQ_INIT(&ch->locked_ranges); 3997 ch->flags = 0; 3998 ch->shared_resource = shared_resource; 3999 4000 TAILQ_INIT(&ch->io_submitted); 4001 TAILQ_INIT(&ch->io_locked); 4002 TAILQ_INIT(&ch->io_accel_exec); 4003 TAILQ_INIT(&ch->io_memory_domain); 4004 4005 ch->stat = bdev_alloc_io_stat(false); 4006 if (ch->stat == NULL) { 4007 
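		/* Per-channel stat allocation failed; release the io channels and the
		 * shared resource reference taken above before failing channel creation.
		 */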
bdev_channel_destroy_resource(ch); 4008 return -1; 4009 } 4010 4011 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4012 4013 #ifdef SPDK_CONFIG_VTUNE 4014 { 4015 char *name; 4016 __itt_init_ittlib(NULL, 0); 4017 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4018 if (!name) { 4019 bdev_channel_destroy_resource(ch); 4020 return -1; 4021 } 4022 ch->handle = __itt_string_handle_create(name); 4023 free(name); 4024 ch->start_tsc = spdk_get_ticks(); 4025 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4026 ch->prev_stat = bdev_alloc_io_stat(false); 4027 if (ch->prev_stat == NULL) { 4028 bdev_channel_destroy_resource(ch); 4029 return -1; 4030 } 4031 } 4032 #endif 4033 4034 spdk_spin_lock(&bdev->internal.spinlock); 4035 bdev_enable_qos(bdev, ch); 4036 4037 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4038 struct lba_range *new_range; 4039 4040 new_range = calloc(1, sizeof(*new_range)); 4041 if (new_range == NULL) { 4042 spdk_spin_unlock(&bdev->internal.spinlock); 4043 bdev_channel_destroy_resource(ch); 4044 return -1; 4045 } 4046 new_range->length = range->length; 4047 new_range->offset = range->offset; 4048 new_range->locked_ctx = range->locked_ctx; 4049 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4050 } 4051 4052 spdk_spin_unlock(&bdev->internal.spinlock); 4053 4054 return 0; 4055 } 4056 4057 static int 4058 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4059 void *cb_ctx) 4060 { 4061 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4062 struct spdk_bdev_io *bdev_io; 4063 uint64_t buf_len; 4064 4065 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4066 if (bdev_io->internal.ch == bdev_ch) { 4067 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4068 spdk_iobuf_entry_abort(ch, entry, buf_len); 4069 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4070 } 4071 4072 return 0; 4073 } 4074 4075 /* 4076 * Abort I/O that are waiting on a data buffer. 4077 */ 4078 static void 4079 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4080 { 4081 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4082 bdev_abort_all_buf_io_cb, ch); 4083 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4084 bdev_abort_all_buf_io_cb, ch); 4085 } 4086 4087 /* 4088 * Abort I/O that are queued waiting for submission. These types of I/O are 4089 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4090 */ 4091 static void 4092 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4093 { 4094 struct spdk_bdev_io *bdev_io, *tmp; 4095 4096 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4097 if (bdev_io->internal.ch == ch) { 4098 TAILQ_REMOVE(queue, bdev_io, internal.link); 4099 /* 4100 * spdk_bdev_io_complete() assumes that the completed I/O had 4101 * been submitted to the bdev module. Since in this case it 4102 * hadn't, bump io_outstanding to account for the decrement 4103 * that spdk_bdev_io_complete() will do. 
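			 * Reset I/O are excluded from this adjustment because the completion
			 * path accounts for them separately and does not decrement
			 * io_outstanding for resets.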
4104 */ 4105 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4106 bdev_io_increment_outstanding(ch, ch->shared_resource); 4107 } 4108 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4109 } 4110 } 4111 } 4112 4113 static bool 4114 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4115 { 4116 struct spdk_bdev_io *bdev_io; 4117 4118 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4119 if (bdev_io == bio_to_abort) { 4120 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4121 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4122 return true; 4123 } 4124 } 4125 4126 return false; 4127 } 4128 4129 static int 4130 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4131 { 4132 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4133 uint64_t buf_len; 4134 4135 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4136 if (bdev_io == bio_to_abort) { 4137 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4138 spdk_iobuf_entry_abort(ch, entry, buf_len); 4139 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4140 return 1; 4141 } 4142 4143 return 0; 4144 } 4145 4146 static bool 4147 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4148 { 4149 int rc; 4150 4151 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4152 bdev_abort_buf_io_cb, bio_to_abort); 4153 if (rc == 1) { 4154 return true; 4155 } 4156 4157 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4158 bdev_abort_buf_io_cb, bio_to_abort); 4159 return rc == 1; 4160 } 4161 4162 static void 4163 bdev_qos_channel_destroy(void *cb_arg) 4164 { 4165 struct spdk_bdev_qos *qos = cb_arg; 4166 4167 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4168 spdk_poller_unregister(&qos->poller); 4169 4170 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4171 4172 free(qos); 4173 } 4174 4175 static int 4176 bdev_qos_destroy(struct spdk_bdev *bdev) 4177 { 4178 int i; 4179 4180 /* 4181 * Cleanly shutting down the QoS poller is tricky, because 4182 * during the asynchronous operation the user could open 4183 * a new descriptor and create a new channel, spawning 4184 * a new QoS poller. 4185 * 4186 * The strategy is to create a new QoS structure here and swap it 4187 * in. The shutdown path then continues to refer to the old one 4188 * until it completes and then releases it. 4189 */ 4190 struct spdk_bdev_qos *new_qos, *old_qos; 4191 4192 old_qos = bdev->internal.qos; 4193 4194 new_qos = calloc(1, sizeof(*new_qos)); 4195 if (!new_qos) { 4196 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4197 return -ENOMEM; 4198 } 4199 4200 /* Copy the old QoS data into the newly allocated structure */ 4201 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4202 4203 /* Zero out the key parts of the QoS structure */ 4204 new_qos->ch = NULL; 4205 new_qos->thread = NULL; 4206 new_qos->poller = NULL; 4207 TAILQ_INIT(&new_qos->queued); 4208 /* 4209 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4210 * It will be used later for the new QoS structure. 
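	 * Only the per-timeslice counters are cleared below; when the next channel
	 * selects itself as the QoS channel, bdev_enable_qos() recomputes
	 * min_per_timeslice and max_per_timeslice from the preserved limits.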
4211 */ 4212 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4213 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4214 new_qos->rate_limits[i].min_per_timeslice = 0; 4215 new_qos->rate_limits[i].max_per_timeslice = 0; 4216 } 4217 4218 bdev->internal.qos = new_qos; 4219 4220 if (old_qos->thread == NULL) { 4221 free(old_qos); 4222 } else { 4223 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4224 } 4225 4226 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4227 * been destroyed yet. The destruction path will end up waiting for the final 4228 * channel to be put before it releases resources. */ 4229 4230 return 0; 4231 } 4232 4233 void 4234 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4235 { 4236 total->bytes_read += add->bytes_read; 4237 total->num_read_ops += add->num_read_ops; 4238 total->bytes_written += add->bytes_written; 4239 total->num_write_ops += add->num_write_ops; 4240 total->bytes_unmapped += add->bytes_unmapped; 4241 total->num_unmap_ops += add->num_unmap_ops; 4242 total->bytes_copied += add->bytes_copied; 4243 total->num_copy_ops += add->num_copy_ops; 4244 total->read_latency_ticks += add->read_latency_ticks; 4245 total->write_latency_ticks += add->write_latency_ticks; 4246 total->unmap_latency_ticks += add->unmap_latency_ticks; 4247 total->copy_latency_ticks += add->copy_latency_ticks; 4248 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4249 total->max_read_latency_ticks = add->max_read_latency_ticks; 4250 } 4251 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4252 total->min_read_latency_ticks = add->min_read_latency_ticks; 4253 } 4254 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4255 total->max_write_latency_ticks = add->max_write_latency_ticks; 4256 } 4257 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4258 total->min_write_latency_ticks = add->min_write_latency_ticks; 4259 } 4260 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4261 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4262 } 4263 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4264 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4265 } 4266 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4267 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4268 } 4269 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4270 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4271 } 4272 } 4273 4274 static void 4275 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4276 { 4277 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4278 4279 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4280 memcpy(to_stat->io_error, from_stat->io_error, 4281 sizeof(struct spdk_bdev_io_error_stat)); 4282 } 4283 } 4284 4285 void 4286 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4287 { 4288 stat->max_read_latency_ticks = 0; 4289 stat->min_read_latency_ticks = UINT64_MAX; 4290 stat->max_write_latency_ticks = 0; 4291 stat->min_write_latency_ticks = UINT64_MAX; 4292 stat->max_unmap_latency_ticks = 0; 4293 stat->min_unmap_latency_ticks = UINT64_MAX; 4294 stat->max_copy_latency_ticks = 0; 4295 stat->min_copy_latency_ticks = UINT64_MAX; 4296 4297 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4298 return; 4299 } 
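	/* SPDK_BDEV_RESET_STAT_ALL additionally clears the cumulative byte/op
	 * counters, the latency totals, and any per-status error counts below;
	 * other modes stop after resetting the min/max latency trackers above.
	 */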
4300 4301 stat->bytes_read = 0; 4302 stat->num_read_ops = 0; 4303 stat->bytes_written = 0; 4304 stat->num_write_ops = 0; 4305 stat->bytes_unmapped = 0; 4306 stat->num_unmap_ops = 0; 4307 stat->bytes_copied = 0; 4308 stat->num_copy_ops = 0; 4309 stat->read_latency_ticks = 0; 4310 stat->write_latency_ticks = 0; 4311 stat->unmap_latency_ticks = 0; 4312 stat->copy_latency_ticks = 0; 4313 4314 if (stat->io_error != NULL) { 4315 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4316 } 4317 } 4318 4319 struct spdk_bdev_io_stat * 4320 bdev_alloc_io_stat(bool io_error_stat) 4321 { 4322 struct spdk_bdev_io_stat *stat; 4323 4324 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4325 if (stat == NULL) { 4326 return NULL; 4327 } 4328 4329 if (io_error_stat) { 4330 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4331 if (stat->io_error == NULL) { 4332 free(stat); 4333 return NULL; 4334 } 4335 } else { 4336 stat->io_error = NULL; 4337 } 4338 4339 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4340 4341 return stat; 4342 } 4343 4344 void 4345 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4346 { 4347 if (stat != NULL) { 4348 free(stat->io_error); 4349 free(stat); 4350 } 4351 } 4352 4353 void 4354 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4355 { 4356 int i; 4357 4358 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4359 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4360 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4361 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4362 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4363 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4364 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4365 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4366 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4367 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4368 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4369 stat->min_read_latency_ticks != UINT64_MAX ? 4370 stat->min_read_latency_ticks : 0); 4371 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4372 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4373 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4374 stat->min_write_latency_ticks != UINT64_MAX ? 4375 stat->min_write_latency_ticks : 0); 4376 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4377 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4378 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4379 stat->min_unmap_latency_ticks != UINT64_MAX ? 4380 stat->min_unmap_latency_ticks : 0); 4381 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4382 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4383 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4384 stat->min_copy_latency_ticks != UINT64_MAX ? 
4385 stat->min_copy_latency_ticks : 0); 4386 4387 if (stat->io_error != NULL) { 4388 spdk_json_write_named_object_begin(w, "io_error"); 4389 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4390 if (stat->io_error->error_status[i] != 0) { 4391 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4392 stat->io_error->error_status[i]); 4393 } 4394 } 4395 spdk_json_write_object_end(w); 4396 } 4397 } 4398 4399 static void 4400 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4401 { 4402 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4403 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4404 4405 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4406 bdev_abort_all_buf_io(mgmt_ch, ch); 4407 } 4408 4409 static void 4410 bdev_channel_destroy(void *io_device, void *ctx_buf) 4411 { 4412 struct spdk_bdev_channel *ch = ctx_buf; 4413 4414 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4415 spdk_get_thread()); 4416 4417 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 4418 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4419 4420 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4421 spdk_spin_lock(&ch->bdev->internal.spinlock); 4422 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4423 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4424 4425 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4426 4427 bdev_channel_abort_queued_ios(ch); 4428 4429 if (ch->histogram) { 4430 spdk_histogram_data_free(ch->histogram); 4431 } 4432 4433 bdev_channel_destroy_resource(ch); 4434 } 4435 4436 /* 4437 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4438 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
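 * In other words, the insertion and the duplicate check are a single operation
 * performed under g_bdev_mgr.spinlock, so a racing registration cannot slip in
 * between them.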
4439 */ 4440 static int 4441 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4442 { 4443 struct spdk_bdev_name *tmp; 4444 4445 bdev_name->name = strdup(name); 4446 if (bdev_name->name == NULL) { 4447 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4448 return -ENOMEM; 4449 } 4450 4451 bdev_name->bdev = bdev; 4452 4453 spdk_spin_lock(&g_bdev_mgr.spinlock); 4454 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4455 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4456 4457 if (tmp != NULL) { 4458 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4459 free(bdev_name->name); 4460 return -EEXIST; 4461 } 4462 4463 return 0; 4464 } 4465 4466 static void 4467 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4468 { 4469 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4470 free(bdev_name->name); 4471 } 4472 4473 static void 4474 bdev_name_del(struct spdk_bdev_name *bdev_name) 4475 { 4476 spdk_spin_lock(&g_bdev_mgr.spinlock); 4477 bdev_name_del_unsafe(bdev_name); 4478 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4479 } 4480 4481 int 4482 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4483 { 4484 struct spdk_bdev_alias *tmp; 4485 int ret; 4486 4487 if (alias == NULL) { 4488 SPDK_ERRLOG("Empty alias passed\n"); 4489 return -EINVAL; 4490 } 4491 4492 tmp = calloc(1, sizeof(*tmp)); 4493 if (tmp == NULL) { 4494 SPDK_ERRLOG("Unable to allocate alias\n"); 4495 return -ENOMEM; 4496 } 4497 4498 ret = bdev_name_add(&tmp->alias, bdev, alias); 4499 if (ret != 0) { 4500 free(tmp); 4501 return ret; 4502 } 4503 4504 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4505 4506 return 0; 4507 } 4508 4509 static int 4510 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4511 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4512 { 4513 struct spdk_bdev_alias *tmp; 4514 4515 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4516 if (strcmp(alias, tmp->alias.name) == 0) { 4517 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4518 alias_del_fn(&tmp->alias); 4519 free(tmp); 4520 return 0; 4521 } 4522 } 4523 4524 return -ENOENT; 4525 } 4526 4527 int 4528 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4529 { 4530 int rc; 4531 4532 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4533 if (rc == -ENOENT) { 4534 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4535 } 4536 4537 return rc; 4538 } 4539 4540 void 4541 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4542 { 4543 struct spdk_bdev_alias *p, *tmp; 4544 4545 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4546 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4547 bdev_name_del(&p->alias); 4548 free(p); 4549 } 4550 } 4551 4552 struct spdk_io_channel * 4553 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4554 { 4555 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4556 } 4557 4558 void * 4559 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4560 { 4561 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4562 void *ctx = NULL; 4563 4564 if (bdev->fn_table->get_module_ctx) { 4565 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4566 } 4567 4568 return ctx; 4569 } 4570 4571 const char * 4572 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4573 { 4574 return bdev->module->name; 4575 } 4576 4577 const char * 4578 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4579 { 4580 return bdev->name; 4581 } 4582 4583 const char * 4584 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4585 { 4586 return bdev->product_name; 4587 } 4588 4589 
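/*
 * Illustrative usage sketch (not part of any driver flow in this file): how a
 * caller might attach an extra name to an already-registered bdev and remove
 * it again. The bdev pointer and the "cache0" alias string are hypothetical.
 *
 *	rc = spdk_bdev_alias_add(bdev, "cache0");
 *	if (rc == -EEXIST) {
 *		SPDK_ERRLOG("name already taken by another bdev or alias\n");
 *	} else if (rc == 0) {
 *		...
 *		rc = spdk_bdev_alias_del(bdev, "cache0");
 *	}
 */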
const struct spdk_bdev_aliases_list * 4590 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4591 { 4592 return &bdev->aliases; 4593 } 4594 4595 uint32_t 4596 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4597 { 4598 return bdev->blocklen; 4599 } 4600 4601 uint32_t 4602 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4603 { 4604 return bdev->write_unit_size; 4605 } 4606 4607 uint64_t 4608 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4609 { 4610 return bdev->blockcnt; 4611 } 4612 4613 const char * 4614 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4615 { 4616 return qos_rpc_type[type]; 4617 } 4618 4619 void 4620 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4621 { 4622 int i; 4623 4624 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4625 4626 spdk_spin_lock(&bdev->internal.spinlock); 4627 if (bdev->internal.qos) { 4628 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4629 if (bdev->internal.qos->rate_limits[i].limit != 4630 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4631 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4632 if (bdev_qos_is_iops_rate_limit(i) == false) { 4633 /* Change from Byte to Megabyte which is user visible. */ 4634 limits[i] = limits[i] / 1024 / 1024; 4635 } 4636 } 4637 } 4638 } 4639 spdk_spin_unlock(&bdev->internal.spinlock); 4640 } 4641 4642 size_t 4643 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4644 { 4645 return 1 << bdev->required_alignment; 4646 } 4647 4648 uint32_t 4649 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4650 { 4651 return bdev->optimal_io_boundary; 4652 } 4653 4654 bool 4655 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4656 { 4657 return bdev->write_cache; 4658 } 4659 4660 const struct spdk_uuid * 4661 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4662 { 4663 return &bdev->uuid; 4664 } 4665 4666 uint16_t 4667 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4668 { 4669 return bdev->acwu; 4670 } 4671 4672 uint32_t 4673 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4674 { 4675 return bdev->md_len; 4676 } 4677 4678 bool 4679 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4680 { 4681 return (bdev->md_len != 0) && bdev->md_interleave; 4682 } 4683 4684 bool 4685 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4686 { 4687 return (bdev->md_len != 0) && !bdev->md_interleave; 4688 } 4689 4690 bool 4691 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4692 { 4693 return bdev->zoned; 4694 } 4695 4696 uint32_t 4697 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4698 { 4699 if (spdk_bdev_is_md_interleaved(bdev)) { 4700 return bdev->blocklen - bdev->md_len; 4701 } else { 4702 return bdev->blocklen; 4703 } 4704 } 4705 4706 uint32_t 4707 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4708 { 4709 return bdev->phys_blocklen; 4710 } 4711 4712 static uint32_t 4713 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4714 { 4715 if (!spdk_bdev_is_md_interleaved(bdev)) { 4716 return bdev->blocklen + bdev->md_len; 4717 } else { 4718 return bdev->blocklen; 4719 } 4720 } 4721 4722 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4723 typedef enum spdk_dif_type spdk_dif_type_t; 4724 4725 spdk_dif_type_t 4726 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4727 { 4728 if (bdev->md_len != 0) { 4729 return bdev->dif_type; 4730 } else { 4731 return SPDK_DIF_DISABLE; 4732 } 4733 } 4734 4735 bool 4736 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4737 { 4738 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4739 return bdev->dif_is_head_of_md; 4740 } else { 4741 return false; 4742 } 4743 } 4744 4745 bool 4746 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4747 enum spdk_dif_check_type check_type) 4748 { 4749 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4750 return false; 4751 } 4752 4753 switch (check_type) { 4754 case SPDK_DIF_CHECK_TYPE_REFTAG: 4755 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4756 case SPDK_DIF_CHECK_TYPE_APPTAG: 4757 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4758 case SPDK_DIF_CHECK_TYPE_GUARD: 4759 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4760 default: 4761 return false; 4762 } 4763 } 4764 4765 static uint32_t 4766 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4767 { 4768 uint64_t aligned_length, max_write_blocks; 4769 4770 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4771 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4772 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4773 4774 return max_write_blocks; 4775 } 4776 4777 uint32_t 4778 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4779 { 4780 return bdev->max_copy; 4781 } 4782 4783 uint64_t 4784 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4785 { 4786 return bdev->internal.measured_queue_depth; 4787 } 4788 4789 uint64_t 4790 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4791 { 4792 return bdev->internal.period; 4793 } 4794 4795 uint64_t 4796 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4797 { 4798 return bdev->internal.weighted_io_time; 4799 } 4800 4801 uint64_t 4802 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4803 { 4804 return bdev->internal.io_time; 4805 } 4806 4807 static void bdev_update_qd_sampling_period(void *ctx); 4808 4809 static void 4810 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4811 { 4812 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4813 4814 if (bdev->internal.measured_queue_depth) { 4815 bdev->internal.io_time += bdev->internal.period; 4816 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4817 } 4818 4819 bdev->internal.qd_poll_in_progress = false; 4820 4821 bdev_update_qd_sampling_period(bdev); 4822 } 4823 4824 static void 4825 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4826 struct spdk_io_channel *io_ch, void *_ctx) 4827 { 4828 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4829 4830 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4831 spdk_bdev_for_each_channel_continue(i, 0); 4832 } 4833 4834 static int 4835 bdev_calculate_measured_queue_depth(void *ctx) 4836 { 4837 struct spdk_bdev *bdev = ctx; 4838 4839 bdev->internal.qd_poll_in_progress = true; 4840 bdev->internal.temporary_queue_depth = 0; 4841 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4842 return SPDK_POLLER_BUSY; 4843 } 4844 4845 static void 4846 bdev_update_qd_sampling_period(void *ctx) 4847 { 4848 
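	/* Runs on the thread that owns internal.qd_desc (see
	 * spdk_bdev_set_qd_sampling_period() below): the switch is deferred while a
	 * queue-depth poll is in flight; otherwise the poller is re-registered with
	 * the new period, or the descriptor is closed when the new period is 0.
	 */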
struct spdk_bdev *bdev = ctx; 4849 4850 if (bdev->internal.period == bdev->internal.new_period) { 4851 return; 4852 } 4853 4854 if (bdev->internal.qd_poll_in_progress) { 4855 return; 4856 } 4857 4858 bdev->internal.period = bdev->internal.new_period; 4859 4860 spdk_poller_unregister(&bdev->internal.qd_poller); 4861 if (bdev->internal.period != 0) { 4862 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4863 bdev, bdev->internal.period); 4864 } else { 4865 spdk_bdev_close(bdev->internal.qd_desc); 4866 bdev->internal.qd_desc = NULL; 4867 } 4868 } 4869 4870 static void 4871 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4872 { 4873 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4874 } 4875 4876 void 4877 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4878 { 4879 int rc; 4880 4881 if (bdev->internal.new_period == period) { 4882 return; 4883 } 4884 4885 bdev->internal.new_period = period; 4886 4887 if (bdev->internal.qd_desc != NULL) { 4888 assert(bdev->internal.period != 0); 4889 4890 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4891 bdev_update_qd_sampling_period, bdev); 4892 return; 4893 } 4894 4895 assert(bdev->internal.period == 0); 4896 4897 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4898 NULL, &bdev->internal.qd_desc); 4899 if (rc != 0) { 4900 return; 4901 } 4902 4903 bdev->internal.period = period; 4904 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4905 bdev, period); 4906 } 4907 4908 struct bdev_get_current_qd_ctx { 4909 uint64_t current_qd; 4910 spdk_bdev_get_current_qd_cb cb_fn; 4911 void *cb_arg; 4912 }; 4913 4914 static void 4915 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4916 { 4917 struct bdev_get_current_qd_ctx *ctx = _ctx; 4918 4919 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4920 4921 free(ctx); 4922 } 4923 4924 static void 4925 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4926 struct spdk_io_channel *io_ch, void *_ctx) 4927 { 4928 struct bdev_get_current_qd_ctx *ctx = _ctx; 4929 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4930 4931 ctx->current_qd += bdev_ch->io_outstanding; 4932 4933 spdk_bdev_for_each_channel_continue(i, 0); 4934 } 4935 4936 void 4937 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4938 void *cb_arg) 4939 { 4940 struct bdev_get_current_qd_ctx *ctx; 4941 4942 assert(cb_fn != NULL); 4943 4944 ctx = calloc(1, sizeof(*ctx)); 4945 if (ctx == NULL) { 4946 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4947 return; 4948 } 4949 4950 ctx->cb_fn = cb_fn; 4951 ctx->cb_arg = cb_arg; 4952 4953 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4954 } 4955 4956 static void 4957 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4958 { 4959 assert(desc->thread == spdk_get_thread()); 4960 4961 spdk_spin_lock(&desc->spinlock); 4962 desc->refs--; 4963 if (!desc->closed) { 4964 spdk_spin_unlock(&desc->spinlock); 4965 desc->callback.event_fn(type, 4966 desc->bdev, 4967 desc->callback.ctx); 4968 return; 4969 } else if (desc->refs == 0) { 4970 /* This descriptor was closed after this event_notify message was sent. 4971 * spdk_bdev_close() could not free the descriptor since this message was 4972 * in flight, so we free it now using bdev_desc_free(). 
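		 * The reference dropped at the top of this function is the one taken in
		 * event_notify() before the message was sent, so refs reaching zero here
		 * means nothing else still holds this descriptor.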
4973 */ 4974 spdk_spin_unlock(&desc->spinlock); 4975 bdev_desc_free(desc); 4976 return; 4977 } 4978 spdk_spin_unlock(&desc->spinlock); 4979 } 4980 4981 static void 4982 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 4983 { 4984 spdk_spin_lock(&desc->spinlock); 4985 desc->refs++; 4986 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 4987 spdk_spin_unlock(&desc->spinlock); 4988 } 4989 4990 static void 4991 _resize_notify(void *ctx) 4992 { 4993 struct spdk_bdev_desc *desc = ctx; 4994 4995 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 4996 } 4997 4998 int 4999 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5000 { 5001 struct spdk_bdev_desc *desc; 5002 int ret; 5003 5004 if (size == bdev->blockcnt) { 5005 return 0; 5006 } 5007 5008 spdk_spin_lock(&bdev->internal.spinlock); 5009 5010 /* bdev has open descriptors */ 5011 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5012 bdev->blockcnt > size) { 5013 ret = -EBUSY; 5014 } else { 5015 bdev->blockcnt = size; 5016 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5017 event_notify(desc, _resize_notify); 5018 } 5019 ret = 0; 5020 } 5021 5022 spdk_spin_unlock(&bdev->internal.spinlock); 5023 5024 return ret; 5025 } 5026 5027 /* 5028 * Convert I/O offset and length from bytes to blocks. 5029 * 5030 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5031 */ 5032 static uint64_t 5033 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 5034 uint64_t num_bytes, uint64_t *num_blocks) 5035 { 5036 uint32_t block_size = bdev->blocklen; 5037 uint8_t shift_cnt; 5038 5039 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 5040 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5041 shift_cnt = spdk_u32log2(block_size); 5042 *offset_blocks = offset_bytes >> shift_cnt; 5043 *num_blocks = num_bytes >> shift_cnt; 5044 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5045 (num_bytes - (*num_blocks << shift_cnt)); 5046 } else { 5047 *offset_blocks = offset_bytes / block_size; 5048 *num_blocks = num_bytes / block_size; 5049 return (offset_bytes % block_size) | (num_bytes % block_size); 5050 } 5051 } 5052 5053 static bool 5054 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5055 { 5056 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5057 * has been an overflow and hence the offset has been wrapped around */ 5058 if (offset_blocks + num_blocks < offset_blocks) { 5059 return false; 5060 } 5061 5062 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5063 if (offset_blocks + num_blocks > bdev->blockcnt) { 5064 return false; 5065 } 5066 5067 return true; 5068 } 5069 5070 static void 5071 bdev_seek_complete_cb(void *ctx) 5072 { 5073 struct spdk_bdev_io *bdev_io = ctx; 5074 5075 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5076 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5077 } 5078 5079 static int 5080 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5081 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5082 spdk_bdev_io_completion_cb cb, void *cb_arg) 5083 { 5084 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5085 struct spdk_bdev_io *bdev_io; 5086 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5087 5088 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == 
SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5089 5090 /* Check if offset_blocks is valid looking at the validity of one block */ 5091 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5092 return -EINVAL; 5093 } 5094 5095 bdev_io = bdev_channel_get_io(channel); 5096 if (!bdev_io) { 5097 return -ENOMEM; 5098 } 5099 5100 bdev_io->internal.ch = channel; 5101 bdev_io->internal.desc = desc; 5102 bdev_io->type = io_type; 5103 bdev_io->u.bdev.offset_blocks = offset_blocks; 5104 bdev_io->u.bdev.memory_domain = NULL; 5105 bdev_io->u.bdev.memory_domain_ctx = NULL; 5106 bdev_io->u.bdev.accel_sequence = NULL; 5107 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5108 5109 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5110 /* In case bdev doesn't support seek to next data/hole offset, 5111 * it is assumed that only data and no holes are present */ 5112 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5113 bdev_io->u.bdev.seek.offset = offset_blocks; 5114 } else { 5115 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5116 } 5117 5118 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5119 return 0; 5120 } 5121 5122 bdev_io_submit(bdev_io); 5123 return 0; 5124 } 5125 5126 int 5127 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5128 uint64_t offset_blocks, 5129 spdk_bdev_io_completion_cb cb, void *cb_arg) 5130 { 5131 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5132 } 5133 5134 int 5135 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5136 uint64_t offset_blocks, 5137 spdk_bdev_io_completion_cb cb, void *cb_arg) 5138 { 5139 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5140 } 5141 5142 uint64_t 5143 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 5144 { 5145 return bdev_io->u.bdev.seek.offset; 5146 } 5147 5148 static int 5149 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5150 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5151 spdk_bdev_io_completion_cb cb, void *cb_arg) 5152 { 5153 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5154 struct spdk_bdev_io *bdev_io; 5155 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5156 5157 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5158 return -EINVAL; 5159 } 5160 5161 bdev_io = bdev_channel_get_io(channel); 5162 if (!bdev_io) { 5163 return -ENOMEM; 5164 } 5165 5166 bdev_io->internal.ch = channel; 5167 bdev_io->internal.desc = desc; 5168 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5169 bdev_io->u.bdev.iovs = &bdev_io->iov; 5170 bdev_io->u.bdev.iovs[0].iov_base = buf; 5171 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5172 bdev_io->u.bdev.iovcnt = 1; 5173 bdev_io->u.bdev.md_buf = md_buf; 5174 bdev_io->u.bdev.num_blocks = num_blocks; 5175 bdev_io->u.bdev.offset_blocks = offset_blocks; 5176 bdev_io->u.bdev.memory_domain = NULL; 5177 bdev_io->u.bdev.memory_domain_ctx = NULL; 5178 bdev_io->u.bdev.accel_sequence = NULL; 5179 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5180 5181 bdev_io_submit(bdev_io); 5182 return 0; 5183 } 5184 5185 int 5186 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5187 void *buf, uint64_t offset, uint64_t nbytes, 5188 spdk_bdev_io_completion_cb cb, void *cb_arg) 5189 { 5190 uint64_t offset_blocks, num_blocks; 5191 5192 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5193 nbytes, &num_blocks) != 0) { 5194 return -EINVAL; 5195 } 5196 5197 
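	/* Illustrative arithmetic for the conversion above, assuming a 512-byte
	 * blocklen: offset 4096 and nbytes 8192 map to offset_blocks 8 and
	 * num_blocks 16, while nbytes 1000 leaves a remainder, so
	 * bdev_bytes_to_blocks() returns non-zero and the call fails with -EINVAL.
	 */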
return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5198 } 5199 5200 int 5201 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5202 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5203 spdk_bdev_io_completion_cb cb, void *cb_arg) 5204 { 5205 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5206 } 5207 5208 int 5209 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5210 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5211 spdk_bdev_io_completion_cb cb, void *cb_arg) 5212 { 5213 struct iovec iov = { 5214 .iov_base = buf, 5215 }; 5216 5217 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5218 return -EINVAL; 5219 } 5220 5221 if (md_buf && !_is_buf_allocated(&iov)) { 5222 return -EINVAL; 5223 } 5224 5225 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5226 cb, cb_arg); 5227 } 5228 5229 int 5230 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5231 struct iovec *iov, int iovcnt, 5232 uint64_t offset, uint64_t nbytes, 5233 spdk_bdev_io_completion_cb cb, void *cb_arg) 5234 { 5235 uint64_t offset_blocks, num_blocks; 5236 5237 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5238 nbytes, &num_blocks) != 0) { 5239 return -EINVAL; 5240 } 5241 5242 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5243 } 5244 5245 static int 5246 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5247 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5248 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5249 struct spdk_accel_sequence *seq, 5250 spdk_bdev_io_completion_cb cb, void *cb_arg) 5251 { 5252 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5253 struct spdk_bdev_io *bdev_io; 5254 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5255 5256 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5257 return -EINVAL; 5258 } 5259 5260 bdev_io = bdev_channel_get_io(channel); 5261 if (!bdev_io) { 5262 return -ENOMEM; 5263 } 5264 5265 bdev_io->internal.ch = channel; 5266 bdev_io->internal.desc = desc; 5267 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5268 bdev_io->u.bdev.iovs = iov; 5269 bdev_io->u.bdev.iovcnt = iovcnt; 5270 bdev_io->u.bdev.md_buf = md_buf; 5271 bdev_io->u.bdev.num_blocks = num_blocks; 5272 bdev_io->u.bdev.offset_blocks = offset_blocks; 5273 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5274 bdev_io->internal.memory_domain = domain; 5275 bdev_io->internal.memory_domain_ctx = domain_ctx; 5276 bdev_io->internal.accel_sequence = seq; 5277 bdev_io->internal.has_accel_sequence = seq != NULL; 5278 bdev_io->u.bdev.memory_domain = domain; 5279 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5280 bdev_io->u.bdev.accel_sequence = seq; 5281 5282 _bdev_io_submit_ext(desc, bdev_io); 5283 5284 return 0; 5285 } 5286 5287 int 5288 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5289 struct iovec *iov, int iovcnt, 5290 uint64_t offset_blocks, uint64_t num_blocks, 5291 spdk_bdev_io_completion_cb cb, void *cb_arg) 5292 { 5293 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5294 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5295 } 5296 5297 int 5298 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5299 struct 
iovec *iov, int iovcnt, void *md_buf, 5300 uint64_t offset_blocks, uint64_t num_blocks, 5301 spdk_bdev_io_completion_cb cb, void *cb_arg) 5302 { 5303 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5304 return -EINVAL; 5305 } 5306 5307 if (md_buf && !_is_buf_allocated(iov)) { 5308 return -EINVAL; 5309 } 5310 5311 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5312 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5313 } 5314 5315 static inline bool 5316 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5317 { 5318 /* 5319 * We check if opts size is at least of size when we first introduced 5320 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5321 * are not checked internal. 5322 */ 5323 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5324 sizeof(opts->metadata) && 5325 opts->size <= sizeof(*opts) && 5326 /* When memory domain is used, the user must provide data buffers */ 5327 (!opts->memory_domain || (iov && iov[0].iov_base)); 5328 } 5329 5330 int 5331 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5332 struct iovec *iov, int iovcnt, 5333 uint64_t offset_blocks, uint64_t num_blocks, 5334 spdk_bdev_io_completion_cb cb, void *cb_arg, 5335 struct spdk_bdev_ext_io_opts *opts) 5336 { 5337 void *md = NULL; 5338 5339 if (opts) { 5340 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5341 return -EINVAL; 5342 } 5343 md = opts->metadata; 5344 } 5345 5346 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5347 return -EINVAL; 5348 } 5349 5350 if (md && !_is_buf_allocated(iov)) { 5351 return -EINVAL; 5352 } 5353 5354 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5355 num_blocks, 5356 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5357 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5358 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5359 cb, cb_arg); 5360 } 5361 5362 static int 5363 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5364 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5365 spdk_bdev_io_completion_cb cb, void *cb_arg) 5366 { 5367 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5368 struct spdk_bdev_io *bdev_io; 5369 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5370 5371 if (!desc->write) { 5372 return -EBADF; 5373 } 5374 5375 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5376 return -EINVAL; 5377 } 5378 5379 bdev_io = bdev_channel_get_io(channel); 5380 if (!bdev_io) { 5381 return -ENOMEM; 5382 } 5383 5384 bdev_io->internal.ch = channel; 5385 bdev_io->internal.desc = desc; 5386 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5387 bdev_io->u.bdev.iovs = &bdev_io->iov; 5388 bdev_io->u.bdev.iovs[0].iov_base = buf; 5389 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5390 bdev_io->u.bdev.iovcnt = 1; 5391 bdev_io->u.bdev.md_buf = md_buf; 5392 bdev_io->u.bdev.num_blocks = num_blocks; 5393 bdev_io->u.bdev.offset_blocks = offset_blocks; 5394 bdev_io->u.bdev.memory_domain = NULL; 5395 bdev_io->u.bdev.memory_domain_ctx = NULL; 5396 bdev_io->u.bdev.accel_sequence = NULL; 5397 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5398 5399 bdev_io_submit(bdev_io); 5400 return 0; 5401 } 5402 5403 int 5404 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5405 void *buf, uint64_t offset, uint64_t nbytes, 5406 spdk_bdev_io_completion_cb cb, void *cb_arg) 5407 { 5408 uint64_t 
offset_blocks, num_blocks; 5409 5410 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5411 nbytes, &num_blocks) != 0) { 5412 return -EINVAL; 5413 } 5414 5415 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5416 } 5417 5418 int 5419 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5420 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5421 spdk_bdev_io_completion_cb cb, void *cb_arg) 5422 { 5423 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5424 cb, cb_arg); 5425 } 5426 5427 int 5428 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5429 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5430 spdk_bdev_io_completion_cb cb, void *cb_arg) 5431 { 5432 struct iovec iov = { 5433 .iov_base = buf, 5434 }; 5435 5436 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5437 return -EINVAL; 5438 } 5439 5440 if (md_buf && !_is_buf_allocated(&iov)) { 5441 return -EINVAL; 5442 } 5443 5444 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5445 cb, cb_arg); 5446 } 5447 5448 static int 5449 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5450 struct iovec *iov, int iovcnt, void *md_buf, 5451 uint64_t offset_blocks, uint64_t num_blocks, 5452 struct spdk_memory_domain *domain, void *domain_ctx, 5453 struct spdk_accel_sequence *seq, 5454 spdk_bdev_io_completion_cb cb, void *cb_arg) 5455 { 5456 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5457 struct spdk_bdev_io *bdev_io; 5458 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5459 5460 if (!desc->write) { 5461 return -EBADF; 5462 } 5463 5464 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5465 return -EINVAL; 5466 } 5467 5468 bdev_io = bdev_channel_get_io(channel); 5469 if (!bdev_io) { 5470 return -ENOMEM; 5471 } 5472 5473 bdev_io->internal.ch = channel; 5474 bdev_io->internal.desc = desc; 5475 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5476 bdev_io->u.bdev.iovs = iov; 5477 bdev_io->u.bdev.iovcnt = iovcnt; 5478 bdev_io->u.bdev.md_buf = md_buf; 5479 bdev_io->u.bdev.num_blocks = num_blocks; 5480 bdev_io->u.bdev.offset_blocks = offset_blocks; 5481 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5482 bdev_io->internal.memory_domain = domain; 5483 bdev_io->internal.memory_domain_ctx = domain_ctx; 5484 bdev_io->internal.accel_sequence = seq; 5485 bdev_io->internal.has_accel_sequence = seq != NULL; 5486 bdev_io->u.bdev.memory_domain = domain; 5487 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5488 bdev_io->u.bdev.accel_sequence = seq; 5489 5490 _bdev_io_submit_ext(desc, bdev_io); 5491 5492 return 0; 5493 } 5494 5495 int 5496 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5497 struct iovec *iov, int iovcnt, 5498 uint64_t offset, uint64_t len, 5499 spdk_bdev_io_completion_cb cb, void *cb_arg) 5500 { 5501 uint64_t offset_blocks, num_blocks; 5502 5503 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5504 len, &num_blocks) != 0) { 5505 return -EINVAL; 5506 } 5507 5508 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5509 } 5510 5511 int 5512 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5513 struct iovec *iov, int iovcnt, 5514 uint64_t offset_blocks, uint64_t num_blocks, 5515 spdk_bdev_io_completion_cb cb, void *cb_arg) 5516 { 
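	/* Plain writev path: no separate metadata buffer, no memory domain, and no
	 * accel sequence are passed down to the common writev helper.
	 */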
5517 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5518 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5519 } 5520 5521 int 5522 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5523 struct iovec *iov, int iovcnt, void *md_buf, 5524 uint64_t offset_blocks, uint64_t num_blocks, 5525 spdk_bdev_io_completion_cb cb, void *cb_arg) 5526 { 5527 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5528 return -EINVAL; 5529 } 5530 5531 if (md_buf && !_is_buf_allocated(iov)) { 5532 return -EINVAL; 5533 } 5534 5535 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5536 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5537 } 5538 5539 int 5540 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5541 struct iovec *iov, int iovcnt, 5542 uint64_t offset_blocks, uint64_t num_blocks, 5543 spdk_bdev_io_completion_cb cb, void *cb_arg, 5544 struct spdk_bdev_ext_io_opts *opts) 5545 { 5546 void *md = NULL; 5547 5548 if (opts) { 5549 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5550 return -EINVAL; 5551 } 5552 md = opts->metadata; 5553 } 5554 5555 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5556 return -EINVAL; 5557 } 5558 5559 if (md && !_is_buf_allocated(iov)) { 5560 return -EINVAL; 5561 } 5562 5563 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5564 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5565 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5566 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5567 cb, cb_arg); 5568 } 5569 5570 static void 5571 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5572 { 5573 struct spdk_bdev_io *parent_io = cb_arg; 5574 struct spdk_bdev *bdev = parent_io->bdev; 5575 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5576 int i, rc = 0; 5577 5578 if (!success) { 5579 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5580 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5581 spdk_bdev_free_io(bdev_io); 5582 return; 5583 } 5584 5585 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5586 rc = memcmp(read_buf, 5587 parent_io->u.bdev.iovs[i].iov_base, 5588 parent_io->u.bdev.iovs[i].iov_len); 5589 if (rc) { 5590 break; 5591 } 5592 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5593 } 5594 5595 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5596 rc = memcmp(bdev_io->u.bdev.md_buf, 5597 parent_io->u.bdev.md_buf, 5598 spdk_bdev_get_md_size(bdev)); 5599 } 5600 5601 spdk_bdev_free_io(bdev_io); 5602 5603 if (rc == 0) { 5604 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5605 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5606 } else { 5607 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5608 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5609 } 5610 } 5611 5612 static void 5613 bdev_compare_do_read(void *_bdev_io) 5614 { 5615 struct spdk_bdev_io *bdev_io = _bdev_io; 5616 int rc; 5617 5618 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5619 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5620 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5621 bdev_compare_do_read_done, bdev_io); 5622 5623 if (rc == -ENOMEM) { 5624 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5625 } else if (rc != 0) { 5626 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5627 
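		/* The read could not even be submitted (and not for lack of bdev_io
		 * resources), so the compare request is completed as failed via the
		 * callback below.
		 */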
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5628 } 5629 } 5630 5631 static int 5632 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5633 struct iovec *iov, int iovcnt, void *md_buf, 5634 uint64_t offset_blocks, uint64_t num_blocks, 5635 spdk_bdev_io_completion_cb cb, void *cb_arg) 5636 { 5637 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5638 struct spdk_bdev_io *bdev_io; 5639 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5640 5641 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5642 return -EINVAL; 5643 } 5644 5645 bdev_io = bdev_channel_get_io(channel); 5646 if (!bdev_io) { 5647 return -ENOMEM; 5648 } 5649 5650 bdev_io->internal.ch = channel; 5651 bdev_io->internal.desc = desc; 5652 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5653 bdev_io->u.bdev.iovs = iov; 5654 bdev_io->u.bdev.iovcnt = iovcnt; 5655 bdev_io->u.bdev.md_buf = md_buf; 5656 bdev_io->u.bdev.num_blocks = num_blocks; 5657 bdev_io->u.bdev.offset_blocks = offset_blocks; 5658 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5659 bdev_io->u.bdev.memory_domain = NULL; 5660 bdev_io->u.bdev.memory_domain_ctx = NULL; 5661 bdev_io->u.bdev.accel_sequence = NULL; 5662 5663 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5664 bdev_io_submit(bdev_io); 5665 return 0; 5666 } 5667 5668 bdev_compare_do_read(bdev_io); 5669 5670 return 0; 5671 } 5672 5673 int 5674 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5675 struct iovec *iov, int iovcnt, 5676 uint64_t offset_blocks, uint64_t num_blocks, 5677 spdk_bdev_io_completion_cb cb, void *cb_arg) 5678 { 5679 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5680 num_blocks, cb, cb_arg); 5681 } 5682 5683 int 5684 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5685 struct iovec *iov, int iovcnt, void *md_buf, 5686 uint64_t offset_blocks, uint64_t num_blocks, 5687 spdk_bdev_io_completion_cb cb, void *cb_arg) 5688 { 5689 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5690 return -EINVAL; 5691 } 5692 5693 if (md_buf && !_is_buf_allocated(iov)) { 5694 return -EINVAL; 5695 } 5696 5697 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5698 num_blocks, cb, cb_arg); 5699 } 5700 5701 static int 5702 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5703 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5704 spdk_bdev_io_completion_cb cb, void *cb_arg) 5705 { 5706 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5707 struct spdk_bdev_io *bdev_io; 5708 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5709 5710 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5711 return -EINVAL; 5712 } 5713 5714 bdev_io = bdev_channel_get_io(channel); 5715 if (!bdev_io) { 5716 return -ENOMEM; 5717 } 5718 5719 bdev_io->internal.ch = channel; 5720 bdev_io->internal.desc = desc; 5721 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5722 bdev_io->u.bdev.iovs = &bdev_io->iov; 5723 bdev_io->u.bdev.iovs[0].iov_base = buf; 5724 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5725 bdev_io->u.bdev.iovcnt = 1; 5726 bdev_io->u.bdev.md_buf = md_buf; 5727 bdev_io->u.bdev.num_blocks = num_blocks; 5728 bdev_io->u.bdev.offset_blocks = offset_blocks; 5729 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5730 bdev_io->u.bdev.memory_domain = NULL; 5731 bdev_io->u.bdev.memory_domain_ctx = 
NULL; 5732 bdev_io->u.bdev.accel_sequence = NULL; 5733 5734 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5735 bdev_io_submit(bdev_io); 5736 return 0; 5737 } 5738 5739 bdev_compare_do_read(bdev_io); 5740 5741 return 0; 5742 } 5743 5744 int 5745 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5746 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5747 spdk_bdev_io_completion_cb cb, void *cb_arg) 5748 { 5749 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5750 cb, cb_arg); 5751 } 5752 5753 int 5754 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5755 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5756 spdk_bdev_io_completion_cb cb, void *cb_arg) 5757 { 5758 struct iovec iov = { 5759 .iov_base = buf, 5760 }; 5761 5762 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5763 return -EINVAL; 5764 } 5765 5766 if (md_buf && !_is_buf_allocated(&iov)) { 5767 return -EINVAL; 5768 } 5769 5770 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5771 cb, cb_arg); 5772 } 5773 5774 static void 5775 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 5776 { 5777 struct spdk_bdev_io *bdev_io = ctx; 5778 5779 if (unlock_status) { 5780 SPDK_ERRLOG("LBA range unlock failed\n"); 5781 } 5782 5783 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5784 false, bdev_io->internal.caller_ctx); 5785 } 5786 5787 static void 5788 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5789 { 5790 bdev_io->internal.status = status; 5791 5792 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5793 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5794 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5795 } 5796 5797 static void 5798 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5799 { 5800 struct spdk_bdev_io *parent_io = cb_arg; 5801 5802 if (!success) { 5803 SPDK_ERRLOG("Compare and write operation failed\n"); 5804 } 5805 5806 spdk_bdev_free_io(bdev_io); 5807 5808 bdev_comparev_and_writev_blocks_unlock(parent_io, 5809 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5810 } 5811 5812 static void 5813 bdev_compare_and_write_do_write(void *_bdev_io) 5814 { 5815 struct spdk_bdev_io *bdev_io = _bdev_io; 5816 int rc; 5817 5818 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5819 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5820 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5821 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5822 bdev_compare_and_write_do_write_done, bdev_io); 5823 5824 5825 if (rc == -ENOMEM) { 5826 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5827 } else if (rc != 0) { 5828 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5829 } 5830 } 5831 5832 static void 5833 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5834 { 5835 struct spdk_bdev_io *parent_io = cb_arg; 5836 5837 spdk_bdev_free_io(bdev_io); 5838 5839 if (!success) { 5840 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5841 return; 5842 } 5843 5844 bdev_compare_and_write_do_write(parent_io); 5845 } 5846 5847 static void 5848 bdev_compare_and_write_do_compare(void *_bdev_io) 5849 { 5850 struct spdk_bdev_io *bdev_io = _bdev_io; 5851 int rc; 5852 5853 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5854 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5855 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5856 bdev_compare_and_write_do_compare_done, bdev_io); 5857 5858 if (rc == -ENOMEM) { 5859 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5860 } else if (rc != 0) { 5861 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5862 } 5863 } 5864 5865 static void 5866 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 5867 { 5868 struct spdk_bdev_io *bdev_io = ctx; 5869 5870 if (status) { 5871 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5872 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5873 return; 5874 } 5875 5876 bdev_compare_and_write_do_compare(bdev_io); 5877 } 5878 5879 int 5880 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5881 struct iovec *compare_iov, int compare_iovcnt, 5882 struct iovec *write_iov, int write_iovcnt, 5883 uint64_t offset_blocks, uint64_t num_blocks, 5884 spdk_bdev_io_completion_cb cb, void *cb_arg) 5885 { 5886 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5887 struct spdk_bdev_io *bdev_io; 5888 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5889 5890 if (!desc->write) { 5891 return -EBADF; 5892 } 5893 5894 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5895 return -EINVAL; 5896 } 5897 5898 if (num_blocks > bdev->acwu) { 5899 return -EINVAL; 5900 } 5901 5902 bdev_io = bdev_channel_get_io(channel); 5903 if (!bdev_io) { 5904 return -ENOMEM; 5905 } 5906 5907 bdev_io->internal.ch = channel; 5908 bdev_io->internal.desc = desc; 5909 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5910 bdev_io->u.bdev.iovs = compare_iov; 5911 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5912 bdev_io->u.bdev.fused_iovs = write_iov; 5913 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5914 bdev_io->u.bdev.md_buf = NULL; 5915 bdev_io->u.bdev.num_blocks = num_blocks; 5916 bdev_io->u.bdev.offset_blocks = offset_blocks; 5917 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5918 
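	/* When the module lacks native COMPARE_AND_WRITE support, the LBA range is
	 * locked below and the operation is emulated as a compare followed by a write. */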
bdev_io->u.bdev.memory_domain = NULL; 5919 bdev_io->u.bdev.memory_domain_ctx = NULL; 5920 bdev_io->u.bdev.accel_sequence = NULL; 5921 5922 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5923 bdev_io_submit(bdev_io); 5924 return 0; 5925 } 5926 5927 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5928 bdev_comparev_and_writev_blocks_locked, bdev_io); 5929 } 5930 5931 int 5932 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5933 struct iovec *iov, int iovcnt, 5934 uint64_t offset_blocks, uint64_t num_blocks, 5935 bool populate, 5936 spdk_bdev_io_completion_cb cb, void *cb_arg) 5937 { 5938 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5939 struct spdk_bdev_io *bdev_io; 5940 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5941 5942 if (!desc->write) { 5943 return -EBADF; 5944 } 5945 5946 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5947 return -EINVAL; 5948 } 5949 5950 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5951 return -ENOTSUP; 5952 } 5953 5954 bdev_io = bdev_channel_get_io(channel); 5955 if (!bdev_io) { 5956 return -ENOMEM; 5957 } 5958 5959 bdev_io->internal.ch = channel; 5960 bdev_io->internal.desc = desc; 5961 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5962 bdev_io->u.bdev.num_blocks = num_blocks; 5963 bdev_io->u.bdev.offset_blocks = offset_blocks; 5964 bdev_io->u.bdev.iovs = iov; 5965 bdev_io->u.bdev.iovcnt = iovcnt; 5966 bdev_io->u.bdev.md_buf = NULL; 5967 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5968 bdev_io->u.bdev.zcopy.commit = 0; 5969 bdev_io->u.bdev.zcopy.start = 1; 5970 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5971 bdev_io->u.bdev.memory_domain = NULL; 5972 bdev_io->u.bdev.memory_domain_ctx = NULL; 5973 bdev_io->u.bdev.accel_sequence = NULL; 5974 5975 bdev_io_submit(bdev_io); 5976 5977 return 0; 5978 } 5979 5980 int 5981 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5982 spdk_bdev_io_completion_cb cb, void *cb_arg) 5983 { 5984 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5985 return -EINVAL; 5986 } 5987 5988 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0;
	bdev_io->u.bdev.zcopy.start = 0;
	bdev_io->internal.caller_ctx = cb_arg;
	bdev_io->internal.cb = cb;
	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;

	bdev_io_submit(bdev_io);

	return 0;
}

int
spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset, uint64_t len,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
				 len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      uint64_t offset_blocks, uint64_t num_blocks,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
	    !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
		return -ENOTSUP;
	}

	bdev_io = bdev_channel_get_io(channel);

	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);
	bdev_io->u.bdev.memory_domain = NULL;
	bdev_io->u.bdev.memory_domain_ctx = NULL;
	bdev_io->u.bdev.accel_sequence = NULL;

	/* If the write_zeroes size is large and should be split, use the generic split
	 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not.
	 *
	 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
	 * or emulate it using a regular write request otherwise.
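	 *
	 * Note that the emulated path below (bdev_write_zero_buffer()) relies on the
	 * internal zero buffer, which is why a single block (including any metadata)
	 * is asserted further down to fit within ZERO_BUFFER_SIZE.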
6057 */ 6058 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6059 bdev_io->internal.split) { 6060 bdev_io_submit(bdev_io); 6061 return 0; 6062 } 6063 6064 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6065 6066 return bdev_write_zero_buffer(bdev_io); 6067 } 6068 6069 int 6070 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6071 uint64_t offset, uint64_t nbytes, 6072 spdk_bdev_io_completion_cb cb, void *cb_arg) 6073 { 6074 uint64_t offset_blocks, num_blocks; 6075 6076 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6077 nbytes, &num_blocks) != 0) { 6078 return -EINVAL; 6079 } 6080 6081 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6082 } 6083 6084 int 6085 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6086 uint64_t offset_blocks, uint64_t num_blocks, 6087 spdk_bdev_io_completion_cb cb, void *cb_arg) 6088 { 6089 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6090 struct spdk_bdev_io *bdev_io; 6091 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6092 6093 if (!desc->write) { 6094 return -EBADF; 6095 } 6096 6097 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6098 return -EINVAL; 6099 } 6100 6101 if (num_blocks == 0) { 6102 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 6103 return -EINVAL; 6104 } 6105 6106 bdev_io = bdev_channel_get_io(channel); 6107 if (!bdev_io) { 6108 return -ENOMEM; 6109 } 6110 6111 bdev_io->internal.ch = channel; 6112 bdev_io->internal.desc = desc; 6113 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6114 6115 bdev_io->u.bdev.iovs = &bdev_io->iov; 6116 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6117 bdev_io->u.bdev.iovs[0].iov_len = 0; 6118 bdev_io->u.bdev.iovcnt = 1; 6119 6120 bdev_io->u.bdev.offset_blocks = offset_blocks; 6121 bdev_io->u.bdev.num_blocks = num_blocks; 6122 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6123 bdev_io->u.bdev.memory_domain = NULL; 6124 bdev_io->u.bdev.memory_domain_ctx = NULL; 6125 bdev_io->u.bdev.accel_sequence = NULL; 6126 6127 bdev_io_submit(bdev_io); 6128 return 0; 6129 } 6130 6131 int 6132 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6133 uint64_t offset, uint64_t length, 6134 spdk_bdev_io_completion_cb cb, void *cb_arg) 6135 { 6136 uint64_t offset_blocks, num_blocks; 6137 6138 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6139 length, &num_blocks) != 0) { 6140 return -EINVAL; 6141 } 6142 6143 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6144 } 6145 6146 int 6147 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6148 uint64_t offset_blocks, uint64_t num_blocks, 6149 spdk_bdev_io_completion_cb cb, void *cb_arg) 6150 { 6151 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6152 struct spdk_bdev_io *bdev_io; 6153 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6154 6155 if (!desc->write) { 6156 return -EBADF; 6157 } 6158 6159 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6160 return -EINVAL; 6161 } 6162 6163 bdev_io = bdev_channel_get_io(channel); 6164 if (!bdev_io) { 6165 return -ENOMEM; 6166 } 6167 6168 bdev_io->internal.ch = channel; 6169 bdev_io->internal.desc = desc; 6170 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6171 bdev_io->u.bdev.iovs = NULL; 6172 bdev_io->u.bdev.iovcnt = 0; 6173 bdev_io->u.bdev.offset_blocks = offset_blocks; 6174 bdev_io->u.bdev.num_blocks = num_blocks; 6175 
bdev_io->u.bdev.memory_domain = NULL;
	bdev_io->u.bdev.memory_domain_ctx = NULL;
	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	bdev_io_submit(bdev_io);
	return 0;
}

static int bdev_reset_poll_for_outstanding_io(void *ctx);

static void
bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct spdk_bdev_channel *ch = _ctx;
	struct spdk_bdev_io *bdev_io;

	bdev_io = TAILQ_FIRST(&ch->queued_resets);

	if (status == -EBUSY) {
		if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
			bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
							      ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
		} else {
			TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);

			if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) {
				/* If outstanding IOs are still present and reset_io_drain_timeout
				 * seconds passed, start the reset. */
				bdev_io_submit_reset(bdev_io);
			} else {
				/* We still have an in-progress memory domain pull/push or we're
				 * executing an accel sequence. Since we cannot abort either of those
				 * operations, fail the reset request. */
				spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
			}
		}
	} else {
		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
		SPDK_DEBUGLOG(bdev,
			      "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
			      ch->bdev->name);
		/* Mark the completion status as SUCCESS and complete the reset. */
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	}
}

static void
bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *io_ch, void *_ctx)
{
	struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch);
	int status = 0;

	if (cur_ch->io_outstanding > 0 ||
	    !TAILQ_EMPTY(&cur_ch->io_memory_domain) ||
	    !TAILQ_EMPTY(&cur_ch->io_accel_exec)) {
		/* If a channel has outstanding IO, set status to -EBUSY. This will stop
		 * further iteration over the rest of the channels and pass non-zero status
		 * to the callback function.
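		 * bdev_reset_check_outstanding_io_done() then either re-arms the poller while
		 * the reset_io_drain_timeout deadline has not yet expired, or, once it has,
		 * submits the reset or fails it if memory domain or accel operations are
		 * still in flight.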
*/ 6235 status = -EBUSY; 6236 } 6237 spdk_bdev_for_each_channel_continue(i, status); 6238 } 6239 6240 static int 6241 bdev_reset_poll_for_outstanding_io(void *ctx) 6242 { 6243 struct spdk_bdev_channel *ch = ctx; 6244 struct spdk_bdev_io *bdev_io; 6245 6246 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6247 6248 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6249 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6250 bdev_reset_check_outstanding_io_done); 6251 6252 return SPDK_POLLER_BUSY; 6253 } 6254 6255 static void 6256 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6257 { 6258 struct spdk_bdev_channel *ch = _ctx; 6259 struct spdk_bdev_io *bdev_io; 6260 6261 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6262 6263 if (bdev->reset_io_drain_timeout == 0) { 6264 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6265 6266 bdev_io_submit_reset(bdev_io); 6267 return; 6268 } 6269 6270 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6271 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6272 6273 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6274 * submit the reset to the underlying module only if outstanding I/O 6275 * remain after reset_io_drain_timeout seconds have passed. */ 6276 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6277 bdev_reset_check_outstanding_io_done); 6278 } 6279 6280 static void 6281 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6282 struct spdk_io_channel *ch, void *_ctx) 6283 { 6284 struct spdk_bdev_channel *channel; 6285 struct spdk_bdev_mgmt_channel *mgmt_channel; 6286 struct spdk_bdev_shared_resource *shared_resource; 6287 bdev_io_tailq_t tmp_queued; 6288 6289 TAILQ_INIT(&tmp_queued); 6290 6291 channel = __io_ch_to_bdev_ch(ch); 6292 shared_resource = channel->shared_resource; 6293 mgmt_channel = shared_resource->mgmt_ch; 6294 6295 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6296 6297 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6298 /* The QoS object is always valid and readable while 6299 * the channel flag is set, so the lock here should not 6300 * be necessary. We're not in the fast path though, so 6301 * just take it anyway. */ 6302 spdk_spin_lock(&channel->bdev->internal.spinlock); 6303 if (channel->bdev->internal.qos->ch == channel) { 6304 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 6305 } 6306 spdk_spin_unlock(&channel->bdev->internal.spinlock); 6307 } 6308 6309 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6310 bdev_abort_all_buf_io(mgmt_channel, channel); 6311 bdev_abort_all_queued_io(&tmp_queued, channel); 6312 6313 spdk_bdev_for_each_channel_continue(i, 0); 6314 } 6315 6316 static void 6317 bdev_start_reset(void *ctx) 6318 { 6319 struct spdk_bdev_channel *ch = ctx; 6320 6321 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6322 bdev_reset_freeze_channel_done); 6323 } 6324 6325 static void 6326 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6327 { 6328 struct spdk_bdev *bdev = ch->bdev; 6329 6330 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6331 6332 spdk_spin_lock(&bdev->internal.spinlock); 6333 if (bdev->internal.reset_in_progress == NULL) { 6334 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6335 /* 6336 * Take a channel reference for the target bdev for the life of this 6337 * reset. 
This guards against the channel getting destroyed while 6338 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6339 * progress. We will release the reference when this reset is 6340 * completed. 6341 */ 6342 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6343 bdev_start_reset(ch); 6344 } 6345 spdk_spin_unlock(&bdev->internal.spinlock); 6346 } 6347 6348 int 6349 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6350 spdk_bdev_io_completion_cb cb, void *cb_arg) 6351 { 6352 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6353 struct spdk_bdev_io *bdev_io; 6354 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6355 6356 bdev_io = bdev_channel_get_io(channel); 6357 if (!bdev_io) { 6358 return -ENOMEM; 6359 } 6360 6361 bdev_io->internal.ch = channel; 6362 bdev_io->internal.desc = desc; 6363 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6364 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6365 bdev_io->u.reset.ch_ref = NULL; 6366 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6367 6368 spdk_spin_lock(&bdev->internal.spinlock); 6369 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6370 spdk_spin_unlock(&bdev->internal.spinlock); 6371 6372 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6373 internal.ch_link); 6374 6375 bdev_channel_start_reset(channel); 6376 6377 return 0; 6378 } 6379 6380 void 6381 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6382 struct spdk_bdev_io_stat *stat) 6383 { 6384 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6385 6386 bdev_get_io_stat(stat, channel->stat); 6387 } 6388 6389 static void 6390 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6391 { 6392 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6393 6394 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6395 bdev_iostat_ctx->cb_arg, 0); 6396 free(bdev_iostat_ctx); 6397 } 6398 6399 static void 6400 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6401 struct spdk_io_channel *ch, void *_ctx) 6402 { 6403 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6404 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6405 6406 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6407 spdk_bdev_for_each_channel_continue(i, 0); 6408 } 6409 6410 void 6411 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6412 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6413 { 6414 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6415 6416 assert(bdev != NULL); 6417 assert(stat != NULL); 6418 assert(cb != NULL); 6419 6420 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6421 if (bdev_iostat_ctx == NULL) { 6422 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6423 cb(bdev, stat, cb_arg, -ENOMEM); 6424 return; 6425 } 6426 6427 bdev_iostat_ctx->stat = stat; 6428 bdev_iostat_ctx->cb = cb; 6429 bdev_iostat_ctx->cb_arg = cb_arg; 6430 6431 /* Start with the statistics from previously deleted channels. */ 6432 spdk_spin_lock(&bdev->internal.spinlock); 6433 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6434 spdk_spin_unlock(&bdev->internal.spinlock); 6435 6436 /* Then iterate and add the statistics from each existing channel. 
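 * Each per-channel stat is accumulated into bdev_iostat_ctx->stat by
 * bdev_get_each_channel_stat(); bdev_get_device_stat_done() then invokes the
 * caller's callback and frees the iteration context.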
*/ 6437 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6438 bdev_get_device_stat_done); 6439 } 6440 6441 struct bdev_iostat_reset_ctx { 6442 enum spdk_bdev_reset_stat_mode mode; 6443 bdev_reset_device_stat_cb cb; 6444 void *cb_arg; 6445 }; 6446 6447 static void 6448 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6449 { 6450 struct bdev_iostat_reset_ctx *ctx = _ctx; 6451 6452 ctx->cb(bdev, ctx->cb_arg, 0); 6453 6454 free(ctx); 6455 } 6456 6457 static void 6458 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6459 struct spdk_io_channel *ch, void *_ctx) 6460 { 6461 struct bdev_iostat_reset_ctx *ctx = _ctx; 6462 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6463 6464 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6465 6466 spdk_bdev_for_each_channel_continue(i, 0); 6467 } 6468 6469 void 6470 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6471 bdev_reset_device_stat_cb cb, void *cb_arg) 6472 { 6473 struct bdev_iostat_reset_ctx *ctx; 6474 6475 assert(bdev != NULL); 6476 assert(cb != NULL); 6477 6478 ctx = calloc(1, sizeof(*ctx)); 6479 if (ctx == NULL) { 6480 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6481 cb(bdev, cb_arg, -ENOMEM); 6482 return; 6483 } 6484 6485 ctx->mode = mode; 6486 ctx->cb = cb; 6487 ctx->cb_arg = cb_arg; 6488 6489 spdk_spin_lock(&bdev->internal.spinlock); 6490 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6491 spdk_spin_unlock(&bdev->internal.spinlock); 6492 6493 spdk_bdev_for_each_channel(bdev, 6494 bdev_reset_each_channel_stat, 6495 ctx, 6496 bdev_reset_device_stat_done); 6497 } 6498 6499 int 6500 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6501 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6502 spdk_bdev_io_completion_cb cb, void *cb_arg) 6503 { 6504 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6505 struct spdk_bdev_io *bdev_io; 6506 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6507 6508 if (!desc->write) { 6509 return -EBADF; 6510 } 6511 6512 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6513 return -ENOTSUP; 6514 } 6515 6516 bdev_io = bdev_channel_get_io(channel); 6517 if (!bdev_io) { 6518 return -ENOMEM; 6519 } 6520 6521 bdev_io->internal.ch = channel; 6522 bdev_io->internal.desc = desc; 6523 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6524 bdev_io->u.nvme_passthru.cmd = *cmd; 6525 bdev_io->u.nvme_passthru.buf = buf; 6526 bdev_io->u.nvme_passthru.nbytes = nbytes; 6527 bdev_io->u.nvme_passthru.md_buf = NULL; 6528 bdev_io->u.nvme_passthru.md_len = 0; 6529 6530 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6531 6532 bdev_io_submit(bdev_io); 6533 return 0; 6534 } 6535 6536 int 6537 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6538 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6539 spdk_bdev_io_completion_cb cb, void *cb_arg) 6540 { 6541 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6542 struct spdk_bdev_io *bdev_io; 6543 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6544 6545 if (!desc->write) { 6546 /* 6547 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6548 * to easily determine if the command is a read or write, but for now just 6549 * do not allow io_passthru with a read-only descriptor. 
6550 */ 6551 return -EBADF; 6552 } 6553 6554 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6555 return -ENOTSUP; 6556 } 6557 6558 bdev_io = bdev_channel_get_io(channel); 6559 if (!bdev_io) { 6560 return -ENOMEM; 6561 } 6562 6563 bdev_io->internal.ch = channel; 6564 bdev_io->internal.desc = desc; 6565 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6566 bdev_io->u.nvme_passthru.cmd = *cmd; 6567 bdev_io->u.nvme_passthru.buf = buf; 6568 bdev_io->u.nvme_passthru.nbytes = nbytes; 6569 bdev_io->u.nvme_passthru.md_buf = NULL; 6570 bdev_io->u.nvme_passthru.md_len = 0; 6571 6572 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6573 6574 bdev_io_submit(bdev_io); 6575 return 0; 6576 } 6577 6578 int 6579 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6580 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6581 spdk_bdev_io_completion_cb cb, void *cb_arg) 6582 { 6583 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6584 struct spdk_bdev_io *bdev_io; 6585 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6586 6587 if (!desc->write) { 6588 /* 6589 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6590 * to easily determine if the command is a read or write, but for now just 6591 * do not allow io_passthru with a read-only descriptor. 6592 */ 6593 return -EBADF; 6594 } 6595 6596 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6597 return -ENOTSUP; 6598 } 6599 6600 bdev_io = bdev_channel_get_io(channel); 6601 if (!bdev_io) { 6602 return -ENOMEM; 6603 } 6604 6605 bdev_io->internal.ch = channel; 6606 bdev_io->internal.desc = desc; 6607 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6608 bdev_io->u.nvme_passthru.cmd = *cmd; 6609 bdev_io->u.nvme_passthru.buf = buf; 6610 bdev_io->u.nvme_passthru.nbytes = nbytes; 6611 bdev_io->u.nvme_passthru.md_buf = md_buf; 6612 bdev_io->u.nvme_passthru.md_len = md_len; 6613 6614 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6615 6616 bdev_io_submit(bdev_io); 6617 return 0; 6618 } 6619 6620 static void bdev_abort_retry(void *ctx); 6621 static void bdev_abort(struct spdk_bdev_io *parent_io); 6622 6623 static void 6624 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6625 { 6626 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6627 struct spdk_bdev_io *parent_io = cb_arg; 6628 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6629 6630 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6631 6632 spdk_bdev_free_io(bdev_io); 6633 6634 if (!success) { 6635 /* Check if the target I/O completed in the meantime. */ 6636 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6637 if (tmp_io == bio_to_abort) { 6638 break; 6639 } 6640 } 6641 6642 /* If the target I/O still exists, set the parent to failed. 
*/ 6643 if (tmp_io != NULL) { 6644 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6645 } 6646 } 6647 6648 parent_io->u.bdev.split_outstanding--; 6649 if (parent_io->u.bdev.split_outstanding == 0) { 6650 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6651 bdev_abort_retry(parent_io); 6652 } else { 6653 bdev_io_complete(parent_io); 6654 } 6655 } 6656 } 6657 6658 static int 6659 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6660 struct spdk_bdev_io *bio_to_abort, 6661 spdk_bdev_io_completion_cb cb, void *cb_arg) 6662 { 6663 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6664 struct spdk_bdev_io *bdev_io; 6665 6666 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6667 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6668 /* TODO: Abort reset or abort request. */ 6669 return -ENOTSUP; 6670 } 6671 6672 bdev_io = bdev_channel_get_io(channel); 6673 if (bdev_io == NULL) { 6674 return -ENOMEM; 6675 } 6676 6677 bdev_io->internal.ch = channel; 6678 bdev_io->internal.desc = desc; 6679 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6680 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6681 6682 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6683 assert(bdev_io_should_split(bio_to_abort)); 6684 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6685 6686 /* Parent abort request is not submitted directly, but to manage its 6687 * execution add it to the submitted list here. 6688 */ 6689 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6690 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6691 6692 bdev_abort(bdev_io); 6693 6694 return 0; 6695 } 6696 6697 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6698 6699 /* Submit the abort request to the underlying bdev module. */ 6700 bdev_io_submit(bdev_io); 6701 6702 return 0; 6703 } 6704 6705 static bool 6706 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6707 { 6708 struct spdk_bdev_io *iter; 6709 6710 TAILQ_FOREACH(iter, tailq, internal.link) { 6711 if (iter == bdev_io) { 6712 return true; 6713 } 6714 } 6715 6716 return false; 6717 } 6718 6719 static uint32_t 6720 _bdev_abort(struct spdk_bdev_io *parent_io) 6721 { 6722 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6723 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6724 void *bio_cb_arg; 6725 struct spdk_bdev_io *bio_to_abort; 6726 uint32_t matched_ios; 6727 int rc; 6728 6729 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6730 6731 /* matched_ios is returned and will be kept by the caller. 6732 * 6733 * This function will be used for two cases, 1) the same cb_arg is used for 6734 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6735 * Incrementing split_outstanding directly here may confuse readers especially 6736 * for the 1st case. 6737 * 6738 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6739 * works as expected. 6740 */ 6741 matched_ios = 0; 6742 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6743 6744 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6745 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6746 continue; 6747 } 6748 6749 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6750 /* Any I/O which was submitted after this abort command should be excluded. 
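		 * Comparing internal.submit_tsc values gives a consistent ordering here,
		 * since both the abort request and the target I/Os were submitted on this
		 * channel's thread.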
*/ 6751 continue; 6752 } 6753 6754 /* We can't abort a request that's being pushed/pulled or executed by accel */ 6755 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 6756 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 6757 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6758 break; 6759 } 6760 6761 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6762 if (rc != 0) { 6763 if (rc == -ENOMEM) { 6764 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6765 } else { 6766 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6767 } 6768 break; 6769 } 6770 matched_ios++; 6771 } 6772 6773 return matched_ios; 6774 } 6775 6776 static void 6777 bdev_abort_retry(void *ctx) 6778 { 6779 struct spdk_bdev_io *parent_io = ctx; 6780 uint32_t matched_ios; 6781 6782 matched_ios = _bdev_abort(parent_io); 6783 6784 if (matched_ios == 0) { 6785 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6786 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6787 } else { 6788 /* For retry, the case that no target I/O was found is success 6789 * because it means target I/Os completed in the meantime. 6790 */ 6791 bdev_io_complete(parent_io); 6792 } 6793 return; 6794 } 6795 6796 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6797 parent_io->u.bdev.split_outstanding = matched_ios; 6798 } 6799 6800 static void 6801 bdev_abort(struct spdk_bdev_io *parent_io) 6802 { 6803 uint32_t matched_ios; 6804 6805 matched_ios = _bdev_abort(parent_io); 6806 6807 if (matched_ios == 0) { 6808 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6809 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6810 } else { 6811 /* The case the no target I/O was found is failure. */ 6812 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6813 bdev_io_complete(parent_io); 6814 } 6815 return; 6816 } 6817 6818 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6819 parent_io->u.bdev.split_outstanding = matched_ios; 6820 } 6821 6822 int 6823 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6824 void *bio_cb_arg, 6825 spdk_bdev_io_completion_cb cb, void *cb_arg) 6826 { 6827 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6828 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6829 struct spdk_bdev_io *bdev_io; 6830 6831 if (bio_cb_arg == NULL) { 6832 return -EINVAL; 6833 } 6834 6835 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6836 return -ENOTSUP; 6837 } 6838 6839 bdev_io = bdev_channel_get_io(channel); 6840 if (bdev_io == NULL) { 6841 return -ENOMEM; 6842 } 6843 6844 bdev_io->internal.ch = channel; 6845 bdev_io->internal.desc = desc; 6846 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6847 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6848 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6849 6850 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6851 6852 /* Parent abort request is not submitted directly, but to manage its execution, 6853 * add it to the submitted list here. 
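	 * bdev_abort() then issues one child abort per matching I/O; the parent is
	 * completed (or queued for retry on -ENOMEM) once all children have finished.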
6854 */ 6855 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6856 6857 bdev_abort(bdev_io); 6858 6859 return 0; 6860 } 6861 6862 int 6863 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6864 struct spdk_bdev_io_wait_entry *entry) 6865 { 6866 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6867 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6868 6869 if (bdev != entry->bdev) { 6870 SPDK_ERRLOG("bdevs do not match\n"); 6871 return -EINVAL; 6872 } 6873 6874 if (mgmt_ch->per_thread_cache_count > 0) { 6875 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6876 return -EINVAL; 6877 } 6878 6879 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6880 return 0; 6881 } 6882 6883 static inline void 6884 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6885 { 6886 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6887 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6888 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6889 uint32_t blocklen = bdev_io->bdev->blocklen; 6890 6891 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6892 switch (bdev_io->type) { 6893 case SPDK_BDEV_IO_TYPE_READ: 6894 io_stat->bytes_read += num_blocks * blocklen; 6895 io_stat->num_read_ops++; 6896 io_stat->read_latency_ticks += tsc_diff; 6897 if (io_stat->max_read_latency_ticks < tsc_diff) { 6898 io_stat->max_read_latency_ticks = tsc_diff; 6899 } 6900 if (io_stat->min_read_latency_ticks > tsc_diff) { 6901 io_stat->min_read_latency_ticks = tsc_diff; 6902 } 6903 break; 6904 case SPDK_BDEV_IO_TYPE_WRITE: 6905 io_stat->bytes_written += num_blocks * blocklen; 6906 io_stat->num_write_ops++; 6907 io_stat->write_latency_ticks += tsc_diff; 6908 if (io_stat->max_write_latency_ticks < tsc_diff) { 6909 io_stat->max_write_latency_ticks = tsc_diff; 6910 } 6911 if (io_stat->min_write_latency_ticks > tsc_diff) { 6912 io_stat->min_write_latency_ticks = tsc_diff; 6913 } 6914 break; 6915 case SPDK_BDEV_IO_TYPE_UNMAP: 6916 io_stat->bytes_unmapped += num_blocks * blocklen; 6917 io_stat->num_unmap_ops++; 6918 io_stat->unmap_latency_ticks += tsc_diff; 6919 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6920 io_stat->max_unmap_latency_ticks = tsc_diff; 6921 } 6922 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6923 io_stat->min_unmap_latency_ticks = tsc_diff; 6924 } 6925 break; 6926 case SPDK_BDEV_IO_TYPE_ZCOPY: 6927 /* Track the data in the start phase only */ 6928 if (bdev_io->u.bdev.zcopy.start) { 6929 if (bdev_io->u.bdev.zcopy.populate) { 6930 io_stat->bytes_read += num_blocks * blocklen; 6931 io_stat->num_read_ops++; 6932 io_stat->read_latency_ticks += tsc_diff; 6933 if (io_stat->max_read_latency_ticks < tsc_diff) { 6934 io_stat->max_read_latency_ticks = tsc_diff; 6935 } 6936 if (io_stat->min_read_latency_ticks > tsc_diff) { 6937 io_stat->min_read_latency_ticks = tsc_diff; 6938 } 6939 } else { 6940 io_stat->bytes_written += num_blocks * blocklen; 6941 io_stat->num_write_ops++; 6942 io_stat->write_latency_ticks += tsc_diff; 6943 if (io_stat->max_write_latency_ticks < tsc_diff) { 6944 io_stat->max_write_latency_ticks = tsc_diff; 6945 } 6946 if (io_stat->min_write_latency_ticks > tsc_diff) { 6947 io_stat->min_write_latency_ticks = tsc_diff; 6948 } 6949 } 6950 } 6951 break; 6952 case SPDK_BDEV_IO_TYPE_COPY: 6953 io_stat->bytes_copied += num_blocks * blocklen; 6954 io_stat->num_copy_ops++; 6955 bdev_io->internal.ch->stat->copy_latency_ticks += 
tsc_diff; 6956 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6957 io_stat->max_copy_latency_ticks = tsc_diff; 6958 } 6959 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6960 io_stat->min_copy_latency_ticks = tsc_diff; 6961 } 6962 break; 6963 default: 6964 break; 6965 } 6966 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6967 io_stat = bdev_io->bdev->internal.stat; 6968 assert(io_stat->io_error != NULL); 6969 6970 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6971 io_stat->io_error->error_status[-io_status - 1]++; 6972 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6973 } 6974 6975 #ifdef SPDK_CONFIG_VTUNE 6976 uint64_t now_tsc = spdk_get_ticks(); 6977 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6978 uint64_t data[5]; 6979 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6980 6981 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6982 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6983 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6984 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6985 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6986 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6987 6988 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6989 __itt_metadata_u64, 5, data); 6990 6991 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6992 bdev_io->internal.ch->start_tsc = now_tsc; 6993 } 6994 #endif 6995 } 6996 6997 static inline void 6998 _bdev_io_complete(void *ctx) 6999 { 7000 struct spdk_bdev_io *bdev_io = ctx; 7001 7002 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 7003 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7004 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7005 } 7006 7007 assert(bdev_io->internal.cb != NULL); 7008 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7009 7010 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7011 bdev_io->internal.caller_ctx); 7012 } 7013 7014 static inline void 7015 bdev_io_complete(void *ctx) 7016 { 7017 struct spdk_bdev_io *bdev_io = ctx; 7018 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7019 uint64_t tsc, tsc_diff; 7020 7021 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 7022 /* 7023 * Defer completion to avoid potential infinite recursion if the 7024 * user's completion callback issues a new I/O. 7025 */ 7026 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7027 bdev_io_complete, bdev_io); 7028 return; 7029 } 7030 7031 tsc = spdk_get_ticks(); 7032 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7033 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 7034 bdev_io->internal.caller_ctx); 7035 7036 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 7037 7038 if (bdev_io->internal.ch->histogram) { 7039 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 7040 } 7041 7042 bdev_io_update_io_stat(bdev_io, tsc_diff); 7043 _bdev_io_complete(bdev_io); 7044 } 7045 7046 /* The difference between this function and bdev_io_complete() is that this should be called to 7047 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7048 * io_submitted list and don't have submit_tsc updated. 
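 * Consequently, per-channel I/O statistics and the latency histogram are not
 * updated for such I/Os; only the accel sequence (if any) is aborted and the
 * user callback is invoked.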
7049 */ 7050 static inline void 7051 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7052 { 7053 /* Since the IO hasn't been submitted it's bound to be failed */ 7054 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7055 7056 /* At this point we don't know if the IO is completed from submission context or not, but, 7057 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7058 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7059 _bdev_io_complete, bdev_io); 7060 } 7061 7062 static void bdev_destroy_cb(void *io_device); 7063 7064 static void 7065 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7066 { 7067 struct spdk_bdev_io *bdev_io = _ctx; 7068 7069 if (bdev_io->u.reset.ch_ref != NULL) { 7070 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7071 bdev_io->u.reset.ch_ref = NULL; 7072 } 7073 7074 bdev_io_complete(bdev_io); 7075 7076 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7077 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7078 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7079 } 7080 } 7081 7082 static void 7083 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7084 struct spdk_io_channel *_ch, void *_ctx) 7085 { 7086 struct spdk_bdev_io *bdev_io = _ctx; 7087 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7088 struct spdk_bdev_io *queued_reset; 7089 7090 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7091 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7092 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7093 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7094 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7095 } 7096 7097 spdk_bdev_for_each_channel_continue(i, 0); 7098 } 7099 7100 static void 7101 bdev_io_complete_sequence_cb(void *ctx, int status) 7102 { 7103 struct spdk_bdev_io *bdev_io = ctx; 7104 7105 /* u.bdev.accel_sequence should have already been cleared at this point */ 7106 assert(bdev_io->u.bdev.accel_sequence == NULL); 7107 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7108 bdev_io->internal.accel_sequence = NULL; 7109 7110 if (spdk_unlikely(status != 0)) { 7111 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7112 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7113 } 7114 7115 bdev_io_complete(bdev_io); 7116 } 7117 7118 void 7119 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7120 { 7121 struct spdk_bdev *bdev = bdev_io->bdev; 7122 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7123 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7124 7125 if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) { 7126 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7127 spdk_bdev_get_module_name(bdev), 7128 bdev_io_status_get_string(bdev_io->internal.status)); 7129 assert(false); 7130 } 7131 bdev_io->internal.status = status; 7132 7133 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7134 bool unlock_channels = false; 7135 7136 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7137 SPDK_ERRLOG("NOMEM returned for reset\n"); 7138 } 7139 spdk_spin_lock(&bdev->internal.spinlock); 7140 if (bdev_io == bdev->internal.reset_in_progress) { 7141 bdev->internal.reset_in_progress = NULL; 7142 unlock_channels = true; 7143 } 7144 spdk_spin_unlock(&bdev->internal.spinlock); 7145 7146 if (unlock_channels) { 7147 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 
7148 bdev_reset_complete); 7149 return; 7150 } 7151 } else { 7152 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7153 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7154 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7155 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7156 return; 7157 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0 && 7158 !bdev_io_use_accel_sequence(bdev_io))) { 7159 _bdev_io_push_bounce_data_buffer(bdev_io, 7160 _bdev_io_complete_push_bounce_done); 7161 /* bdev IO will be completed in the callback */ 7162 return; 7163 } 7164 } 7165 7166 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7167 return; 7168 } 7169 } 7170 7171 bdev_io_complete(bdev_io); 7172 } 7173 7174 void 7175 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7176 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7177 { 7178 enum spdk_bdev_io_status status; 7179 7180 if (sc == SPDK_SCSI_STATUS_GOOD) { 7181 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7182 } else { 7183 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7184 bdev_io->internal.error.scsi.sc = sc; 7185 bdev_io->internal.error.scsi.sk = sk; 7186 bdev_io->internal.error.scsi.asc = asc; 7187 bdev_io->internal.error.scsi.ascq = ascq; 7188 } 7189 7190 spdk_bdev_io_complete(bdev_io, status); 7191 } 7192 7193 void 7194 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7195 int *sc, int *sk, int *asc, int *ascq) 7196 { 7197 assert(sc != NULL); 7198 assert(sk != NULL); 7199 assert(asc != NULL); 7200 assert(ascq != NULL); 7201 7202 switch (bdev_io->internal.status) { 7203 case SPDK_BDEV_IO_STATUS_SUCCESS: 7204 *sc = SPDK_SCSI_STATUS_GOOD; 7205 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7206 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7207 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7208 break; 7209 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7210 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7211 break; 7212 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7213 *sc = bdev_io->internal.error.scsi.sc; 7214 *sk = bdev_io->internal.error.scsi.sk; 7215 *asc = bdev_io->internal.error.scsi.asc; 7216 *ascq = bdev_io->internal.error.scsi.ascq; 7217 break; 7218 default: 7219 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7220 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7221 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7222 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7223 break; 7224 } 7225 } 7226 7227 void 7228 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7229 { 7230 enum spdk_bdev_io_status status; 7231 7232 if (aio_result == 0) { 7233 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7234 } else { 7235 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7236 } 7237 7238 bdev_io->internal.error.aio_result = aio_result; 7239 7240 spdk_bdev_io_complete(bdev_io, status); 7241 } 7242 7243 void 7244 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7245 { 7246 assert(aio_result != NULL); 7247 7248 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7249 *aio_result = bdev_io->internal.error.aio_result; 7250 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7251 *aio_result = 0; 7252 } else { 7253 *aio_result = -EIO; 7254 } 7255 } 7256 7257 void 7258 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7259 { 7260 enum spdk_bdev_io_status status; 7261 7262 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 7263 status = 
SPDK_BDEV_IO_STATUS_SUCCESS; 7264 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7265 status = SPDK_BDEV_IO_STATUS_ABORTED; 7266 } else { 7267 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7268 } 7269 7270 bdev_io->internal.error.nvme.cdw0 = cdw0; 7271 bdev_io->internal.error.nvme.sct = sct; 7272 bdev_io->internal.error.nvme.sc = sc; 7273 7274 spdk_bdev_io_complete(bdev_io, status); 7275 } 7276 7277 void 7278 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7279 { 7280 assert(sct != NULL); 7281 assert(sc != NULL); 7282 assert(cdw0 != NULL); 7283 7284 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7285 *sct = SPDK_NVME_SCT_GENERIC; 7286 *sc = SPDK_NVME_SC_SUCCESS; 7287 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7288 *cdw0 = 0; 7289 } else { 7290 *cdw0 = 1U; 7291 } 7292 return; 7293 } 7294 7295 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7296 *sct = bdev_io->internal.error.nvme.sct; 7297 *sc = bdev_io->internal.error.nvme.sc; 7298 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7299 *sct = SPDK_NVME_SCT_GENERIC; 7300 *sc = SPDK_NVME_SC_SUCCESS; 7301 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7302 *sct = SPDK_NVME_SCT_GENERIC; 7303 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7304 } else { 7305 *sct = SPDK_NVME_SCT_GENERIC; 7306 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7307 } 7308 7309 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7310 } 7311 7312 void 7313 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7314 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7315 { 7316 assert(first_sct != NULL); 7317 assert(first_sc != NULL); 7318 assert(second_sct != NULL); 7319 assert(second_sc != NULL); 7320 assert(cdw0 != NULL); 7321 7322 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7323 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7324 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7325 *first_sct = bdev_io->internal.error.nvme.sct; 7326 *first_sc = bdev_io->internal.error.nvme.sc; 7327 *second_sct = SPDK_NVME_SCT_GENERIC; 7328 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7329 } else { 7330 *first_sct = SPDK_NVME_SCT_GENERIC; 7331 *first_sc = SPDK_NVME_SC_SUCCESS; 7332 *second_sct = bdev_io->internal.error.nvme.sct; 7333 *second_sc = bdev_io->internal.error.nvme.sc; 7334 } 7335 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7336 *first_sct = SPDK_NVME_SCT_GENERIC; 7337 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7338 *second_sct = SPDK_NVME_SCT_GENERIC; 7339 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7340 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7341 *first_sct = SPDK_NVME_SCT_GENERIC; 7342 *first_sc = SPDK_NVME_SC_SUCCESS; 7343 *second_sct = SPDK_NVME_SCT_GENERIC; 7344 *second_sc = SPDK_NVME_SC_SUCCESS; 7345 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7346 *first_sct = SPDK_NVME_SCT_GENERIC; 7347 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7348 *second_sct = SPDK_NVME_SCT_GENERIC; 7349 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7350 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7351 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7352 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7353 *second_sct = SPDK_NVME_SCT_GENERIC; 7354 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7355 } else { 
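		/* Any other completion status maps to a generic internal device error for
		 * both commands in the fused pair. */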
7356 *first_sct = SPDK_NVME_SCT_GENERIC; 7357 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7358 *second_sct = SPDK_NVME_SCT_GENERIC; 7359 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7360 } 7361 7362 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7363 } 7364 7365 struct spdk_thread * 7366 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7367 { 7368 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7369 } 7370 7371 struct spdk_io_channel * 7372 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7373 { 7374 return bdev_io->internal.ch->channel; 7375 } 7376 7377 static int 7378 bdev_register(struct spdk_bdev *bdev) 7379 { 7380 char *bdev_name; 7381 char uuid[SPDK_UUID_STRING_LEN]; 7382 struct spdk_iobuf_opts iobuf_opts; 7383 int ret, i; 7384 7385 assert(bdev->module != NULL); 7386 7387 if (!bdev->name) { 7388 SPDK_ERRLOG("Bdev name is NULL\n"); 7389 return -EINVAL; 7390 } 7391 7392 if (!strlen(bdev->name)) { 7393 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7394 return -EINVAL; 7395 } 7396 7397 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7398 if (bdev->fn_table->accel_sequence_supported == NULL) { 7399 continue; 7400 } 7401 if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7402 (enum spdk_bdev_io_type)i)) { 7403 continue; 7404 } 7405 7406 if (spdk_bdev_is_md_separate(bdev)) { 7407 SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with " 7408 "accel sequence support\n"); 7409 return -EINVAL; 7410 } 7411 } 7412 7413 /* Users often register their own I/O devices using the bdev name. In 7414 * order to avoid conflicts, prepend bdev_. */ 7415 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7416 if (!bdev_name) { 7417 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7418 return -ENOMEM; 7419 } 7420 7421 bdev->internal.stat = bdev_alloc_io_stat(true); 7422 if (!bdev->internal.stat) { 7423 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7424 free(bdev_name); 7425 return -ENOMEM; 7426 } 7427 7428 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7429 bdev->internal.measured_queue_depth = UINT64_MAX; 7430 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7431 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7432 bdev->internal.qd_poller = NULL; 7433 bdev->internal.qos = NULL; 7434 7435 TAILQ_INIT(&bdev->internal.open_descs); 7436 TAILQ_INIT(&bdev->internal.locked_ranges); 7437 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7438 TAILQ_INIT(&bdev->aliases); 7439 7440 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7441 if (ret != 0) { 7442 bdev_free_io_stat(bdev->internal.stat); 7443 free(bdev_name); 7444 return ret; 7445 } 7446 7447 /* UUID may be specified by the user or defined by bdev itself. 7448 * Otherwise it will be generated here, so this field will never be empty. 
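* A lower-case string form of the UUID is also registered as an alias below, unless it happens to be identical to the bdev name.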
*/ 7449 if (spdk_uuid_is_null(&bdev->uuid)) { 7450 spdk_uuid_generate(&bdev->uuid); 7451 } 7452 7453 /* Add the UUID alias only if it's different than the name */ 7454 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7455 if (strcmp(bdev->name, uuid) != 0) { 7456 ret = spdk_bdev_alias_add(bdev, uuid); 7457 if (ret != 0) { 7458 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7459 bdev_name_del(&bdev->internal.bdev_name); 7460 bdev_free_io_stat(bdev->internal.stat); 7461 free(bdev_name); 7462 return ret; 7463 } 7464 } 7465 7466 if (spdk_bdev_get_buf_align(bdev) > 1) { 7467 if (bdev->split_on_optimal_io_boundary) { 7468 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 7469 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 7470 } else { 7471 bdev->split_on_optimal_io_boundary = true; 7472 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 7473 } 7474 } 7475 7476 /* If the user didn't specify a write unit size, set it to one. */ 7477 if (bdev->write_unit_size == 0) { 7478 bdev->write_unit_size = 1; 7479 } 7480 7481 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7482 if (bdev->acwu == 0) { 7483 bdev->acwu = bdev->write_unit_size; 7484 } 7485 7486 if (bdev->phys_blocklen == 0) { 7487 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7488 } 7489 7490 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7491 spdk_iobuf_get_opts(&iobuf_opts); 7492 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7493 } 7494 7495 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7496 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7497 } 7498 7499 bdev->internal.reset_in_progress = NULL; 7500 bdev->internal.qd_poll_in_progress = false; 7501 bdev->internal.period = 0; 7502 bdev->internal.new_period = 0; 7503 7504 spdk_io_device_register(__bdev_to_io_dev(bdev), 7505 bdev_channel_create, bdev_channel_destroy, 7506 sizeof(struct spdk_bdev_channel), 7507 bdev_name); 7508 7509 free(bdev_name); 7510 7511 spdk_spin_init(&bdev->internal.spinlock); 7512 7513 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7514 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7515 7516 return 0; 7517 } 7518 7519 static void 7520 bdev_destroy_cb(void *io_device) 7521 { 7522 int rc; 7523 struct spdk_bdev *bdev; 7524 spdk_bdev_unregister_cb cb_fn; 7525 void *cb_arg; 7526 7527 bdev = __bdev_from_io_dev(io_device); 7528 7529 if (bdev->internal.unregister_td != spdk_get_thread()) { 7530 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7531 return; 7532 } 7533 7534 cb_fn = bdev->internal.unregister_cb; 7535 cb_arg = bdev->internal.unregister_ctx; 7536 7537 spdk_spin_destroy(&bdev->internal.spinlock); 7538 free(bdev->internal.qos); 7539 bdev_free_io_stat(bdev->internal.stat); 7540 7541 rc = bdev->fn_table->destruct(bdev->ctxt); 7542 if (rc < 0) { 7543 SPDK_ERRLOG("destruct failed\n"); 7544 } 7545 if (rc <= 0 && cb_fn != NULL) { 7546 cb_fn(cb_arg, rc); 7547 } 7548 } 7549 7550 void 7551 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7552 { 7553 if (bdev->internal.unregister_cb != NULL) { 7554 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7555 } 7556 } 7557 7558 static void 7559 _remove_notify(void *arg) 7560 { 7561 struct spdk_bdev_desc *desc = arg; 7562 7563 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7564 } 7565 7566 /* returns: 0 - bdev removed and 
ready to be destructed. 7567 * -EBUSY - bdev can't be destructed yet. */ 7568 static int 7569 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7570 { 7571 struct spdk_bdev_desc *desc, *tmp; 7572 int rc = 0; 7573 char uuid[SPDK_UUID_STRING_LEN]; 7574 7575 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7576 assert(spdk_spin_held(&bdev->internal.spinlock)); 7577 7578 /* Notify each descriptor about hotremoval */ 7579 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7580 rc = -EBUSY; 7581 /* 7582 * Defer invocation of the event_cb to a separate message that will 7583 * run later on its thread. This ensures this context unwinds and 7584 * we don't recursively unregister this bdev again if the event_cb 7585 * immediately closes its descriptor. 7586 */ 7587 event_notify(desc, _remove_notify); 7588 } 7589 7590 /* If there are no descriptors, proceed removing the bdev */ 7591 if (rc == 0) { 7592 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7593 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7594 7595 /* Delete the name and the UUID alias */ 7596 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7597 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7598 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7599 7600 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7601 7602 if (bdev->internal.reset_in_progress != NULL) { 7603 /* If reset is in progress, let the completion callback for reset 7604 * unregister the bdev. 7605 */ 7606 rc = -EBUSY; 7607 } 7608 } 7609 7610 return rc; 7611 } 7612 7613 static void 7614 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7615 struct spdk_io_channel *io_ch, void *_ctx) 7616 { 7617 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7618 7619 bdev_channel_abort_queued_ios(bdev_ch); 7620 spdk_bdev_for_each_channel_continue(i, 0); 7621 } 7622 7623 static void 7624 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7625 { 7626 int rc; 7627 7628 spdk_spin_lock(&g_bdev_mgr.spinlock); 7629 spdk_spin_lock(&bdev->internal.spinlock); 7630 /* 7631 * Set the status to REMOVING after completing to abort channels. Otherwise, 7632 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7633 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7634 * may fail. 7635 */ 7636 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7637 rc = bdev_unregister_unsafe(bdev); 7638 spdk_spin_unlock(&bdev->internal.spinlock); 7639 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7640 7641 if (rc == 0) { 7642 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7643 } 7644 } 7645 7646 void 7647 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7648 { 7649 struct spdk_thread *thread; 7650 7651 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7652 7653 thread = spdk_get_thread(); 7654 if (!thread) { 7655 /* The user called this from a non-SPDK thread. 
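* Unregistration needs an SPDK thread context (it iterates the bdev's channels and sends thread messages), so fail with -ENOTSUP rather than proceeding.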
*/ 7656 if (cb_fn != NULL) { 7657 cb_fn(cb_arg, -ENOTSUP); 7658 } 7659 return; 7660 } 7661 7662 spdk_spin_lock(&g_bdev_mgr.spinlock); 7663 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7664 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7665 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7666 if (cb_fn) { 7667 cb_fn(cb_arg, -EBUSY); 7668 } 7669 return; 7670 } 7671 7672 spdk_spin_lock(&bdev->internal.spinlock); 7673 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7674 bdev->internal.unregister_cb = cb_fn; 7675 bdev->internal.unregister_ctx = cb_arg; 7676 bdev->internal.unregister_td = thread; 7677 spdk_spin_unlock(&bdev->internal.spinlock); 7678 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7679 7680 spdk_bdev_set_qd_sampling_period(bdev, 0); 7681 7682 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7683 bdev_unregister); 7684 } 7685 7686 int 7687 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7688 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7689 { 7690 struct spdk_bdev_desc *desc; 7691 struct spdk_bdev *bdev; 7692 int rc; 7693 7694 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7695 if (rc != 0) { 7696 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7697 return rc; 7698 } 7699 7700 bdev = spdk_bdev_desc_get_bdev(desc); 7701 7702 if (bdev->module != module) { 7703 spdk_bdev_close(desc); 7704 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7705 bdev_name); 7706 return -ENODEV; 7707 } 7708 7709 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7710 7711 spdk_bdev_close(desc); 7712 7713 return 0; 7714 } 7715 7716 static int 7717 bdev_start_qos(struct spdk_bdev *bdev) 7718 { 7719 struct set_qos_limit_ctx *ctx; 7720 7721 /* Enable QoS */ 7722 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7723 ctx = calloc(1, sizeof(*ctx)); 7724 if (ctx == NULL) { 7725 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7726 return -ENOMEM; 7727 } 7728 ctx->bdev = bdev; 7729 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7730 } 7731 7732 return 0; 7733 } 7734 7735 static void 7736 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7737 struct spdk_bdev *bdev) 7738 { 7739 enum spdk_bdev_claim_type type; 7740 const char *typename, *modname; 7741 extern struct spdk_log_flag SPDK_LOG_bdev; 7742 7743 assert(spdk_spin_held(&bdev->internal.spinlock)); 7744 7745 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7746 return; 7747 } 7748 7749 type = bdev->internal.claim_type; 7750 typename = spdk_bdev_claim_get_name(type); 7751 7752 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7753 modname = bdev->internal.claim.v1.module->name; 7754 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7755 bdev->name, detail, typename, modname); 7756 return; 7757 } 7758 7759 if (claim_type_is_v2(type)) { 7760 struct spdk_bdev_module_claim *claim; 7761 7762 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7763 modname = claim->module->name; 7764 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7765 bdev->name, detail, typename, modname); 7766 } 7767 return; 7768 } 7769 7770 assert(false); 7771 } 7772 7773 static int 7774 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7775 { 7776 struct spdk_thread *thread; 7777 int rc = 0; 7778 7779 thread = spdk_get_thread(); 7780 if (!thread) { 7781 
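/* A descriptor is bound to the SPDK thread that opens it: desc->thread is
 * recorded below and spdk_bdev_close() asserts it runs on that same thread,
 * so opening from a non-SPDK thread is not supported.
 */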
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7782 return -ENOTSUP; 7783 } 7784 7785 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7786 spdk_get_thread()); 7787 7788 desc->bdev = bdev; 7789 desc->thread = thread; 7790 desc->write = write; 7791 7792 spdk_spin_lock(&bdev->internal.spinlock); 7793 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7794 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7795 spdk_spin_unlock(&bdev->internal.spinlock); 7796 return -ENODEV; 7797 } 7798 7799 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7800 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7801 spdk_spin_unlock(&bdev->internal.spinlock); 7802 return -EPERM; 7803 } 7804 7805 rc = bdev_start_qos(bdev); 7806 if (rc != 0) { 7807 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7808 spdk_spin_unlock(&bdev->internal.spinlock); 7809 return rc; 7810 } 7811 7812 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7813 7814 spdk_spin_unlock(&bdev->internal.spinlock); 7815 7816 return 0; 7817 } 7818 7819 static int 7820 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7821 struct spdk_bdev_desc **_desc) 7822 { 7823 struct spdk_bdev_desc *desc; 7824 unsigned int i; 7825 7826 desc = calloc(1, sizeof(*desc)); 7827 if (desc == NULL) { 7828 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7829 return -ENOMEM; 7830 } 7831 7832 TAILQ_INIT(&desc->pending_media_events); 7833 TAILQ_INIT(&desc->free_media_events); 7834 7835 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7836 desc->callback.event_fn = event_cb; 7837 desc->callback.ctx = event_ctx; 7838 spdk_spin_init(&desc->spinlock); 7839 7840 if (bdev->media_events) { 7841 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7842 sizeof(*desc->media_events_buffer)); 7843 if (desc->media_events_buffer == NULL) { 7844 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7845 bdev_desc_free(desc); 7846 return -ENOMEM; 7847 } 7848 7849 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 7850 TAILQ_INSERT_TAIL(&desc->free_media_events, 7851 &desc->media_events_buffer[i], tailq); 7852 } 7853 } 7854 7855 if (bdev->fn_table->accel_sequence_supported != NULL) { 7856 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7857 desc->accel_sequence_supported[i] = 7858 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7859 (enum spdk_bdev_io_type)i); 7860 } 7861 } 7862 7863 *_desc = desc; 7864 7865 return 0; 7866 } 7867 7868 static int 7869 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7870 void *event_ctx, struct spdk_bdev_desc **_desc) 7871 { 7872 struct spdk_bdev_desc *desc; 7873 struct spdk_bdev *bdev; 7874 int rc; 7875 7876 bdev = bdev_get_by_name(bdev_name); 7877 7878 if (bdev == NULL) { 7879 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7880 return -ENODEV; 7881 } 7882 7883 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7884 if (rc != 0) { 7885 return rc; 7886 } 7887 7888 rc = bdev_open(bdev, write, desc); 7889 if (rc != 0) { 7890 bdev_desc_free(desc); 7891 desc = NULL; 7892 } 7893 7894 *_desc = desc; 7895 7896 return rc; 7897 } 7898 7899 int 7900 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7901 void *event_ctx, struct spdk_bdev_desc **_desc) 7902 { 7903 int rc; 7904 7905 if (event_cb == NULL) { 7906 SPDK_ERRLOG("Missing event callback function\n"); 7907 return 
-EINVAL; 7908 } 7909 7910 spdk_spin_lock(&g_bdev_mgr.spinlock); 7911 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 7912 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7913 7914 return rc; 7915 } 7916 7917 struct spdk_bdev_open_async_ctx { 7918 char *bdev_name; 7919 spdk_bdev_event_cb_t event_cb; 7920 void *event_ctx; 7921 bool write; 7922 int rc; 7923 spdk_bdev_open_async_cb_t cb_fn; 7924 void *cb_arg; 7925 struct spdk_bdev_desc *desc; 7926 struct spdk_bdev_open_async_opts opts; 7927 uint64_t start_ticks; 7928 struct spdk_thread *orig_thread; 7929 struct spdk_poller *poller; 7930 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 7931 }; 7932 7933 static void 7934 bdev_open_async_done(void *arg) 7935 { 7936 struct spdk_bdev_open_async_ctx *ctx = arg; 7937 7938 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 7939 7940 free(ctx->bdev_name); 7941 free(ctx); 7942 } 7943 7944 static void 7945 bdev_open_async_cancel(void *arg) 7946 { 7947 struct spdk_bdev_open_async_ctx *ctx = arg; 7948 7949 assert(ctx->rc == -ESHUTDOWN); 7950 7951 spdk_poller_unregister(&ctx->poller); 7952 7953 bdev_open_async_done(ctx); 7954 } 7955 7956 /* This is called when the bdev library finishes at shutdown. */ 7957 static void 7958 bdev_open_async_fini(void) 7959 { 7960 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 7961 7962 spdk_spin_lock(&g_bdev_mgr.spinlock); 7963 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 7964 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 7965 /* 7966 * We have to move to ctx->orig_thread to unregister ctx->poller. 7967 * However, there is a chance that ctx->poller is executed before 7968 * message is executed, which could result in bdev_open_async_done() 7969 * being called twice. To avoid such race condition, set ctx->rc to 7970 * -ESHUTDOWN. 7971 */ 7972 ctx->rc = -ESHUTDOWN; 7973 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 7974 } 7975 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7976 } 7977 7978 static int bdev_open_async(void *arg); 7979 7980 static void 7981 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 7982 { 7983 uint64_t timeout_ticks; 7984 7985 if (ctx->rc == -ESHUTDOWN) { 7986 /* This context is being canceled. Do nothing. */ 7987 return; 7988 } 7989 7990 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 7991 &ctx->desc); 7992 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 7993 goto exit; 7994 } 7995 7996 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 7997 if (spdk_get_ticks() >= timeout_ticks) { 7998 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 7999 ctx->rc = -ETIMEDOUT; 8000 goto exit; 8001 } 8002 8003 return; 8004 8005 exit: 8006 spdk_poller_unregister(&ctx->poller); 8007 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8008 8009 /* Completion callback is processed after stack unwinding. 
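* It is sent to ctx->orig_thread, i.e. the thread that called spdk_bdev_open_async(), so the user callback runs in the caller's context rather than inside the poller.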
*/ 8010 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8011 } 8012 8013 static int 8014 bdev_open_async(void *arg) 8015 { 8016 struct spdk_bdev_open_async_ctx *ctx = arg; 8017 8018 spdk_spin_lock(&g_bdev_mgr.spinlock); 8019 8020 _bdev_open_async(ctx); 8021 8022 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8023 8024 return SPDK_POLLER_BUSY; 8025 } 8026 8027 static void 8028 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8029 struct spdk_bdev_open_async_opts *opts_src, 8030 size_t size) 8031 { 8032 assert(opts); 8033 assert(opts_src); 8034 8035 opts->size = size; 8036 8037 #define SET_FIELD(field) \ 8038 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8039 opts->field = opts_src->field; \ 8040 } \ 8041 8042 SET_FIELD(timeout_ms); 8043 8044 /* Do not remove this statement, you should always update this statement when you adding a new field, 8045 * and do not forget to add the SET_FIELD statement for your added field. */ 8046 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8047 8048 #undef SET_FIELD 8049 } 8050 8051 static void 8052 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8053 { 8054 assert(opts); 8055 8056 opts->size = size; 8057 8058 #define SET_FIELD(field, value) \ 8059 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8060 opts->field = value; \ 8061 } \ 8062 8063 SET_FIELD(timeout_ms, 0); 8064 8065 #undef SET_FIELD 8066 } 8067 8068 int 8069 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8070 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8071 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8072 { 8073 struct spdk_bdev_open_async_ctx *ctx; 8074 8075 if (event_cb == NULL) { 8076 SPDK_ERRLOG("Missing event callback function\n"); 8077 return -EINVAL; 8078 } 8079 8080 if (open_cb == NULL) { 8081 SPDK_ERRLOG("Missing open callback function\n"); 8082 return -EINVAL; 8083 } 8084 8085 if (opts != NULL && opts->size == 0) { 8086 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8087 return -EINVAL; 8088 } 8089 8090 ctx = calloc(1, sizeof(*ctx)); 8091 if (ctx == NULL) { 8092 SPDK_ERRLOG("Failed to allocate open context\n"); 8093 return -ENOMEM; 8094 } 8095 8096 ctx->bdev_name = strdup(bdev_name); 8097 if (ctx->bdev_name == NULL) { 8098 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8099 free(ctx); 8100 return -ENOMEM; 8101 } 8102 8103 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8104 if (ctx->poller == NULL) { 8105 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8106 free(ctx->bdev_name); 8107 free(ctx); 8108 return -ENOMEM; 8109 } 8110 8111 ctx->cb_fn = open_cb; 8112 ctx->cb_arg = open_cb_arg; 8113 ctx->write = write; 8114 ctx->event_cb = event_cb; 8115 ctx->event_ctx = event_ctx; 8116 ctx->orig_thread = spdk_get_thread(); 8117 ctx->start_ticks = spdk_get_ticks(); 8118 8119 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8120 if (opts != NULL) { 8121 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8122 } 8123 8124 spdk_spin_lock(&g_bdev_mgr.spinlock); 8125 8126 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8127 _bdev_open_async(ctx); 8128 8129 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8130 8131 return 0; 8132 } 8133 8134 static void 8135 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8136 { 8137 int rc; 8138 8139 
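/* Callers (spdk_bdev_close(), bdev_register_finished() and the for-each-bdev
 * iterators) already hold g_bdev_mgr.spinlock here, which bdev_unregister_unsafe()
 * below relies on. This function additionally takes bdev->internal.spinlock and
 * then desc->spinlock.
 */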
spdk_spin_lock(&bdev->internal.spinlock); 8140 spdk_spin_lock(&desc->spinlock); 8141 8142 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8143 8144 desc->closed = true; 8145 8146 if (desc->claim != NULL) { 8147 bdev_desc_release_claims(desc); 8148 } 8149 8150 if (0 == desc->refs) { 8151 spdk_spin_unlock(&desc->spinlock); 8152 bdev_desc_free(desc); 8153 } else { 8154 spdk_spin_unlock(&desc->spinlock); 8155 } 8156 8157 /* If no more descriptors, kill QoS channel */ 8158 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8159 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8160 bdev->name, spdk_get_thread()); 8161 8162 if (bdev_qos_destroy(bdev)) { 8163 /* There isn't anything we can do to recover here. Just let the 8164 * old QoS poller keep running. The QoS handling won't change 8165 * cores when the user allocates a new channel, but it won't break. */ 8166 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8167 } 8168 } 8169 8170 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8171 rc = bdev_unregister_unsafe(bdev); 8172 spdk_spin_unlock(&bdev->internal.spinlock); 8173 8174 if (rc == 0) { 8175 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8176 } 8177 } else { 8178 spdk_spin_unlock(&bdev->internal.spinlock); 8179 } 8180 } 8181 8182 void 8183 spdk_bdev_close(struct spdk_bdev_desc *desc) 8184 { 8185 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8186 8187 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8188 spdk_get_thread()); 8189 8190 assert(desc->thread == spdk_get_thread()); 8191 8192 spdk_poller_unregister(&desc->io_timeout_poller); 8193 8194 spdk_spin_lock(&g_bdev_mgr.spinlock); 8195 8196 bdev_close(bdev, desc); 8197 8198 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8199 } 8200 8201 static void 8202 bdev_register_finished(void *arg) 8203 { 8204 struct spdk_bdev_desc *desc = arg; 8205 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8206 8207 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8208 8209 spdk_spin_lock(&g_bdev_mgr.spinlock); 8210 8211 bdev_close(bdev, desc); 8212 8213 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8214 } 8215 8216 int 8217 spdk_bdev_register(struct spdk_bdev *bdev) 8218 { 8219 struct spdk_bdev_desc *desc; 8220 struct spdk_thread *thread = spdk_get_thread(); 8221 int rc; 8222 8223 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8224 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 8225 thread ? 
spdk_thread_get_name(thread) : "null"); 8226 return -EINVAL; 8227 } 8228 8229 rc = bdev_register(bdev); 8230 if (rc != 0) { 8231 return rc; 8232 } 8233 8234 /* A descriptor is opened to prevent bdev deletion during examination */ 8235 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8236 if (rc != 0) { 8237 spdk_bdev_unregister(bdev, NULL, NULL); 8238 return rc; 8239 } 8240 8241 rc = bdev_open(bdev, false, desc); 8242 if (rc != 0) { 8243 bdev_desc_free(desc); 8244 spdk_bdev_unregister(bdev, NULL, NULL); 8245 return rc; 8246 } 8247 8248 /* Examine configuration before initializing I/O */ 8249 bdev_examine(bdev); 8250 8251 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8252 if (rc != 0) { 8253 bdev_close(bdev, desc); 8254 spdk_bdev_unregister(bdev, NULL, NULL); 8255 } 8256 8257 return rc; 8258 } 8259 8260 int 8261 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8262 struct spdk_bdev_module *module) 8263 { 8264 spdk_spin_lock(&bdev->internal.spinlock); 8265 8266 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8267 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8268 spdk_spin_unlock(&bdev->internal.spinlock); 8269 return -EPERM; 8270 } 8271 8272 if (desc && !desc->write) { 8273 desc->write = true; 8274 } 8275 8276 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8277 bdev->internal.claim.v1.module = module; 8278 8279 spdk_spin_unlock(&bdev->internal.spinlock); 8280 return 0; 8281 } 8282 8283 void 8284 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8285 { 8286 spdk_spin_lock(&bdev->internal.spinlock); 8287 8288 assert(bdev->internal.claim.v1.module != NULL); 8289 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8290 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8291 bdev->internal.claim.v1.module = NULL; 8292 8293 spdk_spin_unlock(&bdev->internal.spinlock); 8294 } 8295 8296 /* 8297 * Start claims v2 8298 */ 8299 8300 const char * 8301 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8302 { 8303 switch (type) { 8304 case SPDK_BDEV_CLAIM_NONE: 8305 return "not_claimed"; 8306 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8307 return "exclusive_write"; 8308 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8309 return "read_many_write_one"; 8310 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8311 return "read_many_write_none"; 8312 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8313 return "read_many_write_many"; 8314 default: 8315 break; 8316 } 8317 return "invalid_claim"; 8318 } 8319 8320 static bool 8321 claim_type_is_v2(enum spdk_bdev_claim_type type) 8322 { 8323 switch (type) { 8324 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8325 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8326 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8327 return true; 8328 default: 8329 break; 8330 } 8331 return false; 8332 } 8333 8334 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
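* Only claim types that grant the claiming descriptor write access (read_many_write_one and read_many_write_shared) promote it; read_many_write_none never does.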
*/ 8335 static bool 8336 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8337 { 8338 switch (type) { 8339 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8340 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8341 return true; 8342 default: 8343 break; 8344 } 8345 return false; 8346 } 8347 8348 void 8349 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8350 { 8351 if (opts == NULL) { 8352 SPDK_ERRLOG("opts should not be NULL\n"); 8353 assert(opts != NULL); 8354 return; 8355 } 8356 if (size == 0) { 8357 SPDK_ERRLOG("size should not be zero\n"); 8358 assert(size != 0); 8359 return; 8360 } 8361 8362 memset(opts, 0, size); 8363 opts->opts_size = size; 8364 8365 #define FIELD_OK(field) \ 8366 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8367 8368 #define SET_FIELD(field, value) \ 8369 if (FIELD_OK(field)) { \ 8370 opts->field = value; \ 8371 } \ 8372 8373 SET_FIELD(shared_claim_key, 0); 8374 8375 #undef FIELD_OK 8376 #undef SET_FIELD 8377 } 8378 8379 static int 8380 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8381 { 8382 if (src->opts_size == 0) { 8383 SPDK_ERRLOG("size should not be zero\n"); 8384 return -1; 8385 } 8386 8387 memset(dst, 0, sizeof(*dst)); 8388 dst->opts_size = src->opts_size; 8389 8390 #define FIELD_OK(field) \ 8391 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8392 8393 #define SET_FIELD(field) \ 8394 if (FIELD_OK(field)) { \ 8395 dst->field = src->field; \ 8396 } \ 8397 8398 if (FIELD_OK(name)) { 8399 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8400 } 8401 8402 SET_FIELD(shared_claim_key); 8403 8404 /* You should not remove this statement, but need to update the assert statement 8405 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8406 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8407 8408 #undef FIELD_OK 8409 #undef SET_FIELD 8410 return 0; 8411 } 8412 8413 /* Returns 0 if a read-write-once claim can be taken. */ 8414 static int 8415 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8416 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8417 { 8418 struct spdk_bdev *bdev = desc->bdev; 8419 struct spdk_bdev_desc *open_desc; 8420 8421 assert(spdk_spin_held(&bdev->internal.spinlock)); 8422 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8423 8424 if (opts->shared_claim_key != 0) { 8425 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8426 bdev->name); 8427 return -EINVAL; 8428 } 8429 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8430 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8431 return -EPERM; 8432 } 8433 if (desc->claim != NULL) { 8434 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8435 bdev->name, desc->claim->module->name); 8436 return -EPERM; 8437 } 8438 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8439 if (desc != open_desc && open_desc->write) { 8440 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8441 "another descriptor is open for writing\n", 8442 bdev->name); 8443 return -EPERM; 8444 } 8445 } 8446 8447 return 0; 8448 } 8449 8450 /* Returns 0 if a read-only-many claim can be taken. 
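* The claim is refused if this descriptor was opened for writing, if a shared_claim_key was supplied, or if the bdev is still unclaimed while another descriptor is open for writing.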
*/ 8451 static int 8452 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8453 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8454 { 8455 struct spdk_bdev *bdev = desc->bdev; 8456 struct spdk_bdev_desc *open_desc; 8457 8458 assert(spdk_spin_held(&bdev->internal.spinlock)); 8459 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 8460 assert(desc->claim == NULL); 8461 8462 if (desc->write) { 8463 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 8464 bdev->name); 8465 return -EINVAL; 8466 } 8467 if (opts->shared_claim_key != 0) { 8468 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 8469 return -EINVAL; 8470 } 8471 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8472 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8473 if (open_desc->write) { 8474 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 8475 "another descriptor is open for writing\n", 8476 bdev->name); 8477 return -EPERM; 8478 } 8479 } 8480 } 8481 8482 return 0; 8483 } 8484 8485 /* Returns 0 if a read-write-many claim can be taken. */ 8486 static int 8487 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8488 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8489 { 8490 struct spdk_bdev *bdev = desc->bdev; 8491 struct spdk_bdev_desc *open_desc; 8492 8493 assert(spdk_spin_held(&bdev->internal.spinlock)); 8494 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 8495 assert(desc->claim == NULL); 8496 8497 if (opts->shared_claim_key == 0) { 8498 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 8499 bdev->name); 8500 return -EINVAL; 8501 } 8502 switch (bdev->internal.claim_type) { 8503 case SPDK_BDEV_CLAIM_NONE: 8504 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8505 if (open_desc == desc) { 8506 continue; 8507 } 8508 if (open_desc->write) { 8509 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 8510 "another descriptor is open for writing without a " 8511 "claim\n", bdev->name); 8512 return -EPERM; 8513 } 8514 } 8515 break; 8516 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8517 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 8518 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 8519 return -EPERM; 8520 } 8521 break; 8522 default: 8523 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8524 return -EBUSY; 8525 } 8526 8527 return 0; 8528 } 8529 8530 /* Updates desc and its bdev with a v2 claim.
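* A claim record is allocated and appended to bdev->internal.claim.v2.claims; the first v2 claim latches the bdev's claim_type and shared key, and the descriptor is promoted to writable when the claim type calls for it.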
*/ 8531 static int 8532 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8533 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8534 { 8535 struct spdk_bdev *bdev = desc->bdev; 8536 struct spdk_bdev_module_claim *claim; 8537 8538 assert(spdk_spin_held(&bdev->internal.spinlock)); 8539 assert(claim_type_is_v2(type)); 8540 assert(desc->claim == NULL); 8541 8542 claim = calloc(1, sizeof(*desc->claim)); 8543 if (claim == NULL) { 8544 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8545 return -ENOMEM; 8546 } 8547 claim->module = module; 8548 claim->desc = desc; 8549 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8550 memcpy(claim->name, opts->name, sizeof(claim->name)); 8551 desc->claim = claim; 8552 8553 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8554 bdev->internal.claim_type = type; 8555 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8556 bdev->internal.claim.v2.key = opts->shared_claim_key; 8557 } 8558 assert(type == bdev->internal.claim_type); 8559 8560 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8561 8562 if (!desc->write && claim_type_promotes_to_write(type)) { 8563 desc->write = true; 8564 } 8565 8566 return 0; 8567 } 8568 8569 int 8570 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8571 struct spdk_bdev_claim_opts *_opts, 8572 struct spdk_bdev_module *module) 8573 { 8574 struct spdk_bdev *bdev; 8575 struct spdk_bdev_claim_opts opts; 8576 int rc = 0; 8577 8578 if (desc == NULL) { 8579 SPDK_ERRLOG("descriptor must not be NULL\n"); 8580 return -EINVAL; 8581 } 8582 8583 bdev = desc->bdev; 8584 8585 if (_opts == NULL) { 8586 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8587 } else if (claim_opts_copy(_opts, &opts) != 0) { 8588 return -EINVAL; 8589 } 8590 8591 spdk_spin_lock(&bdev->internal.spinlock); 8592 8593 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8594 bdev->internal.claim_type != type) { 8595 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8596 spdk_spin_unlock(&bdev->internal.spinlock); 8597 return -EPERM; 8598 } 8599 8600 if (claim_type_is_v2(type) && desc->claim != NULL) { 8601 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8602 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8603 spdk_spin_unlock(&bdev->internal.spinlock); 8604 return -EPERM; 8605 } 8606 8607 switch (type) { 8608 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8609 spdk_spin_unlock(&bdev->internal.spinlock); 8610 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8611 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8612 rc = claim_verify_rwo(desc, type, &opts, module); 8613 break; 8614 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8615 rc = claim_verify_rom(desc, type, &opts, module); 8616 break; 8617 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8618 rc = claim_verify_rwm(desc, type, &opts, module); 8619 break; 8620 default: 8621 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8622 rc = -ENOTSUP; 8623 } 8624 8625 if (rc == 0) { 8626 rc = claim_bdev(desc, type, &opts, module); 8627 } 8628 8629 spdk_spin_unlock(&bdev->internal.spinlock); 8630 return rc; 8631 } 8632 8633 static void 8634 claim_reset(struct spdk_bdev *bdev) 8635 { 8636 assert(spdk_spin_held(&bdev->internal.spinlock)); 8637 assert(claim_type_is_v2(bdev->internal.claim_type)); 8638 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8639 8640 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8641 
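/* With the v2 claim list already empty (see the asserts above), clear the
 * claim union and, below, return claim_type to SPDK_BDEV_CLAIM_NONE so a
 * different claim type may be taken later.
 */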
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8642 } 8643 8644 static void 8645 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8646 { 8647 struct spdk_bdev *bdev = desc->bdev; 8648 8649 assert(spdk_spin_held(&bdev->internal.spinlock)); 8650 assert(claim_type_is_v2(bdev->internal.claim_type)); 8651 8652 if (bdev->internal.examine_in_progress == 0) { 8653 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8654 free(desc->claim); 8655 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8656 claim_reset(bdev); 8657 } 8658 } else { 8659 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8660 desc->claim->module = NULL; 8661 desc->claim->desc = NULL; 8662 } 8663 desc->claim = NULL; 8664 } 8665 8666 /* 8667 * End claims v2 8668 */ 8669 8670 struct spdk_bdev * 8671 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8672 { 8673 assert(desc != NULL); 8674 return desc->bdev; 8675 } 8676 8677 int 8678 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8679 { 8680 struct spdk_bdev *bdev, *tmp; 8681 struct spdk_bdev_desc *desc; 8682 int rc = 0; 8683 8684 assert(fn != NULL); 8685 8686 spdk_spin_lock(&g_bdev_mgr.spinlock); 8687 bdev = spdk_bdev_first(); 8688 while (bdev != NULL) { 8689 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8690 if (rc != 0) { 8691 break; 8692 } 8693 rc = bdev_open(bdev, false, desc); 8694 if (rc != 0) { 8695 bdev_desc_free(desc); 8696 if (rc == -ENODEV) { 8697 /* Ignore the error and move to the next bdev. */ 8698 rc = 0; 8699 bdev = spdk_bdev_next(bdev); 8700 continue; 8701 } 8702 break; 8703 } 8704 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8705 8706 rc = fn(ctx, bdev); 8707 8708 spdk_spin_lock(&g_bdev_mgr.spinlock); 8709 tmp = spdk_bdev_next(bdev); 8710 bdev_close(bdev, desc); 8711 if (rc != 0) { 8712 break; 8713 } 8714 bdev = tmp; 8715 } 8716 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8717 8718 return rc; 8719 } 8720 8721 int 8722 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8723 { 8724 struct spdk_bdev *bdev, *tmp; 8725 struct spdk_bdev_desc *desc; 8726 int rc = 0; 8727 8728 assert(fn != NULL); 8729 8730 spdk_spin_lock(&g_bdev_mgr.spinlock); 8731 bdev = spdk_bdev_first_leaf(); 8732 while (bdev != NULL) { 8733 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8734 if (rc != 0) { 8735 break; 8736 } 8737 rc = bdev_open(bdev, false, desc); 8738 if (rc != 0) { 8739 bdev_desc_free(desc); 8740 if (rc == -ENODEV) { 8741 /* Ignore the error and move to the next bdev. 
*/ 8742 rc = 0; 8743 bdev = spdk_bdev_next_leaf(bdev); 8744 continue; 8745 } 8746 break; 8747 } 8748 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8749 8750 rc = fn(ctx, bdev); 8751 8752 spdk_spin_lock(&g_bdev_mgr.spinlock); 8753 tmp = spdk_bdev_next_leaf(bdev); 8754 bdev_close(bdev, desc); 8755 if (rc != 0) { 8756 break; 8757 } 8758 bdev = tmp; 8759 } 8760 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8761 8762 return rc; 8763 } 8764 8765 void 8766 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8767 { 8768 struct iovec *iovs; 8769 int iovcnt; 8770 8771 if (bdev_io == NULL) { 8772 return; 8773 } 8774 8775 switch (bdev_io->type) { 8776 case SPDK_BDEV_IO_TYPE_READ: 8777 case SPDK_BDEV_IO_TYPE_WRITE: 8778 case SPDK_BDEV_IO_TYPE_ZCOPY: 8779 iovs = bdev_io->u.bdev.iovs; 8780 iovcnt = bdev_io->u.bdev.iovcnt; 8781 break; 8782 default: 8783 iovs = NULL; 8784 iovcnt = 0; 8785 break; 8786 } 8787 8788 if (iovp) { 8789 *iovp = iovs; 8790 } 8791 if (iovcntp) { 8792 *iovcntp = iovcnt; 8793 } 8794 } 8795 8796 void * 8797 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8798 { 8799 if (bdev_io == NULL) { 8800 return NULL; 8801 } 8802 8803 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8804 return NULL; 8805 } 8806 8807 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8808 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8809 return bdev_io->u.bdev.md_buf; 8810 } 8811 8812 return NULL; 8813 } 8814 8815 void * 8816 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 8817 { 8818 if (bdev_io == NULL) { 8819 assert(false); 8820 return NULL; 8821 } 8822 8823 return bdev_io->internal.caller_ctx; 8824 } 8825 8826 void 8827 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 8828 { 8829 8830 if (spdk_bdev_module_list_find(bdev_module->name)) { 8831 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 8832 assert(false); 8833 } 8834 8835 spdk_spin_init(&bdev_module->internal.spinlock); 8836 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 8837 8838 /* 8839 * Modules with examine callbacks must be initialized first, so they are 8840 * ready to handle examine callbacks from later modules that will 8841 * register physical bdevs. 
8842 */ 8843 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8844 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8845 } else { 8846 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8847 } 8848 } 8849 8850 struct spdk_bdev_module * 8851 spdk_bdev_module_list_find(const char *name) 8852 { 8853 struct spdk_bdev_module *bdev_module; 8854 8855 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8856 if (strcmp(name, bdev_module->name) == 0) { 8857 break; 8858 } 8859 } 8860 8861 return bdev_module; 8862 } 8863 8864 static int 8865 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 8866 { 8867 uint64_t num_blocks; 8868 void *md_buf = NULL; 8869 8870 num_blocks = bdev_io->u.bdev.num_blocks; 8871 8872 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8873 md_buf = (char *)g_bdev_mgr.zero_buffer + 8874 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8875 } 8876 8877 return bdev_write_blocks_with_md(bdev_io->internal.desc, 8878 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8879 g_bdev_mgr.zero_buffer, md_buf, 8880 bdev_io->u.bdev.offset_blocks, num_blocks, 8881 bdev_write_zero_buffer_done, bdev_io); 8882 } 8883 8884 static void 8885 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8886 { 8887 struct spdk_bdev_io *parent_io = cb_arg; 8888 8889 spdk_bdev_free_io(bdev_io); 8890 8891 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 8892 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 8893 } 8894 8895 static void 8896 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8897 { 8898 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8899 ctx->bdev->internal.qos_mod_in_progress = false; 8900 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8901 8902 if (ctx->cb_fn) { 8903 ctx->cb_fn(ctx->cb_arg, status); 8904 } 8905 free(ctx); 8906 } 8907 8908 static void 8909 bdev_disable_qos_done(void *cb_arg) 8910 { 8911 struct set_qos_limit_ctx *ctx = cb_arg; 8912 struct spdk_bdev *bdev = ctx->bdev; 8913 struct spdk_bdev_io *bdev_io; 8914 struct spdk_bdev_qos *qos; 8915 8916 spdk_spin_lock(&bdev->internal.spinlock); 8917 qos = bdev->internal.qos; 8918 bdev->internal.qos = NULL; 8919 spdk_spin_unlock(&bdev->internal.spinlock); 8920 8921 while (!TAILQ_EMPTY(&qos->queued)) { 8922 /* Send queued I/O back to their original thread for resubmission. */ 8923 bdev_io = TAILQ_FIRST(&qos->queued); 8924 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8925 8926 if (bdev_io->internal.io_submit_ch) { 8927 /* 8928 * Channel was changed when sending it to the QoS thread - change it back 8929 * before sending it back to the original thread. 
8930 */ 8931 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8932 bdev_io->internal.io_submit_ch = NULL; 8933 } 8934 8935 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8936 _bdev_io_submit, bdev_io); 8937 } 8938 8939 if (qos->thread != NULL) { 8940 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8941 spdk_poller_unregister(&qos->poller); 8942 } 8943 8944 free(qos); 8945 8946 bdev_set_qos_limit_done(ctx, 0); 8947 } 8948 8949 static void 8950 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8951 { 8952 struct set_qos_limit_ctx *ctx = _ctx; 8953 struct spdk_thread *thread; 8954 8955 spdk_spin_lock(&bdev->internal.spinlock); 8956 thread = bdev->internal.qos->thread; 8957 spdk_spin_unlock(&bdev->internal.spinlock); 8958 8959 if (thread != NULL) { 8960 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8961 } else { 8962 bdev_disable_qos_done(ctx); 8963 } 8964 } 8965 8966 static void 8967 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8968 struct spdk_io_channel *ch, void *_ctx) 8969 { 8970 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8971 8972 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8973 8974 spdk_bdev_for_each_channel_continue(i, 0); 8975 } 8976 8977 static void 8978 bdev_update_qos_rate_limit_msg(void *cb_arg) 8979 { 8980 struct set_qos_limit_ctx *ctx = cb_arg; 8981 struct spdk_bdev *bdev = ctx->bdev; 8982 8983 spdk_spin_lock(&bdev->internal.spinlock); 8984 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 8985 spdk_spin_unlock(&bdev->internal.spinlock); 8986 8987 bdev_set_qos_limit_done(ctx, 0); 8988 } 8989 8990 static void 8991 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8992 struct spdk_io_channel *ch, void *_ctx) 8993 { 8994 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8995 8996 spdk_spin_lock(&bdev->internal.spinlock); 8997 bdev_enable_qos(bdev, bdev_ch); 8998 spdk_spin_unlock(&bdev->internal.spinlock); 8999 spdk_bdev_for_each_channel_continue(i, 0); 9000 } 9001 9002 static void 9003 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9004 { 9005 struct set_qos_limit_ctx *ctx = _ctx; 9006 9007 bdev_set_qos_limit_done(ctx, status); 9008 } 9009 9010 static void 9011 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9012 { 9013 int i; 9014 9015 assert(bdev->internal.qos != NULL); 9016 9017 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9018 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9019 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9020 9021 if (limits[i] == 0) { 9022 bdev->internal.qos->rate_limits[i].limit = 9023 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9024 } 9025 } 9026 } 9027 } 9028 9029 void 9030 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9031 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9032 { 9033 struct set_qos_limit_ctx *ctx; 9034 uint32_t limit_set_complement; 9035 uint64_t min_limit_per_sec; 9036 int i; 9037 bool disable_rate_limit = true; 9038 9039 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9040 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9041 continue; 9042 } 9043 9044 if (limits[i] > 0) { 9045 disable_rate_limit = false; 9046 } 9047 9048 if (bdev_qos_is_iops_rate_limit(i) == true) { 9049 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9050 } else { 9051 /* Change from megabyte to byte rate limit */ 9052 limits[i] = limits[i] * 1024 * 1024; 9053 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 
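/* Bandwidth limits arrive from the RPC layer in megabytes per second. After
 * this conversion the rounding below only affects IOPS limits: e.g. a request
 * of 1500 IOs/s is rounded up to 2000 because SPDK_BDEV_QOS_MIN_IOS_PER_SEC is
 * 1000, while a whole-megabyte byte limit is already a multiple of
 * SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1 MiB).
 */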
9054 } 9055 9056 limit_set_complement = limits[i] % min_limit_per_sec; 9057 if (limit_set_complement) { 9058 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9059 limits[i], min_limit_per_sec); 9060 limits[i] += min_limit_per_sec - limit_set_complement; 9061 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9062 } 9063 } 9064 9065 ctx = calloc(1, sizeof(*ctx)); 9066 if (ctx == NULL) { 9067 cb_fn(cb_arg, -ENOMEM); 9068 return; 9069 } 9070 9071 ctx->cb_fn = cb_fn; 9072 ctx->cb_arg = cb_arg; 9073 ctx->bdev = bdev; 9074 9075 spdk_spin_lock(&bdev->internal.spinlock); 9076 if (bdev->internal.qos_mod_in_progress) { 9077 spdk_spin_unlock(&bdev->internal.spinlock); 9078 free(ctx); 9079 cb_fn(cb_arg, -EAGAIN); 9080 return; 9081 } 9082 bdev->internal.qos_mod_in_progress = true; 9083 9084 if (disable_rate_limit == true && bdev->internal.qos) { 9085 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9086 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9087 (bdev->internal.qos->rate_limits[i].limit > 0 && 9088 bdev->internal.qos->rate_limits[i].limit != 9089 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9090 disable_rate_limit = false; 9091 break; 9092 } 9093 } 9094 } 9095 9096 if (disable_rate_limit == false) { 9097 if (bdev->internal.qos == NULL) { 9098 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9099 if (!bdev->internal.qos) { 9100 spdk_spin_unlock(&bdev->internal.spinlock); 9101 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9102 bdev_set_qos_limit_done(ctx, -ENOMEM); 9103 return; 9104 } 9105 } 9106 9107 if (bdev->internal.qos->thread == NULL) { 9108 /* Enabling */ 9109 bdev_set_qos_rate_limits(bdev, limits); 9110 9111 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9112 bdev_enable_qos_done); 9113 } else { 9114 /* Updating */ 9115 bdev_set_qos_rate_limits(bdev, limits); 9116 9117 spdk_thread_send_msg(bdev->internal.qos->thread, 9118 bdev_update_qos_rate_limit_msg, ctx); 9119 } 9120 } else { 9121 if (bdev->internal.qos != NULL) { 9122 bdev_set_qos_rate_limits(bdev, limits); 9123 9124 /* Disabling */ 9125 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9126 bdev_disable_qos_msg_done); 9127 } else { 9128 spdk_spin_unlock(&bdev->internal.spinlock); 9129 bdev_set_qos_limit_done(ctx, 0); 9130 return; 9131 } 9132 } 9133 9134 spdk_spin_unlock(&bdev->internal.spinlock); 9135 } 9136 9137 struct spdk_bdev_histogram_ctx { 9138 spdk_bdev_histogram_status_cb cb_fn; 9139 void *cb_arg; 9140 struct spdk_bdev *bdev; 9141 int status; 9142 }; 9143 9144 static void 9145 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9146 { 9147 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9148 9149 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9150 ctx->bdev->internal.histogram_in_progress = false; 9151 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9152 ctx->cb_fn(ctx->cb_arg, ctx->status); 9153 free(ctx); 9154 } 9155 9156 static void 9157 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9158 struct spdk_io_channel *_ch, void *_ctx) 9159 { 9160 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9161 9162 if (ch->histogram != NULL) { 9163 spdk_histogram_data_free(ch->histogram); 9164 ch->histogram = NULL; 9165 } 9166 spdk_bdev_for_each_channel_continue(i, 0); 9167 } 9168 9169 static void 9170 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9171 { 9172 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9173 9174 if (status 
!= 0) { 9175 ctx->status = status; 9176 ctx->bdev->internal.histogram_enabled = false; 9177 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9178 bdev_histogram_disable_channel_cb); 9179 } else { 9180 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9181 ctx->bdev->internal.histogram_in_progress = false; 9182 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9183 ctx->cb_fn(ctx->cb_arg, ctx->status); 9184 free(ctx); 9185 } 9186 } 9187 9188 static void 9189 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9190 struct spdk_io_channel *_ch, void *_ctx) 9191 { 9192 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9193 int status = 0; 9194 9195 if (ch->histogram == NULL) { 9196 ch->histogram = spdk_histogram_data_alloc(); 9197 if (ch->histogram == NULL) { 9198 status = -ENOMEM; 9199 } 9200 } 9201 9202 spdk_bdev_for_each_channel_continue(i, status); 9203 } 9204 9205 void 9206 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9207 void *cb_arg, bool enable) 9208 { 9209 struct spdk_bdev_histogram_ctx *ctx; 9210 9211 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9212 if (ctx == NULL) { 9213 cb_fn(cb_arg, -ENOMEM); 9214 return; 9215 } 9216 9217 ctx->bdev = bdev; 9218 ctx->status = 0; 9219 ctx->cb_fn = cb_fn; 9220 ctx->cb_arg = cb_arg; 9221 9222 spdk_spin_lock(&bdev->internal.spinlock); 9223 if (bdev->internal.histogram_in_progress) { 9224 spdk_spin_unlock(&bdev->internal.spinlock); 9225 free(ctx); 9226 cb_fn(cb_arg, -EAGAIN); 9227 return; 9228 } 9229 9230 bdev->internal.histogram_in_progress = true; 9231 spdk_spin_unlock(&bdev->internal.spinlock); 9232 9233 bdev->internal.histogram_enabled = enable; 9234 9235 if (enable) { 9236 /* Allocate histogram for each channel */ 9237 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9238 bdev_histogram_enable_channel_cb); 9239 } else { 9240 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9241 bdev_histogram_disable_channel_cb); 9242 } 9243 } 9244 9245 struct spdk_bdev_histogram_data_ctx { 9246 spdk_bdev_histogram_data_cb cb_fn; 9247 void *cb_arg; 9248 struct spdk_bdev *bdev; 9249 /** merged histogram data from all channels */ 9250 struct spdk_histogram_data *histogram; 9251 }; 9252 9253 static void 9254 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9255 { 9256 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9257 9258 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9259 free(ctx); 9260 } 9261 9262 static void 9263 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9264 struct spdk_io_channel *_ch, void *_ctx) 9265 { 9266 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9267 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9268 int status = 0; 9269 9270 if (ch->histogram == NULL) { 9271 status = -EFAULT; 9272 } else { 9273 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9274 } 9275 9276 spdk_bdev_for_each_channel_continue(i, status); 9277 } 9278 9279 void 9280 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9281 spdk_bdev_histogram_data_cb cb_fn, 9282 void *cb_arg) 9283 { 9284 struct spdk_bdev_histogram_data_ctx *ctx; 9285 9286 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9287 if (ctx == NULL) { 9288 cb_fn(cb_arg, -ENOMEM, NULL); 9289 return; 9290 } 9291 9292 ctx->bdev = bdev; 9293 ctx->cb_fn = cb_fn; 9294 ctx->cb_arg = cb_arg; 9295 9296 ctx->histogram = histogram; 
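/* The caller-supplied histogram is the merge target for every channel's
 * per-channel histogram, so it should typically start out zeroed (e.g. freshly
 * allocated with spdk_histogram_data_alloc()) and must remain valid until
 * cb_fn is invoked.
 */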
9297 9298 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9299 bdev_histogram_get_channel_cb); 9300 } 9301 9302 void 9303 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9304 void *cb_arg) 9305 { 9306 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9307 int status = 0; 9308 9309 assert(cb_fn != NULL); 9310 9311 if (bdev_ch->histogram == NULL) { 9312 status = -EFAULT; 9313 } 9314 cb_fn(cb_arg, status, bdev_ch->histogram); 9315 } 9316 9317 size_t 9318 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9319 size_t max_events) 9320 { 9321 struct media_event_entry *entry; 9322 size_t num_events = 0; 9323 9324 for (; num_events < max_events; ++num_events) { 9325 entry = TAILQ_FIRST(&desc->pending_media_events); 9326 if (entry == NULL) { 9327 break; 9328 } 9329 9330 events[num_events] = entry->event; 9331 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9332 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9333 } 9334 9335 return num_events; 9336 } 9337 9338 int 9339 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9340 size_t num_events) 9341 { 9342 struct spdk_bdev_desc *desc; 9343 struct media_event_entry *entry; 9344 size_t event_id; 9345 int rc = 0; 9346 9347 assert(bdev->media_events); 9348 9349 spdk_spin_lock(&bdev->internal.spinlock); 9350 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9351 if (desc->write) { 9352 break; 9353 } 9354 } 9355 9356 if (desc == NULL || desc->media_events_buffer == NULL) { 9357 rc = -ENODEV; 9358 goto out; 9359 } 9360 9361 for (event_id = 0; event_id < num_events; ++event_id) { 9362 entry = TAILQ_FIRST(&desc->free_media_events); 9363 if (entry == NULL) { 9364 break; 9365 } 9366 9367 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9368 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9369 entry->event = events[event_id]; 9370 } 9371 9372 rc = event_id; 9373 out: 9374 spdk_spin_unlock(&bdev->internal.spinlock); 9375 return rc; 9376 } 9377 9378 static void 9379 _media_management_notify(void *arg) 9380 { 9381 struct spdk_bdev_desc *desc = arg; 9382 9383 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9384 } 9385 9386 void 9387 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9388 { 9389 struct spdk_bdev_desc *desc; 9390 9391 spdk_spin_lock(&bdev->internal.spinlock); 9392 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9393 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9394 event_notify(desc, _media_management_notify); 9395 } 9396 } 9397 spdk_spin_unlock(&bdev->internal.spinlock); 9398 } 9399 9400 struct locked_lba_range_ctx { 9401 struct lba_range range; 9402 struct lba_range *current_range; 9403 struct lba_range *owner_range; 9404 struct spdk_poller *poller; 9405 lock_range_cb cb_fn; 9406 void *cb_arg; 9407 }; 9408 9409 static void 9410 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9411 { 9412 struct locked_lba_range_ctx *ctx = _ctx; 9413 9414 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9415 free(ctx); 9416 } 9417 9418 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9419 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9420 9421 static void 9422 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9423 { 9424 struct locked_lba_range_ctx *ctx = _ctx; 9425 9426 if (status == -ENOMEM) { 9427 /* One of the channels could not allocate a 
struct locked_lba_range_ctx {
	struct lba_range range;
	struct lba_range *current_range;
	struct lba_range *owner_range;
	struct spdk_poller *poller;
	lock_range_cb cb_fn;
	void *cb_arg;
};

static void
bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM);
	free(ctx);
}

static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i,
		struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx);

static void
bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	if (status == -ENOMEM) {
		/* One of the channels could not allocate a range object.
		 * So we have to go back and clean up any ranges that were
		 * allocated successfully before we return error status to
		 * the caller.  We can reuse the unlock function to do that
		 * clean up.
		 */
		spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
					   bdev_lock_error_cleanup_cb);
		return;
	}

	/* All channels have locked this range and no I/O overlapping the range
	 * is outstanding!  Set the owner_ch for the range object for the
	 * locking channel, so that this channel will know that it is allowed
	 * to write to this range.
	 */
	if (ctx->owner_range != NULL) {
		ctx->owner_range->owner_ch = ctx->range.owner_ch;
	}

	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);

	/* Don't free the ctx here.  Its range is still in the bdev's global list of
	 * locked ranges, and will be removed and freed when this range
	 * is later unlocked.
	 */
}
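/* Helper used while propagating a range lock to a channel: once the per-channel
 * range object has been inserted, any already-submitted I/O that overlaps the range
 * must drain before the for_each_channel iteration may continue.  The poller below
 * re-checks the channel's io_submitted list every 100 microseconds (the period
 * passed to SPDK_POLLER_REGISTER) until no overlapping I/O remains.
 */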
static int
bdev_lock_lba_range_check_io(void *_i)
{
	struct spdk_bdev_channel_iter *i = _i;
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = i->ctx;
	struct lba_range *range = ctx->current_range;
	struct spdk_bdev_io *bdev_io;

	spdk_poller_unregister(&ctx->poller);

	/* The range is now in locked_ranges, so no new I/O can be submitted to this
	 * range.  But we need to wait until any outstanding I/O overlapping with this
	 * range has completed.
	 */
	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
		if (bdev_io_range_is_locked(bdev_io, range)) {
			ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
			return SPDK_POLLER_BUSY;
		}
	}

	spdk_bdev_for_each_channel_continue(i, 0);
	return SPDK_POLLER_BUSY;
}

static void
bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *_ch, void *_ctx)
{
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = _ctx;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->length == ctx->range.length &&
		    range->offset == ctx->range.offset &&
		    range->locked_ctx == ctx->range.locked_ctx) {
			/* This range already exists on this channel, so don't add
			 * it again.  This can happen when a new channel is created
			 * while the for_each_channel operation is in progress.
			 * Do not check for outstanding I/O in that case, since the
			 * range was locked before any I/O could be submitted to the
			 * new channel.
			 */
			spdk_bdev_for_each_channel_continue(i, 0);
			return;
		}
	}

	range = calloc(1, sizeof(*range));
	if (range == NULL) {
		spdk_bdev_for_each_channel_continue(i, -ENOMEM);
		return;
	}

	range->length = ctx->range.length;
	range->offset = ctx->range.offset;
	range->locked_ctx = ctx->range.locked_ctx;
	ctx->current_range = range;
	if (ctx->range.owner_ch == ch) {
		/* This is the range object for the channel that will hold
		 * the lock.  Store it in the ctx object so that we can easily
		 * set its owner_ch after the lock is finally acquired.
		 */
		ctx->owner_range = range;
	}
	TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
	bdev_lock_lba_range_check_io(i);
}

static void
bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
{
	assert(spdk_get_thread() == ctx->range.owner_thread);
	assert(ctx->range.owner_ch == NULL ||
	       spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread);

	/* We will add a copy of this range to each channel now. */
	spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx,
				   bdev_lock_lba_range_cb);
}

static bool
bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
{
	struct lba_range *r;

	TAILQ_FOREACH(r, tailq, tailq) {
		if (bdev_lba_range_overlapped(range, r)) {
			return true;
		}
	}
	return false;
}

static int
_bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch,
		     uint64_t offset, uint64_t length,
		     lock_range_cb cb_fn, void *cb_arg)
{
	struct locked_lba_range_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}

	ctx->range.offset = offset;
	ctx->range.length = length;
	ctx->range.owner_thread = spdk_get_thread();
	ctx->range.owner_ch = ch;
	ctx->range.locked_ctx = cb_arg;
	ctx->range.bdev = bdev;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_spin_lock(&bdev->internal.spinlock);
	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
		/* There is an active lock overlapping with this range.
		 * Put it on the pending list until this range no
		 * longer overlaps with another.
		 */
		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
	} else {
		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
		bdev_lock_lba_range_ctx(bdev, ctx);
	}
	spdk_spin_unlock(&bdev->internal.spinlock);
	return 0;
}

static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		    uint64_t offset, uint64_t length,
		    lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);

	if (cb_arg == NULL) {
		SPDK_ERRLOG("cb_arg must not be NULL\n");
		return -EINVAL;
	}

	return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg);
}
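/* Reader's note: bdev_lock_lba_range()/bdev_unlock_lba_range() are internal helpers;
 * the quiesce code later in this file drives the same mechanism through
 * _bdev_lock_lba_range()/_bdev_unlock_lba_range().  A minimal sketch of how an
 * internal caller would use them, with a hypothetical callback and context:
 *
 *	static void
 *	my_range_locked(struct lba_range *range, void *ctx, int status)
 *	{
 *		// status == 0: no other I/O touches the range until it is unlocked
 *	}
 *
 *	rc = bdev_lock_lba_range(desc, io_ch, offset, length, my_range_locked, my_ctx);
 *	...
 *	rc = bdev_unlock_lba_range(desc, io_ch, offset, length, my_range_unlocked, my_ctx);
 *
 * The cb_arg doubles as the lock's identity (locked_ctx), so the lock and unlock
 * calls must pass the same pointer.
 */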
static void
bdev_lock_lba_range_ctx_msg(void *_ctx)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	bdev_lock_lba_range_ctx(ctx->range.bdev, ctx);
}

static void
bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;
	struct locked_lba_range_ctx *pending_ctx;
	struct lba_range *range, *tmp;

	spdk_spin_lock(&bdev->internal.spinlock);
	/* Check if there are any pending locked ranges that overlap with the range
	 * that was just unlocked.  For each one that does, check that it doesn't
	 * overlap with any other locked range before calling bdev_lock_lba_range_ctx,
	 * which will start the lock process.
	 */
	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
		if (bdev_lba_range_overlapped(range, &ctx->range) &&
		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
			spdk_thread_send_msg(pending_ctx->range.owner_thread,
					     bdev_lock_lba_range_ctx_msg, pending_ctx);
		}
	}
	spdk_spin_unlock(&bdev->internal.spinlock);

	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
	free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				  struct spdk_io_channel *_ch, void *_ctx)
{
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = _ctx;
	TAILQ_HEAD(, spdk_bdev_io) io_locked;
	struct spdk_bdev_io *bdev_io;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (ctx->range.offset == range->offset &&
		    ctx->range.length == range->length &&
		    ctx->range.locked_ctx == range->locked_ctx) {
			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
			free(range);
			break;
		}
	}

	/* Note: we should almost always be able to assert that the range specified
	 * was found.  But there are some very rare corner cases where a new channel
	 * gets created simultaneously with a range unlock, where this function
	 * would execute on that new channel and wouldn't have the range.
	 * We also use this to clean up range allocations when a later allocation
	 * fails in the locking path.
	 * So we can't actually assert() here.
	 */

	/* Swap the locked I/O into a temporary list, and then try to submit it again.
	 * We could hyper-optimize this to only resubmit locked I/O that overlaps
	 * with the range that was just unlocked, but this isn't a performance path so
	 * we go for simplicity here.
	 */
	TAILQ_INIT(&io_locked);
	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
	while (!TAILQ_EMPTY(&io_locked)) {
		bdev_io = TAILQ_FIRST(&io_locked);
		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
		bdev_io_submit(bdev_io);
	}

	spdk_bdev_for_each_channel_continue(i, 0);
}
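/* Unlock sequence: the range is first removed from the bdev's global locked_ranges
 * list under the spinlock (so new channels will not inherit it), then every existing
 * channel removes its per-channel copy and resubmits any I/O it had parked on
 * io_locked, and finally bdev_unlock_lba_range_cb() promotes pending locks that no
 * longer overlap anything.
 */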
static int
_bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length,
		       lock_range_cb cb_fn, void *cb_arg)
{
	struct locked_lba_range_ctx *ctx;
	struct lba_range *range;

	spdk_spin_lock(&bdev->internal.spinlock);
	/* To start the unlock process, we find the range in the bdev's locked_ranges
	 * and remove it.  This ensures new channels don't inherit the locked range.
	 * Then we will send a message to each channel to remove the range from its
	 * per-channel list.
	 */
	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    (range->owner_ch == NULL || range->locked_ctx == cb_arg)) {
			break;
		}
	}
	if (range == NULL) {
		assert(false);
		spdk_spin_unlock(&bdev->internal.spinlock);
		return -EINVAL;
	}
	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
	spdk_spin_unlock(&bdev->internal.spinlock);

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
				   bdev_unlock_lba_range_cb);
	return 0;
}

static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct lba_range *range;
	bool range_found = false;

	/* Let's make sure the specified channel actually has a lock on
	 * the specified range.  Note that the range must match exactly.
	 */
	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
			range_found = true;
			break;
		}
	}

	if (!range_found) {
		return -EINVAL;
	}

	return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg);
}

struct bdev_quiesce_ctx {
	spdk_bdev_quiesce_cb cb_fn;
	void *cb_arg;
};

static void
bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status)
{
	struct bdev_quiesce_ctx *quiesce_ctx = ctx;

	if (quiesce_ctx->cb_fn != NULL) {
		quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
	}

	free(quiesce_ctx);
}

static void
bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status)
{
	struct bdev_quiesce_ctx *quiesce_ctx = ctx;
	struct spdk_bdev_module *module = range->bdev->module;

	if (status != 0) {
		if (quiesce_ctx->cb_fn != NULL) {
			quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
		}
		free(quiesce_ctx);
		return;
	}

	spdk_spin_lock(&module->internal.spinlock);
	TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module);
	spdk_spin_unlock(&module->internal.spinlock);

	if (quiesce_ctx->cb_fn != NULL) {
		quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
		quiesce_ctx->cb_fn = NULL;
		quiesce_ctx->cb_arg = NULL;
	}
	/* quiesce_ctx will be freed on unquiesce */
}
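/* Quiesce is built on the LBA range lock above: quiescing locks the requested range
 * with no owner channel (owner_ch == NULL), so I/O to that range is held until the
 * matching unquiesce.  The locked range is also tracked on the module's
 * quiesced_ranges list so that only the bdev's own module can unquiesce it, and the
 * bdev_quiesce_ctx allocated here lives until that unquiesce completes.
 */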
static int
_spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
		   uint64_t offset, uint64_t length,
		   spdk_bdev_quiesce_cb cb_fn, void *cb_arg,
		   bool unquiesce)
{
	struct bdev_quiesce_ctx *quiesce_ctx;
	int rc;

	if (module != bdev->module) {
		SPDK_ERRLOG("Bdev does not belong to specified module.\n");
		return -EINVAL;
	}

	if (!bdev_io_valid_blocks(bdev, offset, length)) {
		return -EINVAL;
	}

	if (unquiesce) {
		struct lba_range *range;

		/* Make sure the specified range is actually quiesced in the specified module and
		 * then remove it from the list.  Note that the range must match exactly.
		 */
		spdk_spin_lock(&module->internal.spinlock);
		TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) {
			if (range->bdev == bdev && range->offset == offset && range->length == length) {
				TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module);
				break;
			}
		}
		spdk_spin_unlock(&module->internal.spinlock);

		if (range == NULL) {
			SPDK_ERRLOG("The range to unquiesce was not found.\n");
			return -EINVAL;
		}

		quiesce_ctx = range->locked_ctx;
		quiesce_ctx->cb_fn = cb_fn;
		quiesce_ctx->cb_arg = cb_arg;

		rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx);
	} else {
		quiesce_ctx = malloc(sizeof(*quiesce_ctx));
		if (quiesce_ctx == NULL) {
			return -ENOMEM;
		}

		quiesce_ctx->cb_fn = cb_fn;
		quiesce_ctx->cb_arg = cb_arg;

		rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx);
		if (rc != 0) {
			free(quiesce_ctx);
		}
	}

	return rc;
}

int
spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
		  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
{
	return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false);
}

int
spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
		    spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
{
	return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true);
}

int
spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
			uint64_t offset, uint64_t length,
			spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
{
	return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false);
}

int
spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
			  uint64_t offset, uint64_t length,
			  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
{
	return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true);
}
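/* Illustrative sketch: a bdev module pausing I/O around a metadata update.  The
 * callback and variable names are hypothetical; only the bdev's own module may
 * quiesce it.
 *
 *	static void
 *	quiesce_done(void *ctx, int status)
 *	{
 *		// status == 0: all channels hold the lock and in-flight I/O has drained;
 *		// perform the update, then call spdk_bdev_unquiesce() to resume I/O.
 *	}
 *
 *	rc = spdk_bdev_quiesce(bdev, &my_bdev_module, quiesce_done, my_ctx);
 */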
int
spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
			     int array_size)
{
	if (!bdev) {
		return -EINVAL;
	}

	if (bdev->fn_table->get_memory_domains) {
		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
	}

	return 0;
}

struct spdk_bdev_for_each_io_ctx {
	void *ctx;
	spdk_bdev_io_fn fn;
	spdk_bdev_for_each_io_cb cb;
};

static void
bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
			 struct spdk_io_channel *io_ch, void *_ctx)
{
	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
	struct spdk_bdev_io *bdev_io;
	int rc = 0;

	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
		rc = ctx->fn(ctx->ctx, bdev_io);
		if (rc != 0) {
			break;
		}
	}

	spdk_bdev_for_each_channel_continue(i, rc);
}

static void
bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;

	ctx->cb(ctx->ctx, status);

	free(ctx);
}

void
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
			   spdk_bdev_for_each_io_cb cb)
{
	struct spdk_bdev_for_each_io_ctx *ctx;

	assert(fn != NULL && cb != NULL);

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		SPDK_ERRLOG("Failed to allocate context.\n");
		cb(_ctx, -ENOMEM);
		return;
	}

	ctx->ctx = _ctx;
	ctx->fn = fn;
	ctx->cb = cb;

	spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
				   bdev_for_each_io_done);
}

void
spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
{
	spdk_for_each_channel_continue(iter->i, status);
}

static struct spdk_bdev *
io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
{
	void *io_device = spdk_io_channel_iter_get_io_device(i);

	return __bdev_from_io_dev(io_device);
}

static void
bdev_each_channel_msg(struct spdk_io_channel_iter *i)
{
	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);

	iter->i = i;
	iter->fn(iter, bdev, ch, iter->ctx);
}

static void
bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);

	iter->i = i;
	iter->cpl(bdev, iter->ctx, status);

	free(iter);
}

void
spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn,
			   void *ctx, spdk_bdev_for_each_channel_done cpl)
{
	struct spdk_bdev_channel_iter *iter;

	assert(bdev != NULL && fn != NULL && ctx != NULL);

	iter = calloc(1, sizeof(struct spdk_bdev_channel_iter));
	if (iter == NULL) {
		SPDK_ERRLOG("Unable to allocate iterator\n");
		assert(false);
		return;
	}

	iter->fn = fn;
	iter->cpl = cpl;
	iter->ctx = ctx;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg,
			      iter, bdev_each_channel_cpl);
}
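/* Illustrative sketch of spdk_bdev_for_each_channel(): the per-channel callback runs
 * on each channel's thread and must call spdk_bdev_for_each_channel_continue(), and
 * the completion callback runs once on the calling thread.  The names below are
 * hypothetical.
 *
 *	static void
 *	count_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
 *		      struct spdk_io_channel *ch, void *ctx)
 *	{
 *		(*(uint32_t *)ctx)++;
 *		spdk_bdev_for_each_channel_continue(i, 0);
 *	}
 *
 *	static void
 *	count_done(struct spdk_bdev *bdev, void *ctx, int status)
 *	{
 *		SPDK_NOTICELOG("bdev has %u channels\n", *(uint32_t *)ctx);
 *	}
 *
 *	spdk_bdev_for_each_channel(bdev, count_channel, &channel_count, count_done);
 */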
static void
bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	/* Check return status of write */
	parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
}

static void
bdev_copy_do_write(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	int rc;

	/* Write blocks */
	rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc,
					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
					    bdev_io->u.bdev.iovs[0].iov_base,
					    bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks,
					    bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io);

	if (rc == -ENOMEM) {
		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write);
	} else if (rc != 0) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static void
bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	/* Check return status of read */
	if (!success) {
		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
		return;
	}

	/* Do write */
	bdev_copy_do_write(parent_io);
}

static void
bdev_copy_do_read(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	int rc;

	/* Read blocks */
	rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc,
					   spdk_io_channel_from_ctx(bdev_io->internal.ch),
					   bdev_io->u.bdev.iovs[0].iov_base,
					   bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks,
					   bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io);

	if (rc == -ENOMEM) {
		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read);
	} else if (rc != 0) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static void
bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
{
	if (!success) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
		return;
	}

	bdev_copy_do_read(bdev_io);
}
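/* The helpers above emulate SPDK_BDEV_IO_TYPE_COPY for bdevs that do not support it
 * natively: a bounce buffer is obtained via spdk_bdev_io_get_buf(), the source range
 * is read into it, and the data is then written to the destination range, with
 * -ENOMEM retried through the bdev I/O wait queue.  An illustrative call into the
 * public API (hypothetical names):
 *
 *	rc = spdk_bdev_copy_blocks(desc, io_ch, dst_offset_blocks, src_offset_blocks,
 *				   num_blocks, copy_complete_cb, cb_arg);
 */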
int
spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		      uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
		      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (num_blocks == 0) {
		SPDK_ERRLOG("Can't copy 0 blocks\n");
		return -EINVAL;
	}

	if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
	    !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
		SPDK_DEBUGLOG(bdev,
			      "Invalid offset or number of blocks: dst %" PRIu64 ", src %" PRIu64 ", count %" PRIu64 "\n",
			      dst_offset_blocks, src_offset_blocks, num_blocks);
		return -EINVAL;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;

	bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
	bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.memory_domain = NULL;
	bdev_io->u.bdev.memory_domain_ctx = NULL;
	bdev_io->u.bdev.iovs = NULL;
	bdev_io->u.bdev.iovcnt = 0;
	bdev_io->u.bdev.md_buf = NULL;
	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	if (dst_offset_blocks == src_offset_blocks) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
		bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);

		return 0;
	}

	/* If the copy size is large and should be split, use the generic split logic
	 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not.
	 *
	 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or
	 * emulate it using regular read and write requests otherwise.
	 */
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) ||
	    bdev_io->internal.split) {
		bdev_io_submit(bdev_io);
		return 0;
	}

	spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev));

	return 0;
}

SPDK_LOG_REGISTER_COMPONENT(bdev)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_IO_START", TRACE_BDEV_IO_START,
			OWNER_BDEV, OBJECT_BDEV_IO, 1,
			{
				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 }
			}
		},
		{
			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
			OWNER_BDEV, OBJECT_BDEV_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
			OWNER_BDEV, OBJECT_NONE, 1,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
		{
			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
			OWNER_BDEV, OBJECT_NONE, 0,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
	};

	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
}
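/* The BDEV_IO_START/BDEV_IO_DONE tracepoints registered above track OBJECT_BDEV_IO
 * objects, and the relations registered here let those objects be correlated with
 * the NVMe bdev tracepoints.  When tracing is enabled for the "bdev" tracepoint
 * group, the resulting trace history can be inspected with SPDK's trace tooling
 * (for example, the spdk_trace application).
 */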