1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/accel.h" 12 #include "spdk/config.h" 13 #include "spdk/env.h" 14 #include "spdk/thread.h" 15 #include "spdk/likely.h" 16 #include "spdk/queue.h" 17 #include "spdk/nvme_spec.h" 18 #include "spdk/scsi_spec.h" 19 #include "spdk/notify.h" 20 #include "spdk/util.h" 21 #include "spdk/trace.h" 22 #include "spdk/dma.h" 23 24 #include "spdk/bdev_module.h" 25 #include "spdk/log.h" 26 #include "spdk/string.h" 27 28 #include "bdev_internal.h" 29 #include "spdk_internal/trace_defs.h" 30 #include "spdk_internal/assert.h" 31 32 #ifdef SPDK_CONFIG_VTUNE 33 #include "ittnotify.h" 34 #include "ittnotify_types.h" 35 int __itt_init_ittlib(const char *, __itt_group_id); 36 #endif 37 38 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 39 #define SPDK_BDEV_IO_CACHE_SIZE 256 40 #define SPDK_BDEV_AUTO_EXAMINE true 41 #define BUF_SMALL_POOL_SIZE 8191 42 #define BUF_LARGE_POOL_SIZE 1023 43 #define BUF_SMALL_CACHE_SIZE 128 44 #define BUF_LARGE_CACHE_SIZE 16 45 #define NOMEM_THRESHOLD_COUNT 8 46 47 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 48 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 49 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 50 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 51 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 52 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 53 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 54 55 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 56 * when splitting into children requests at a time. 57 */ 58 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 59 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 60 61 /* The maximum number of children requests for a COPY command 62 * when splitting into children requests at a time. 
63 */ 64 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 65 66 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 67 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 68 #ifdef DEBUG 69 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 70 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 71 #else 72 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 73 #endif 74 75 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 76 const char *detail, struct spdk_bdev *bdev); 77 78 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 79 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 80 }; 81 82 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 83 84 RB_HEAD(bdev_name_tree, spdk_bdev_name); 85 86 static int 87 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 88 { 89 return strcmp(name1->name, name2->name); 90 } 91 92 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 93 94 struct spdk_bdev_mgr { 95 struct spdk_mempool *bdev_io_pool; 96 97 void *zero_buffer; 98 99 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 100 101 struct spdk_bdev_list bdevs; 102 struct bdev_name_tree bdev_names; 103 104 bool init_complete; 105 bool module_init_complete; 106 107 struct spdk_spinlock spinlock; 108 109 TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens; 110 111 #ifdef SPDK_CONFIG_VTUNE 112 __itt_domain *domain; 113 #endif 114 }; 115 116 static struct spdk_bdev_mgr g_bdev_mgr = { 117 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 118 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 119 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 120 .init_complete = false, 121 .module_init_complete = false, 122 .async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens), 123 }; 124 125 static void 126 __attribute__((constructor)) 127 _bdev_init(void) 128 { 129 spdk_spin_init(&g_bdev_mgr.spinlock); 130 } 131 132 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status); 133 134 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 135 136 struct lba_range { 137 struct spdk_bdev *bdev; 138 uint64_t offset; 139 uint64_t length; 140 void *locked_ctx; 141 struct spdk_thread *owner_thread; 142 struct spdk_bdev_channel *owner_ch; 143 TAILQ_ENTRY(lba_range) tailq; 144 TAILQ_ENTRY(lba_range) tailq_module; 145 }; 146 147 static struct spdk_bdev_opts g_bdev_opts = { 148 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 149 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 150 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 151 }; 152 153 static spdk_bdev_init_cb g_init_cb_fn = NULL; 154 static void *g_init_cb_arg = NULL; 155 156 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 157 static void *g_fini_cb_arg = NULL; 158 static struct spdk_thread *g_fini_thread = NULL; 159 160 struct spdk_bdev_qos_limit { 161 /** IOs or bytes allowed per second (i.e., 1s). */ 162 uint64_t limit; 163 164 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 165 * For remaining bytes, allowed to run negative if an I/O is submitted when 166 * some bytes are remaining, but the I/O is bigger than that amount. The 167 * excess will be deducted from the next timeslice. 168 */ 169 int64_t remaining_this_timeslice; 170 171 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 172 uint32_t min_per_timeslice; 173 174 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). 
*/ 175 uint32_t max_per_timeslice; 176 177 /** Function to check whether to queue the IO. */ 178 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 179 180 /** Function to update for the submitted IO. */ 181 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 182 }; 183 184 struct spdk_bdev_qos { 185 /** Types of structure of rate limits. */ 186 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 187 188 /** The channel that all I/O are funneled through. */ 189 struct spdk_bdev_channel *ch; 190 191 /** The thread on which the poller is running. */ 192 struct spdk_thread *thread; 193 194 /** Queue of I/O waiting to be issued. */ 195 bdev_io_tailq_t queued; 196 197 /** Size of a timeslice in tsc ticks. */ 198 uint64_t timeslice_size; 199 200 /** Timestamp of start of last timeslice. */ 201 uint64_t last_timeslice; 202 203 /** Poller that processes queued I/O commands each time slice. */ 204 struct spdk_poller *poller; 205 }; 206 207 struct spdk_bdev_mgmt_channel { 208 /* 209 * Each thread keeps a cache of bdev_io - this allows 210 * bdev threads which are *not* DPDK threads to still 211 * benefit from a per-thread bdev_io cache. Without 212 * this, non-DPDK threads fetching from the mempool 213 * incur a cmpxchg on get and put. 214 */ 215 bdev_io_stailq_t per_thread_cache; 216 uint32_t per_thread_cache_count; 217 uint32_t bdev_io_cache_size; 218 219 struct spdk_iobuf_channel iobuf; 220 221 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 222 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 223 }; 224 225 /* 226 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 227 * will queue here their IO that awaits retry. It makes it possible to retry sending 228 * IO to one bdev after IO from other bdev completes. 229 */ 230 struct spdk_bdev_shared_resource { 231 /* The bdev management channel */ 232 struct spdk_bdev_mgmt_channel *mgmt_ch; 233 234 /* 235 * Count of I/O submitted to bdev module and waiting for completion. 236 * Incremented before submit_request() is called on an spdk_bdev_io. 237 */ 238 uint64_t io_outstanding; 239 240 /* 241 * Queue of IO awaiting retry because of a previous NOMEM status returned 242 * on this channel. 243 */ 244 bdev_io_tailq_t nomem_io; 245 246 /* 247 * Threshold which io_outstanding must drop to before retrying nomem_io. 248 */ 249 uint64_t nomem_threshold; 250 251 /* I/O channel allocated by a bdev module */ 252 struct spdk_io_channel *shared_ch; 253 254 /* Refcount of bdev channels using this resource */ 255 uint32_t ref; 256 257 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 258 }; 259 260 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 261 #define BDEV_CH_QOS_ENABLED (1 << 1) 262 263 struct spdk_bdev_channel { 264 struct spdk_bdev *bdev; 265 266 /* The channel for the underlying device */ 267 struct spdk_io_channel *channel; 268 269 /* Accel channel */ 270 struct spdk_io_channel *accel_channel; 271 272 /* Per io_device per thread data */ 273 struct spdk_bdev_shared_resource *shared_resource; 274 275 struct spdk_bdev_io_stat *stat; 276 277 /* 278 * Count of I/O submitted to the underlying dev module through this channel 279 * and waiting for completion. 280 */ 281 uint64_t io_outstanding; 282 283 /* 284 * List of all submitted I/Os including I/O that are generated via splitting. 285 */ 286 bdev_io_tailq_t io_submitted; 287 288 /* 289 * List of spdk_bdev_io that are currently queued because they write to a locked 290 * LBA range. 
291 */ 292 bdev_io_tailq_t io_locked; 293 294 /* List of I/Os with accel sequence being currently executed */ 295 bdev_io_tailq_t io_accel_exec; 296 297 /* List of I/Os doing memory domain pull/push */ 298 bdev_io_tailq_t io_memory_domain; 299 300 uint32_t flags; 301 302 struct spdk_histogram_data *histogram; 303 304 #ifdef SPDK_CONFIG_VTUNE 305 uint64_t start_tsc; 306 uint64_t interval_tsc; 307 __itt_string_handle *handle; 308 struct spdk_bdev_io_stat *prev_stat; 309 #endif 310 311 bdev_io_tailq_t queued_resets; 312 313 lba_range_tailq_t locked_ranges; 314 }; 315 316 struct media_event_entry { 317 struct spdk_bdev_media_event event; 318 TAILQ_ENTRY(media_event_entry) tailq; 319 }; 320 321 #define MEDIA_EVENT_POOL_SIZE 64 322 323 struct spdk_bdev_desc { 324 struct spdk_bdev *bdev; 325 struct spdk_thread *thread; 326 struct { 327 spdk_bdev_event_cb_t event_fn; 328 void *ctx; 329 } callback; 330 bool closed; 331 bool write; 332 bool memory_domains_supported; 333 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 334 struct spdk_spinlock spinlock; 335 uint32_t refs; 336 TAILQ_HEAD(, media_event_entry) pending_media_events; 337 TAILQ_HEAD(, media_event_entry) free_media_events; 338 struct media_event_entry *media_events_buffer; 339 TAILQ_ENTRY(spdk_bdev_desc) link; 340 341 uint64_t timeout_in_sec; 342 spdk_bdev_io_timeout_cb cb_fn; 343 void *cb_arg; 344 struct spdk_poller *io_timeout_poller; 345 struct spdk_bdev_module_claim *claim; 346 }; 347 348 struct spdk_bdev_iostat_ctx { 349 struct spdk_bdev_io_stat *stat; 350 spdk_bdev_get_device_stat_cb cb; 351 void *cb_arg; 352 }; 353 354 struct set_qos_limit_ctx { 355 void (*cb_fn)(void *cb_arg, int status); 356 void *cb_arg; 357 struct spdk_bdev *bdev; 358 }; 359 360 struct spdk_bdev_channel_iter { 361 spdk_bdev_for_each_channel_msg fn; 362 spdk_bdev_for_each_channel_done cpl; 363 struct spdk_io_channel_iter *i; 364 void *ctx; 365 }; 366 367 struct spdk_bdev_io_error_stat { 368 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 369 }; 370 371 enum bdev_io_retry_state { 372 BDEV_IO_RETRY_STATE_INVALID, 373 BDEV_IO_RETRY_STATE_PULL, 374 BDEV_IO_RETRY_STATE_PULL_MD, 375 BDEV_IO_RETRY_STATE_SUBMIT, 376 BDEV_IO_RETRY_STATE_PUSH, 377 BDEV_IO_RETRY_STATE_PUSH_MD, 378 }; 379 380 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 381 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 382 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 383 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 384 385 static inline void bdev_io_complete(void *ctx); 386 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 387 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io); 388 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io); 389 390 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 391 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io); 392 393 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 394 struct spdk_io_channel *ch, void *_ctx); 395 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 396 397 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 398 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 399 uint64_t num_blocks, 400 struct spdk_memory_domain *domain, void *domain_ctx, 401 struct 
spdk_accel_sequence *seq, 402 spdk_bdev_io_completion_cb cb, void *cb_arg); 403 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 404 struct iovec *iov, int iovcnt, void *md_buf, 405 uint64_t offset_blocks, uint64_t num_blocks, 406 struct spdk_memory_domain *domain, void *domain_ctx, 407 struct spdk_accel_sequence *seq, 408 spdk_bdev_io_completion_cb cb, void *cb_arg); 409 410 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 411 uint64_t offset, uint64_t length, 412 lock_range_cb cb_fn, void *cb_arg); 413 414 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 415 uint64_t offset, uint64_t length, 416 lock_range_cb cb_fn, void *cb_arg); 417 418 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 419 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort); 420 421 static bool claim_type_is_v2(enum spdk_bdev_claim_type type); 422 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc); 423 static void claim_reset(struct spdk_bdev *bdev); 424 425 static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch); 426 427 #define bdev_get_ext_io_opt(opts, field, defval) \ 428 (((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \ 429 sizeof((opts)->field) <= (opts)->size) ? (opts)->field : (defval)) 430 431 void 432 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 433 { 434 if (!opts) { 435 SPDK_ERRLOG("opts should not be NULL\n"); 436 return; 437 } 438 439 if (!opts_size) { 440 SPDK_ERRLOG("opts_size should not be zero value\n"); 441 return; 442 } 443 444 opts->opts_size = opts_size; 445 446 #define SET_FIELD(field) \ 447 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 448 opts->field = g_bdev_opts.field; \ 449 } \ 450 451 SET_FIELD(bdev_io_pool_size); 452 SET_FIELD(bdev_io_cache_size); 453 SET_FIELD(bdev_auto_examine); 454 455 /* Do not remove this statement, you should always update this statement when you adding a new field, 456 * and do not forget to add the SET_FIELD statement for your added field. */ 457 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 458 459 #undef SET_FIELD 460 } 461 462 int 463 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 464 { 465 uint32_t min_pool_size; 466 467 if (!opts) { 468 SPDK_ERRLOG("opts cannot be NULL\n"); 469 return -1; 470 } 471 472 if (!opts->opts_size) { 473 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 474 return -1; 475 } 476 477 /* 478 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 479 * initialization. A second mgmt_ch will be created on the same thread when the application starts 480 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
481 */ 482 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 483 if (opts->bdev_io_pool_size < min_pool_size) { 484 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 485 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 486 spdk_thread_get_count()); 487 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 488 return -1; 489 } 490 491 #define SET_FIELD(field) \ 492 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 493 g_bdev_opts.field = opts->field; \ 494 } \ 495 496 SET_FIELD(bdev_io_pool_size); 497 SET_FIELD(bdev_io_cache_size); 498 SET_FIELD(bdev_auto_examine); 499 500 g_bdev_opts.opts_size = opts->opts_size; 501 502 #undef SET_FIELD 503 504 return 0; 505 } 506 507 static struct spdk_bdev * 508 bdev_get_by_name(const char *bdev_name) 509 { 510 struct spdk_bdev_name find; 511 struct spdk_bdev_name *res; 512 513 find.name = (char *)bdev_name; 514 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 515 if (res != NULL) { 516 return res->bdev; 517 } 518 519 return NULL; 520 } 521 522 struct spdk_bdev * 523 spdk_bdev_get_by_name(const char *bdev_name) 524 { 525 struct spdk_bdev *bdev; 526 527 spdk_spin_lock(&g_bdev_mgr.spinlock); 528 bdev = bdev_get_by_name(bdev_name); 529 spdk_spin_unlock(&g_bdev_mgr.spinlock); 530 531 return bdev; 532 } 533 534 struct bdev_io_status_string { 535 enum spdk_bdev_io_status status; 536 const char *str; 537 }; 538 539 static const struct bdev_io_status_string bdev_io_status_strings[] = { 540 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 541 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 542 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 543 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 544 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 545 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 546 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 547 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 548 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 549 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 550 }; 551 552 static const char * 553 bdev_io_status_get_string(enum spdk_bdev_io_status status) 554 { 555 uint32_t i; 556 557 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 558 if (bdev_io_status_strings[i].status == status) { 559 return bdev_io_status_strings[i].str; 560 } 561 } 562 563 return "reserved"; 564 } 565 566 struct spdk_bdev_wait_for_examine_ctx { 567 struct spdk_poller *poller; 568 spdk_bdev_wait_for_examine_cb cb_fn; 569 void *cb_arg; 570 }; 571 572 static bool bdev_module_all_actions_completed(void); 573 574 static int 575 bdev_wait_for_examine_cb(void *arg) 576 { 577 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 578 579 if (!bdev_module_all_actions_completed()) { 580 return SPDK_POLLER_IDLE; 581 } 582 583 spdk_poller_unregister(&ctx->poller); 584 ctx->cb_fn(ctx->cb_arg); 585 free(ctx); 586 587 return SPDK_POLLER_BUSY; 588 } 589 590 int 591 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 592 { 593 struct spdk_bdev_wait_for_examine_ctx *ctx; 594 595 ctx = calloc(1, sizeof(*ctx)); 596 if (ctx == NULL) { 597 return -ENOMEM; 598 } 599 ctx->cb_fn = cb_fn; 600 ctx->cb_arg = cb_arg; 601 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 602 603 return 0; 604 } 605 606 struct spdk_bdev_examine_item { 607 char *name; 608 TAILQ_ENTRY(spdk_bdev_examine_item) link; 609 }; 610 611 TAILQ_HEAD(spdk_bdev_examine_allowlist, 
spdk_bdev_examine_item); 612 613 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 614 g_bdev_examine_allowlist); 615 616 static inline bool 617 bdev_examine_allowlist_check(const char *name) 618 { 619 struct spdk_bdev_examine_item *item; 620 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 621 if (strcmp(name, item->name) == 0) { 622 return true; 623 } 624 } 625 return false; 626 } 627 628 static inline void 629 bdev_examine_allowlist_free(void) 630 { 631 struct spdk_bdev_examine_item *item; 632 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 633 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 634 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 635 free(item->name); 636 free(item); 637 } 638 } 639 640 static inline bool 641 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 642 { 643 struct spdk_bdev_alias *tmp; 644 if (bdev_examine_allowlist_check(bdev->name)) { 645 return true; 646 } 647 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 648 if (bdev_examine_allowlist_check(tmp->alias.name)) { 649 return true; 650 } 651 } 652 return false; 653 } 654 655 static inline bool 656 bdev_ok_to_examine(struct spdk_bdev *bdev) 657 { 658 if (g_bdev_opts.bdev_auto_examine) { 659 return true; 660 } else { 661 return bdev_in_examine_allowlist(bdev); 662 } 663 } 664 665 static void 666 bdev_examine(struct spdk_bdev *bdev) 667 { 668 struct spdk_bdev_module *module; 669 struct spdk_bdev_module_claim *claim, *tmpclaim; 670 uint32_t action; 671 672 if (!bdev_ok_to_examine(bdev)) { 673 return; 674 } 675 676 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 677 if (module->examine_config) { 678 spdk_spin_lock(&module->internal.spinlock); 679 action = module->internal.action_in_progress; 680 module->internal.action_in_progress++; 681 spdk_spin_unlock(&module->internal.spinlock); 682 module->examine_config(bdev); 683 if (action != module->internal.action_in_progress) { 684 SPDK_ERRLOG("examine_config for module %s did not call " 685 "spdk_bdev_module_examine_done()\n", module->name); 686 } 687 } 688 } 689 690 spdk_spin_lock(&bdev->internal.spinlock); 691 692 switch (bdev->internal.claim_type) { 693 case SPDK_BDEV_CLAIM_NONE: 694 /* Examine by all bdev modules */ 695 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 696 if (module->examine_disk) { 697 spdk_spin_lock(&module->internal.spinlock); 698 module->internal.action_in_progress++; 699 spdk_spin_unlock(&module->internal.spinlock); 700 spdk_spin_unlock(&bdev->internal.spinlock); 701 module->examine_disk(bdev); 702 spdk_spin_lock(&bdev->internal.spinlock); 703 } 704 } 705 break; 706 case SPDK_BDEV_CLAIM_EXCL_WRITE: 707 /* Examine by the one bdev module with a v1 claim */ 708 module = bdev->internal.claim.v1.module; 709 if (module->examine_disk) { 710 spdk_spin_lock(&module->internal.spinlock); 711 module->internal.action_in_progress++; 712 spdk_spin_unlock(&module->internal.spinlock); 713 spdk_spin_unlock(&bdev->internal.spinlock); 714 module->examine_disk(bdev); 715 return; 716 } 717 break; 718 default: 719 /* Examine by all bdev modules with a v2 claim */ 720 assert(claim_type_is_v2(bdev->internal.claim_type)); 721 /* 722 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 723 * list, perhaps accessing freed memory. Without protection, this could happen 724 * while the lock is dropped during the examine callback. 
725 */ 726 bdev->internal.examine_in_progress++; 727 728 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 729 module = claim->module; 730 731 if (module == NULL) { 732 /* This is a vestigial claim, held by examine_count */ 733 continue; 734 } 735 736 if (module->examine_disk == NULL) { 737 continue; 738 } 739 740 spdk_spin_lock(&module->internal.spinlock); 741 module->internal.action_in_progress++; 742 spdk_spin_unlock(&module->internal.spinlock); 743 744 /* Call examine_disk without holding internal.spinlock. */ 745 spdk_spin_unlock(&bdev->internal.spinlock); 746 module->examine_disk(bdev); 747 spdk_spin_lock(&bdev->internal.spinlock); 748 } 749 750 assert(bdev->internal.examine_in_progress > 0); 751 bdev->internal.examine_in_progress--; 752 if (bdev->internal.examine_in_progress == 0) { 753 /* Remove any claims that were released during examine_disk */ 754 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 755 if (claim->desc != NULL) { 756 continue; 757 } 758 759 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 760 free(claim); 761 } 762 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 763 claim_reset(bdev); 764 } 765 } 766 } 767 768 spdk_spin_unlock(&bdev->internal.spinlock); 769 } 770 771 int 772 spdk_bdev_examine(const char *name) 773 { 774 struct spdk_bdev *bdev; 775 struct spdk_bdev_examine_item *item; 776 struct spdk_thread *thread = spdk_get_thread(); 777 778 if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) { 779 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 780 thread ? spdk_thread_get_name(thread) : "null"); 781 return -EINVAL; 782 } 783 784 if (g_bdev_opts.bdev_auto_examine) { 785 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 786 return -EINVAL; 787 } 788 789 if (bdev_examine_allowlist_check(name)) { 790 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 791 return -EEXIST; 792 } 793 794 item = calloc(1, sizeof(*item)); 795 if (!item) { 796 return -ENOMEM; 797 } 798 item->name = strdup(name); 799 if (!item->name) { 800 free(item); 801 return -ENOMEM; 802 } 803 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 804 805 bdev = spdk_bdev_get_by_name(name); 806 if (bdev) { 807 bdev_examine(bdev); 808 } 809 return 0; 810 } 811 812 static inline void 813 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 814 { 815 struct spdk_bdev_examine_item *item; 816 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 817 spdk_json_write_object_begin(w); 818 spdk_json_write_named_string(w, "method", "bdev_examine"); 819 spdk_json_write_named_object_begin(w, "params"); 820 spdk_json_write_named_string(w, "name", item->name); 821 spdk_json_write_object_end(w); 822 spdk_json_write_object_end(w); 823 } 824 } 825 826 struct spdk_bdev * 827 spdk_bdev_first(void) 828 { 829 struct spdk_bdev *bdev; 830 831 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 832 if (bdev) { 833 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 834 } 835 836 return bdev; 837 } 838 839 struct spdk_bdev * 840 spdk_bdev_next(struct spdk_bdev *prev) 841 { 842 struct spdk_bdev *bdev; 843 844 bdev = TAILQ_NEXT(prev, internal.link); 845 if (bdev) { 846 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 847 } 848 849 return bdev; 850 } 851 852 static struct spdk_bdev * 853 _bdev_next_leaf(struct spdk_bdev *bdev) 854 { 855 while (bdev != NULL) { 856 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 857 return bdev; 858 } else { 859 bdev = 
TAILQ_NEXT(bdev, internal.link); 860 } 861 } 862 863 return bdev; 864 } 865 866 struct spdk_bdev * 867 spdk_bdev_first_leaf(void) 868 { 869 struct spdk_bdev *bdev; 870 871 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 872 873 if (bdev) { 874 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 875 } 876 877 return bdev; 878 } 879 880 struct spdk_bdev * 881 spdk_bdev_next_leaf(struct spdk_bdev *prev) 882 { 883 struct spdk_bdev *bdev; 884 885 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 886 887 if (bdev) { 888 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 889 } 890 891 return bdev; 892 } 893 894 static inline bool 895 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 896 { 897 return bdev_io->internal.memory_domain; 898 } 899 900 static inline bool 901 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 902 { 903 return bdev_io->internal.has_accel_sequence; 904 } 905 906 static inline void 907 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource, 908 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 909 { 910 /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io. 911 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth 912 * channels we will instead wait for half to complete. 913 */ 914 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 915 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 916 917 assert(state != BDEV_IO_RETRY_STATE_INVALID); 918 bdev_io->internal.retry_state = state; 919 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 920 } 921 922 static inline void 923 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource, 924 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 925 { 926 /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while 927 * the queue isn't empty, so we don't need to update the nomem_threshold here */ 928 assert(!TAILQ_EMPTY(&shared_resource->nomem_io)); 929 930 assert(state != BDEV_IO_RETRY_STATE_INVALID); 931 bdev_io->internal.retry_state = state; 932 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 933 } 934 935 void 936 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 937 { 938 struct iovec *iovs; 939 940 if (bdev_io->u.bdev.iovs == NULL) { 941 bdev_io->u.bdev.iovs = &bdev_io->iov; 942 bdev_io->u.bdev.iovcnt = 1; 943 } 944 945 iovs = bdev_io->u.bdev.iovs; 946 947 assert(iovs != NULL); 948 assert(bdev_io->u.bdev.iovcnt >= 1); 949 950 iovs[0].iov_base = buf; 951 iovs[0].iov_len = len; 952 } 953 954 void 955 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 956 { 957 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 958 bdev_io->u.bdev.md_buf = md_buf; 959 } 960 961 static bool 962 _is_buf_allocated(const struct iovec *iovs) 963 { 964 if (iovs == NULL) { 965 return false; 966 } 967 968 return iovs[0].iov_base != NULL; 969 } 970 971 static bool 972 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 973 { 974 int i; 975 uintptr_t iov_base; 976 977 if (spdk_likely(alignment == 1)) { 978 return true; 979 } 980 981 for (i = 0; i < iovcnt; i++) { 982 iov_base = (uintptr_t)iovs[i].iov_base; 983 if ((iov_base & (alignment - 1)) != 0) { 984 return false; 985 } 986 } 987 988 return true; 989 } 990 991 static inline bool 992 
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 993 { 994 if (!bdev_io->internal.accel_sequence) { 995 return false; 996 } 997 998 /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if 999 * bdev module didn't support accel sequences */ 1000 return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split; 1001 } 1002 1003 static inline void 1004 bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch, 1005 struct spdk_bdev_shared_resource *shared_resource) 1006 { 1007 bdev_ch->io_outstanding++; 1008 shared_resource->io_outstanding++; 1009 } 1010 1011 static inline void 1012 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1013 struct spdk_bdev_shared_resource *shared_resource) 1014 { 1015 assert(bdev_ch->io_outstanding > 0); 1016 assert(shared_resource->io_outstanding > 0); 1017 bdev_ch->io_outstanding--; 1018 shared_resource->io_outstanding--; 1019 } 1020 1021 static void 1022 bdev_io_submit_sequence_cb(void *ctx, int status) 1023 { 1024 struct spdk_bdev_io *bdev_io = ctx; 1025 1026 bdev_io->u.bdev.accel_sequence = NULL; 1027 bdev_io->internal.accel_sequence = NULL; 1028 1029 if (spdk_unlikely(status != 0)) { 1030 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 1031 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1032 bdev_io_complete_unsubmitted(bdev_io); 1033 return; 1034 } 1035 1036 bdev_io_submit(bdev_io); 1037 } 1038 1039 static void 1040 bdev_io_exec_sequence_cb(void *ctx, int status) 1041 { 1042 struct spdk_bdev_io *bdev_io = ctx; 1043 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1044 1045 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1046 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1047 1048 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1049 bdev_ch_retry_io(ch); 1050 } 1051 1052 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1053 } 1054 1055 static void 1056 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status)) 1057 { 1058 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1059 1060 assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1061 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1062 1063 /* Since the operations are appended during submission, they're in the opposite order than 1064 * how we want to execute them for reads (i.e. we need to execute the most recently added 1065 * operation first), so reverse the sequence before executing it. 
1066 */ 1067 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1068 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1069 } 1070 1071 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1072 bdev_io_increment_outstanding(ch, ch->shared_resource); 1073 bdev_io->internal.data_transfer_cpl = cb_fn; 1074 1075 spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, 1076 bdev_io_exec_sequence_cb, bdev_io); 1077 } 1078 1079 static void 1080 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1081 { 1082 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1083 void *buf; 1084 1085 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1086 buf = bdev_io->internal.buf; 1087 bdev_io->internal.buf = NULL; 1088 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1089 bdev_io->internal.get_aux_buf_cb = NULL; 1090 } else { 1091 assert(bdev_io->internal.get_buf_cb != NULL); 1092 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1093 bdev_io->internal.get_buf_cb = NULL; 1094 } 1095 } 1096 1097 static void 1098 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1099 { 1100 struct spdk_bdev_io *bdev_io = ctx; 1101 1102 if (rc) { 1103 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1104 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1105 } 1106 bdev_io_get_buf_complete(bdev_io, !rc); 1107 } 1108 1109 static void 1110 bdev_io_pull_md_buf_done(void *ctx, int status) 1111 { 1112 struct spdk_bdev_io *bdev_io = ctx; 1113 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1114 1115 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1116 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1117 1118 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1119 bdev_ch_retry_io(ch); 1120 } 1121 1122 assert(bdev_io->internal.data_transfer_cpl); 1123 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1124 } 1125 1126 static void 1127 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io) 1128 { 1129 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1130 int rc = 0; 1131 1132 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1133 if (bdev_io_use_memory_domain(bdev_io)) { 1134 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1135 bdev_io_increment_outstanding(ch, ch->shared_resource); 1136 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1137 bdev_io->internal.memory_domain_ctx, 1138 &bdev_io->internal.orig_md_iov, 1, 1139 &bdev_io->internal.bounce_md_iov, 1, 1140 bdev_io_pull_md_buf_done, bdev_io); 1141 if (rc == 0) { 1142 /* Continue to submit IO in completion callback */ 1143 return; 1144 } 1145 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1146 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1147 if (rc != -ENOMEM) { 1148 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1149 spdk_memory_domain_get_dma_device_id( 1150 bdev_io->internal.memory_domain), rc); 1151 } 1152 } else { 1153 memcpy(bdev_io->internal.bounce_md_iov.iov_base, 1154 bdev_io->internal.orig_md_iov.iov_base, 1155 bdev_io->internal.orig_md_iov.iov_len); 1156 } 1157 } 1158 1159 if (spdk_unlikely(rc == -ENOMEM)) { 1160 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD); 1161 } else { 1162 assert(bdev_io->internal.data_transfer_cpl); 1163 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1164 } 1165 } 1166 1167 static void 1168 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1169 { 1170 /* save 
original md_buf */ 1171 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1172 bdev_io->internal.orig_md_iov.iov_len = len; 1173 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 1174 bdev_io->internal.bounce_md_iov.iov_len = len; 1175 /* set bounce md_buf */ 1176 bdev_io->u.bdev.md_buf = md_buf; 1177 1178 bdev_io_pull_md_buf(bdev_io); 1179 } 1180 1181 static void 1182 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1183 { 1184 struct spdk_bdev *bdev = bdev_io->bdev; 1185 uint64_t md_len; 1186 void *buf; 1187 1188 if (spdk_bdev_is_md_separate(bdev)) { 1189 assert(!bdev_io_use_accel_sequence(bdev_io)); 1190 1191 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1192 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1193 1194 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1195 1196 if (bdev_io->u.bdev.md_buf != NULL) { 1197 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1198 return; 1199 } else { 1200 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1201 } 1202 } 1203 1204 bdev_io_get_buf_complete(bdev_io, true); 1205 } 1206 1207 static inline void 1208 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc) 1209 { 1210 if (rc) { 1211 SPDK_ERRLOG("Failed to get data buffer\n"); 1212 assert(bdev_io->internal.data_transfer_cpl); 1213 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1214 return; 1215 } 1216 1217 _bdev_io_set_md_buf(bdev_io); 1218 } 1219 1220 static void 1221 bdev_io_pull_data_done_and_track(void *ctx, int status) 1222 { 1223 struct spdk_bdev_io *bdev_io = ctx; 1224 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1225 1226 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1227 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1228 1229 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1230 bdev_ch_retry_io(ch); 1231 } 1232 1233 bdev_io_pull_data_done(bdev_io, status); 1234 } 1235 1236 static void 1237 bdev_io_pull_data(struct spdk_bdev_io *bdev_io) 1238 { 1239 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1240 int rc = 0; 1241 1242 /* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a 1243 * sequence, append a copy operation making accel change the src/dst buffers of the previous 1244 * operation */ 1245 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) || 1246 (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) { 1247 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1248 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1249 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1250 NULL, NULL, 1251 bdev_io->internal.orig_iovs, 1252 bdev_io->internal.orig_iovcnt, 1253 bdev_io->internal.memory_domain, 1254 bdev_io->internal.memory_domain_ctx, 1255 0, NULL, NULL); 1256 } else { 1257 /* We need to reverse the src/dst for reads */ 1258 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1259 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1260 bdev_io->internal.orig_iovs, 1261 bdev_io->internal.orig_iovcnt, 1262 bdev_io->internal.memory_domain, 1263 bdev_io->internal.memory_domain_ctx, 1264 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1265 NULL, NULL, 0, NULL, NULL); 1266 } 1267 1268 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 1269 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1270 bdev_io->internal.accel_sequence); 1271 } 1272 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1273 /* if this is write path, 
copy data from original buffer to bounce buffer */ 1274 if (bdev_io_use_memory_domain(bdev_io)) { 1275 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1276 bdev_io_increment_outstanding(ch, ch->shared_resource); 1277 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1278 bdev_io->internal.memory_domain_ctx, 1279 bdev_io->internal.orig_iovs, 1280 (uint32_t) bdev_io->internal.orig_iovcnt, 1281 bdev_io->u.bdev.iovs, 1, 1282 bdev_io_pull_data_done_and_track, 1283 bdev_io); 1284 if (rc == 0) { 1285 /* Continue to submit IO in completion callback */ 1286 return; 1287 } 1288 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1289 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1290 if (rc != -ENOMEM) { 1291 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1292 spdk_memory_domain_get_dma_device_id( 1293 bdev_io->internal.memory_domain)); 1294 } 1295 } else { 1296 assert(bdev_io->u.bdev.iovcnt == 1); 1297 spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base, 1298 bdev_io->u.bdev.iovs[0].iov_len, 1299 bdev_io->internal.orig_iovs, 1300 bdev_io->internal.orig_iovcnt); 1301 } 1302 } 1303 1304 if (spdk_unlikely(rc == -ENOMEM)) { 1305 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1306 } else { 1307 bdev_io_pull_data_done(bdev_io, rc); 1308 } 1309 } 1310 1311 static void 1312 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1313 bdev_copy_bounce_buffer_cpl cpl_cb) 1314 { 1315 struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource; 1316 1317 bdev_io->internal.data_transfer_cpl = cpl_cb; 1318 /* save original iovec */ 1319 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 1320 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1321 /* set bounce iov */ 1322 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 1323 bdev_io->u.bdev.iovcnt = 1; 1324 /* set bounce buffer for this operation */ 1325 bdev_io->u.bdev.iovs[0].iov_base = buf; 1326 bdev_io->u.bdev.iovs[0].iov_len = len; 1327 1328 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1329 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1330 } else { 1331 bdev_io_pull_data(bdev_io); 1332 } 1333 } 1334 1335 static void 1336 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1337 { 1338 struct spdk_bdev *bdev = bdev_io->bdev; 1339 bool buf_allocated; 1340 uint64_t alignment; 1341 void *aligned_buf; 1342 1343 bdev_io->internal.buf = buf; 1344 1345 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1346 bdev_io_get_buf_complete(bdev_io, true); 1347 return; 1348 } 1349 1350 alignment = spdk_bdev_get_buf_align(bdev); 1351 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1352 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1353 1354 if (buf_allocated) { 1355 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1356 /* Continue in completion callback */ 1357 return; 1358 } else { 1359 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1360 } 1361 1362 _bdev_io_set_md_buf(bdev_io); 1363 } 1364 1365 static inline uint64_t 1366 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1367 { 1368 struct spdk_bdev *bdev = bdev_io->bdev; 1369 uint64_t md_len, alignment; 1370 1371 md_len = spdk_bdev_is_md_separate(bdev) ? 
bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1372 1373 /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */ 1374 alignment = spdk_bdev_get_buf_align(bdev) - 1; 1375 1376 return len + alignment + md_len; 1377 } 1378 1379 static void 1380 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1381 { 1382 struct spdk_bdev_mgmt_channel *ch; 1383 1384 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1385 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1386 } 1387 1388 static void 1389 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1390 { 1391 assert(bdev_io->internal.buf != NULL); 1392 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1393 bdev_io->internal.buf = NULL; 1394 } 1395 1396 void 1397 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1398 { 1399 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1400 1401 assert(buf != NULL); 1402 _bdev_io_put_buf(bdev_io, buf, len); 1403 } 1404 1405 static inline void 1406 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, 1407 struct spdk_bdev_io *bdev_io) 1408 { 1409 /* After a request is submitted to a bdev module, the ownership of an accel sequence 1410 * associated with that bdev_io is transferred to the bdev module. So, clear the internal 1411 * sequence pointer to make sure we won't touch it anymore. */ 1412 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || 1413 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { 1414 assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1415 bdev_io->internal.accel_sequence = NULL; 1416 } 1417 1418 bdev->fn_table->submit_request(ioch, bdev_io); 1419 } 1420 1421 static inline void 1422 bdev_ch_resubmit_io(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 1423 { 1424 struct spdk_bdev *bdev = bdev_ch->bdev; 1425 1426 bdev_io_increment_outstanding(bdev_io->internal.ch, bdev_ch->shared_resource); 1427 bdev_io->internal.error.nvme.cdw0 = 0; 1428 bdev_io->num_retries++; 1429 bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1430 } 1431 1432 static void 1433 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1434 { 1435 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1436 struct spdk_bdev_io *bdev_io; 1437 1438 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1439 /* 1440 * Allow some more I/O to complete before retrying the nomem_io queue. 1441 * Some drivers (such as nvme) cannot immediately take a new I/O in 1442 * the context of a completion, because the resources for the I/O are 1443 * not released until control returns to the bdev poller. Also, we 1444 * may require several small I/O to complete before a larger I/O 1445 * (that requires splitting) can be submitted. 
1446 */ 1447 return; 1448 } 1449 1450 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1451 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1452 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1453 1454 switch (bdev_io->internal.retry_state) { 1455 case BDEV_IO_RETRY_STATE_SUBMIT: 1456 bdev_ch_resubmit_io(bdev_ch, bdev_io); 1457 break; 1458 case BDEV_IO_RETRY_STATE_PULL: 1459 bdev_io_pull_data(bdev_io); 1460 break; 1461 case BDEV_IO_RETRY_STATE_PULL_MD: 1462 bdev_io_pull_md_buf(bdev_io); 1463 break; 1464 case BDEV_IO_RETRY_STATE_PUSH: 1465 bdev_io_push_bounce_data(bdev_io); 1466 break; 1467 case BDEV_IO_RETRY_STATE_PUSH_MD: 1468 bdev_io_push_bounce_md_buf(bdev_io); 1469 break; 1470 default: 1471 assert(0 && "invalid retry state"); 1472 break; 1473 } 1474 1475 if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) { 1476 /* This IO completed again with NOMEM status, so break the loop and 1477 * don't try anymore. Note that a bdev_io that fails with NOMEM 1478 * always gets requeued at the front of the list, to maintain 1479 * ordering. 1480 */ 1481 break; 1482 } 1483 } 1484 } 1485 1486 static inline bool 1487 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 1488 { 1489 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1490 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1491 1492 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1493 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1494 bdev_queue_nomem_io_head(shared_resource, bdev_io, state); 1495 1496 /* If bdev module completed an I/O that has an accel sequence with NOMEM status, the 1497 * ownership of that sequence is transferred back to the bdev layer, so we need to 1498 * restore internal.accel_sequence to make sure that the sequence is handled 1499 * correctly in case the I/O is later aborted. */ 1500 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1501 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1502 assert(bdev_io->internal.accel_sequence == NULL); 1503 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1504 } 1505 1506 return true; 1507 } 1508 1509 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1510 bdev_ch_retry_io(bdev_ch); 1511 } 1512 1513 return false; 1514 } 1515 1516 static void 1517 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1518 { 1519 struct spdk_bdev_io *bdev_io = ctx; 1520 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1521 1522 if (rc) { 1523 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1524 } 1525 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1526 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 
1527 */ 1528 bdev_io_put_buf(bdev_io); 1529 1530 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1531 bdev_ch_retry_io(ch); 1532 } 1533 1534 /* Continue with IO completion flow */ 1535 bdev_io_complete(bdev_io); 1536 } 1537 1538 static void 1539 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1540 { 1541 struct spdk_bdev_io *bdev_io = ctx; 1542 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1543 1544 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1545 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1546 1547 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1548 bdev_ch_retry_io(ch); 1549 } 1550 1551 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1552 } 1553 1554 static inline void 1555 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1556 { 1557 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1558 int rc = 0; 1559 1560 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1561 /* do the same for metadata buffer */ 1562 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1563 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1564 1565 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1566 if (bdev_io_use_memory_domain(bdev_io)) { 1567 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1568 bdev_io_increment_outstanding(ch, ch->shared_resource); 1569 /* If memory domain is used then we need to call async push function */ 1570 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1571 bdev_io->internal.memory_domain_ctx, 1572 &bdev_io->internal.orig_md_iov, 1573 (uint32_t)bdev_io->internal.orig_iovcnt, 1574 &bdev_io->internal.bounce_md_iov, 1, 1575 bdev_io_push_bounce_md_buf_done, 1576 bdev_io); 1577 if (rc == 0) { 1578 /* Continue IO completion in async callback */ 1579 return; 1580 } 1581 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1582 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1583 if (rc != -ENOMEM) { 1584 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1585 spdk_memory_domain_get_dma_device_id( 1586 bdev_io->internal.memory_domain)); 1587 } 1588 } else { 1589 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1590 bdev_io->internal.orig_md_iov.iov_len); 1591 } 1592 } 1593 } 1594 1595 if (spdk_unlikely(rc == -ENOMEM)) { 1596 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1597 } else { 1598 assert(bdev_io->internal.data_transfer_cpl); 1599 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1600 } 1601 } 1602 1603 static inline void 1604 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc) 1605 { 1606 assert(bdev_io->internal.data_transfer_cpl); 1607 if (rc) { 1608 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1609 return; 1610 } 1611 1612 /* set original buffer for this io */ 1613 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1614 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1615 /* disable bouncing buffer for this io */ 1616 bdev_io->internal.orig_iovcnt = 0; 1617 bdev_io->internal.orig_iovs = NULL; 1618 1619 bdev_io_push_bounce_md_buf(bdev_io); 1620 } 1621 1622 static void 1623 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1624 { 1625 struct spdk_bdev_io *bdev_io = ctx; 1626 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1627 1628 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1629 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1630 1631 if 
(spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1632 bdev_ch_retry_io(ch); 1633 } 1634 1635 bdev_io_push_bounce_data_done(bdev_io, status); 1636 } 1637 1638 static inline void 1639 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1640 { 1641 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1642 int rc = 0; 1643 1644 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1645 assert(!bdev_io_use_accel_sequence(bdev_io)); 1646 1647 /* if this is read path, copy data from bounce buffer to original buffer */ 1648 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1649 if (bdev_io_use_memory_domain(bdev_io)) { 1650 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1651 bdev_io_increment_outstanding(ch, ch->shared_resource); 1652 /* If memory domain is used then we need to call async push function */ 1653 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1654 bdev_io->internal.memory_domain_ctx, 1655 bdev_io->internal.orig_iovs, 1656 (uint32_t)bdev_io->internal.orig_iovcnt, 1657 &bdev_io->internal.bounce_iov, 1, 1658 bdev_io_push_bounce_data_done_and_track, 1659 bdev_io); 1660 if (rc == 0) { 1661 /* Continue IO completion in async callback */ 1662 return; 1663 } 1664 1665 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1666 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1667 if (rc != -ENOMEM) { 1668 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1669 spdk_memory_domain_get_dma_device_id( 1670 bdev_io->internal.memory_domain)); 1671 } 1672 } else { 1673 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1674 bdev_io->internal.orig_iovcnt, 1675 bdev_io->internal.bounce_iov.iov_base, 1676 bdev_io->internal.bounce_iov.iov_len); 1677 } 1678 } 1679 1680 if (spdk_unlikely(rc == -ENOMEM)) { 1681 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1682 } else { 1683 bdev_io_push_bounce_data_done(bdev_io, rc); 1684 } 1685 } 1686 1687 static inline void 1688 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1689 { 1690 bdev_io->internal.data_transfer_cpl = cpl_cb; 1691 bdev_io_push_bounce_data(bdev_io); 1692 } 1693 1694 static void 1695 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1696 { 1697 struct spdk_bdev_io *bdev_io; 1698 1699 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1700 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1701 } 1702 1703 static void 1704 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1705 { 1706 struct spdk_bdev_mgmt_channel *mgmt_ch; 1707 uint64_t max_len; 1708 void *buf; 1709 1710 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1711 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1712 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1713 1714 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1715 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1716 bdev_io_get_buf_complete(bdev_io, false); 1717 return; 1718 } 1719 1720 bdev_io->internal.buf_len = len; 1721 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1722 bdev_io_get_iobuf_cb); 1723 if (buf != NULL) { 1724 _bdev_io_set_buf(bdev_io, buf, len); 1725 } 1726 } 1727 1728 void 1729 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1730 { 1731 struct spdk_bdev *bdev = bdev_io->bdev; 1732 uint64_t alignment; 1733 1734 assert(cb != NULL); 1735 bdev_io->internal.get_buf_cb 
= cb; 1736 1737 alignment = spdk_bdev_get_buf_align(bdev); 1738 1739 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1740 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1741 /* Buffer already present and aligned */ 1742 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1743 return; 1744 } 1745 1746 bdev_io_get_buf(bdev_io, len); 1747 } 1748 1749 static void 1750 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1751 bool success) 1752 { 1753 if (!success) { 1754 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1755 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1756 bdev_io_complete_unsubmitted(bdev_io); 1757 return; 1758 } 1759 1760 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1761 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1762 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1763 return; 1764 } 1765 /* For reads we'll execute the sequence after the data is read, so, for now, only 1766 * clear out accel_sequence pointer and submit the IO */ 1767 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1768 bdev_io->u.bdev.accel_sequence = NULL; 1769 } 1770 1771 bdev_io_submit(bdev_io); 1772 } 1773 1774 static void 1775 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1776 uint64_t len) 1777 { 1778 assert(cb != NULL); 1779 bdev_io->internal.get_buf_cb = cb; 1780 1781 bdev_io_get_buf(bdev_io, len); 1782 } 1783 1784 void 1785 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1786 { 1787 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1788 1789 assert(cb != NULL); 1790 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1791 bdev_io->internal.get_aux_buf_cb = cb; 1792 bdev_io_get_buf(bdev_io, len); 1793 } 1794 1795 static int 1796 bdev_module_get_max_ctx_size(void) 1797 { 1798 struct spdk_bdev_module *bdev_module; 1799 int max_bdev_module_size = 0; 1800 1801 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1802 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1803 max_bdev_module_size = bdev_module->get_ctx_size(); 1804 } 1805 } 1806 1807 return max_bdev_module_size; 1808 } 1809 1810 static void 1811 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1812 { 1813 int i; 1814 struct spdk_bdev_qos *qos = bdev->internal.qos; 1815 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1816 1817 if (!qos) { 1818 return; 1819 } 1820 1821 spdk_bdev_get_qos_rate_limits(bdev, limits); 1822 1823 spdk_json_write_object_begin(w); 1824 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1825 1826 spdk_json_write_named_object_begin(w, "params"); 1827 spdk_json_write_named_string(w, "name", bdev->name); 1828 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1829 if (limits[i] > 0) { 1830 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1831 } 1832 } 1833 spdk_json_write_object_end(w); 1834 1835 spdk_json_write_object_end(w); 1836 } 1837 1838 void 1839 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1840 { 1841 struct spdk_bdev_module *bdev_module; 1842 struct spdk_bdev *bdev; 1843 1844 assert(w != NULL); 1845 1846 spdk_json_write_array_begin(w); 1847 1848 spdk_json_write_object_begin(w); 1849 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1850 spdk_json_write_named_object_begin(w, "params"); 1851 spdk_json_write_named_uint32(w, 
"bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1852 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1853 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1854 spdk_json_write_object_end(w); 1855 spdk_json_write_object_end(w); 1856 1857 bdev_examine_allowlist_config_json(w); 1858 1859 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1860 if (bdev_module->config_json) { 1861 bdev_module->config_json(w); 1862 } 1863 } 1864 1865 spdk_spin_lock(&g_bdev_mgr.spinlock); 1866 1867 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1868 if (bdev->fn_table->write_config_json) { 1869 bdev->fn_table->write_config_json(bdev, w); 1870 } 1871 1872 bdev_qos_config_json(bdev, w); 1873 } 1874 1875 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1876 1877 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1878 spdk_json_write_object_begin(w); 1879 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1880 spdk_json_write_object_end(w); 1881 1882 spdk_json_write_array_end(w); 1883 } 1884 1885 static void 1886 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1887 { 1888 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1889 struct spdk_bdev_io *bdev_io; 1890 1891 spdk_iobuf_channel_fini(&ch->iobuf); 1892 1893 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1894 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1895 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1896 ch->per_thread_cache_count--; 1897 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1898 } 1899 1900 assert(ch->per_thread_cache_count == 0); 1901 } 1902 1903 static int 1904 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1905 { 1906 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1907 struct spdk_bdev_io *bdev_io; 1908 uint32_t i; 1909 int rc; 1910 1911 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE); 1912 if (rc != 0) { 1913 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1914 return -1; 1915 } 1916 1917 STAILQ_INIT(&ch->per_thread_cache); 1918 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1919 1920 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1921 ch->per_thread_cache_count = 0; 1922 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1923 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1924 if (bdev_io == NULL) { 1925 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1926 assert(false); 1927 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1928 return -1; 1929 } 1930 ch->per_thread_cache_count++; 1931 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1932 } 1933 1934 TAILQ_INIT(&ch->shared_resources); 1935 TAILQ_INIT(&ch->io_wait_queue); 1936 1937 return 0; 1938 } 1939 1940 static void 1941 bdev_init_complete(int rc) 1942 { 1943 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1944 void *cb_arg = g_init_cb_arg; 1945 struct spdk_bdev_module *m; 1946 1947 g_bdev_mgr.init_complete = true; 1948 g_init_cb_fn = NULL; 1949 g_init_cb_arg = NULL; 1950 1951 /* 1952 * For modules that need to know when subsystem init is complete, 1953 * inform them now. 
1954 */ 1955 if (rc == 0) { 1956 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1957 if (m->init_complete) { 1958 m->init_complete(); 1959 } 1960 } 1961 } 1962 1963 cb_fn(cb_arg, rc); 1964 } 1965 1966 static bool 1967 bdev_module_all_actions_completed(void) 1968 { 1969 struct spdk_bdev_module *m; 1970 1971 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1972 if (m->internal.action_in_progress > 0) { 1973 return false; 1974 } 1975 } 1976 return true; 1977 } 1978 1979 static void 1980 bdev_module_action_complete(void) 1981 { 1982 /* 1983 * Don't finish bdev subsystem initialization if 1984 * module pre-initialization is still in progress, or 1985 * the subsystem has already been initialized. 1986 */ 1987 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 1988 return; 1989 } 1990 1991 /* 1992 * Check all bdev modules for inits/examinations in progress. If any 1993 * exist, return immediately since we cannot finish bdev subsystem 1994 * initialization until all are completed. 1995 */ 1996 if (!bdev_module_all_actions_completed()) { 1997 return; 1998 } 1999 2000 /* 2001 * Modules already finished initialization - now that all 2002 * the bdev modules have finished their asynchronous I/O 2003 * processing, the entire bdev layer can be marked as complete. 2004 */ 2005 bdev_init_complete(0); 2006 } 2007 2008 static void 2009 bdev_module_action_done(struct spdk_bdev_module *module) 2010 { 2011 spdk_spin_lock(&module->internal.spinlock); 2012 assert(module->internal.action_in_progress > 0); 2013 module->internal.action_in_progress--; 2014 spdk_spin_unlock(&module->internal.spinlock); 2015 bdev_module_action_complete(); 2016 } 2017 2018 void 2019 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2020 { 2021 assert(module->async_init); 2022 bdev_module_action_done(module); 2023 } 2024 2025 void 2026 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2027 { 2028 bdev_module_action_done(module); 2029 } 2030 2031 /** The last initialized bdev module */ 2032 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2033 2034 static void 2035 bdev_init_failed(void *cb_arg) 2036 { 2037 struct spdk_bdev_module *module = cb_arg; 2038 2039 spdk_spin_lock(&module->internal.spinlock); 2040 assert(module->internal.action_in_progress > 0); 2041 module->internal.action_in_progress--; 2042 spdk_spin_unlock(&module->internal.spinlock); 2043 bdev_init_complete(-1); 2044 } 2045 2046 static int 2047 bdev_modules_init(void) 2048 { 2049 struct spdk_bdev_module *module; 2050 int rc = 0; 2051 2052 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2053 g_resume_bdev_module = module; 2054 if (module->async_init) { 2055 spdk_spin_lock(&module->internal.spinlock); 2056 module->internal.action_in_progress = 1; 2057 spdk_spin_unlock(&module->internal.spinlock); 2058 } 2059 rc = module->module_init(); 2060 if (rc != 0) { 2061 /* Bump action_in_progress to prevent other modules from completing modules_init. 2062 * Send a message to defer application shutdown until resources are cleaned up. */ 2063 spdk_spin_lock(&module->internal.spinlock); 2064 module->internal.action_in_progress = 1; 2065 spdk_spin_unlock(&module->internal.spinlock); 2066 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2067 return rc; 2068 } 2069 } 2070 2071 g_resume_bdev_module = NULL; 2072 return 0; 2073 } 2074 2075 void 2076 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2077 { 2078 int rc = 0; 2079 char mempool_name[32]; 2080 2081 assert(cb_fn !=
NULL); 2082 2083 g_init_cb_fn = cb_fn; 2084 g_init_cb_arg = cb_arg; 2085 2086 spdk_notify_type_register("bdev_register"); 2087 spdk_notify_type_register("bdev_unregister"); 2088 2089 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2090 2091 rc = spdk_iobuf_register_module("bdev"); 2092 if (rc != 0) { 2093 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2094 bdev_init_complete(-1); 2095 return; 2096 } 2097 2098 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2099 g_bdev_opts.bdev_io_pool_size, 2100 sizeof(struct spdk_bdev_io) + 2101 bdev_module_get_max_ctx_size(), 2102 0, 2103 SPDK_ENV_SOCKET_ID_ANY); 2104 2105 if (g_bdev_mgr.bdev_io_pool == NULL) { 2106 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2107 bdev_init_complete(-1); 2108 return; 2109 } 2110 2111 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2112 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2113 if (!g_bdev_mgr.zero_buffer) { 2114 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2115 bdev_init_complete(-1); 2116 return; 2117 } 2118 2119 #ifdef SPDK_CONFIG_VTUNE 2120 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2121 #endif 2122 2123 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2124 bdev_mgmt_channel_destroy, 2125 sizeof(struct spdk_bdev_mgmt_channel), 2126 "bdev_mgr"); 2127 2128 rc = bdev_modules_init(); 2129 g_bdev_mgr.module_init_complete = true; 2130 if (rc != 0) { 2131 SPDK_ERRLOG("bdev modules init failed\n"); 2132 return; 2133 } 2134 2135 bdev_module_action_complete(); 2136 } 2137 2138 static void 2139 bdev_mgr_unregister_cb(void *io_device) 2140 { 2141 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2142 2143 if (g_bdev_mgr.bdev_io_pool) { 2144 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2145 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2146 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2147 g_bdev_opts.bdev_io_pool_size); 2148 } 2149 2150 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2151 } 2152 2153 spdk_free(g_bdev_mgr.zero_buffer); 2154 2155 bdev_examine_allowlist_free(); 2156 2157 cb_fn(g_fini_cb_arg); 2158 g_fini_cb_fn = NULL; 2159 g_fini_cb_arg = NULL; 2160 g_bdev_mgr.init_complete = false; 2161 g_bdev_mgr.module_init_complete = false; 2162 } 2163 2164 static void 2165 bdev_module_fini_iter(void *arg) 2166 { 2167 struct spdk_bdev_module *bdev_module; 2168 2169 /* FIXME: Handling initialization failures is broken now, 2170 * so we won't even try cleaning up after successfully 2171 * initialized modules. if module_init_complete is false, 2172 * just call spdk_bdev_mgr_unregister_cb 2173 */ 2174 if (!g_bdev_mgr.module_init_complete) { 2175 bdev_mgr_unregister_cb(NULL); 2176 return; 2177 } 2178 2179 /* Start iterating from the last touched module */ 2180 if (!g_resume_bdev_module) { 2181 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2182 } else { 2183 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2184 internal.tailq); 2185 } 2186 2187 while (bdev_module) { 2188 if (bdev_module->async_fini) { 2189 /* Save our place so we can resume later. We must 2190 * save the variable here, before calling module_fini() 2191 * below, because in some cases the module may immediately 2192 * call spdk_bdev_module_fini_done() and re-enter 2193 * this function to continue iterating. 
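 *
 * Illustrative sketch of that contract, assuming a hypothetical "example"
 * module whose teardown helper is asynchronous: the module sets
 * .async_fini = true and calls spdk_bdev_module_fini_done() when its
 * teardown completes - possibly synchronously from within .module_fini,
 * which is exactly why g_resume_bdev_module is saved first.
 *
 *   static void example_teardown_done(void *ctx) {
 *       spdk_bdev_module_fini_done();
 *   }
 *   static void example_fini(void) {
 *       example_start_teardown(example_teardown_done, NULL);
 *   }
 *   static struct spdk_bdev_module example_if = {
 *       .name = "example",
 *       .module_fini = example_fini,
 *       .async_fini = true,
 *   };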
*/ 2194 g_resume_bdev_module = bdev_module; 2195 } 2196 2197 if (bdev_module->module_fini) { 2198 bdev_module->module_fini(); 2199 } 2200 2201 if (bdev_module->async_fini) { 2202 return; 2203 } 2204 2205 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2206 internal.tailq); 2207 } 2208 2209 g_resume_bdev_module = NULL; 2210 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2211 } 2212 2213 void 2214 spdk_bdev_module_fini_done(void) 2215 { 2216 if (spdk_get_thread() != g_fini_thread) { 2217 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2218 } else { 2219 bdev_module_fini_iter(NULL); 2220 } 2221 } 2222 2223 static void 2224 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2225 { 2226 struct spdk_bdev *bdev = cb_arg; 2227 2228 if (bdeverrno && bdev) { 2229 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2230 bdev->name); 2231 2232 /* 2233 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2234 * bdev; try to continue by manually removing this bdev from the list and continue 2235 * with the next bdev in the list. 2236 */ 2237 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2238 } 2239 2240 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2241 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2242 /* 2243 * Bdev module finish need to be deferred as we might be in the middle of some context 2244 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2245 * after returning. 2246 */ 2247 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2248 return; 2249 } 2250 2251 /* 2252 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2253 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2254 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2255 * base bdevs. 2256 * 2257 * Also, walk the list in the reverse order. 2258 */ 2259 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2260 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2261 spdk_spin_lock(&bdev->internal.spinlock); 2262 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2263 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2264 spdk_spin_unlock(&bdev->internal.spinlock); 2265 continue; 2266 } 2267 spdk_spin_unlock(&bdev->internal.spinlock); 2268 2269 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2270 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2271 return; 2272 } 2273 2274 /* 2275 * If any bdev fails to unclaim underlying bdev properly, we may face the 2276 * case of bdev list consisting of claimed bdevs only (if claims are managed 2277 * correctly, this would mean there's a loop in the claims graph which is 2278 * clearly impossible). Warn and unregister last bdev on the list then. 
2279 */ 2280 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2281 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2282 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2283 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2284 return; 2285 } 2286 } 2287 2288 static void 2289 bdev_module_fini_start_iter(void *arg) 2290 { 2291 struct spdk_bdev_module *bdev_module; 2292 2293 if (!g_resume_bdev_module) { 2294 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2295 } else { 2296 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2297 } 2298 2299 while (bdev_module) { 2300 if (bdev_module->async_fini_start) { 2301 /* Save our place so we can resume later. We must 2302 * save the variable here, before calling fini_start() 2303 * below, because in some cases the module may immediately 2304 * call spdk_bdev_module_fini_start_done() and re-enter 2305 * this function to continue iterating. */ 2306 g_resume_bdev_module = bdev_module; 2307 } 2308 2309 if (bdev_module->fini_start) { 2310 bdev_module->fini_start(); 2311 } 2312 2313 if (bdev_module->async_fini_start) { 2314 return; 2315 } 2316 2317 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2318 } 2319 2320 g_resume_bdev_module = NULL; 2321 2322 bdev_finish_unregister_bdevs_iter(NULL, 0); 2323 } 2324 2325 void 2326 spdk_bdev_module_fini_start_done(void) 2327 { 2328 if (spdk_get_thread() != g_fini_thread) { 2329 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2330 } else { 2331 bdev_module_fini_start_iter(NULL); 2332 } 2333 } 2334 2335 static void 2336 bdev_finish_wait_for_examine_done(void *cb_arg) 2337 { 2338 bdev_module_fini_start_iter(NULL); 2339 } 2340 2341 static void bdev_open_async_fini(void); 2342 2343 void 2344 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2345 { 2346 int rc; 2347 2348 assert(cb_fn != NULL); 2349 2350 g_fini_thread = spdk_get_thread(); 2351 2352 g_fini_cb_fn = cb_fn; 2353 g_fini_cb_arg = cb_arg; 2354 2355 bdev_open_async_fini(); 2356 2357 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2358 if (rc != 0) { 2359 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2360 bdev_finish_wait_for_examine_done(NULL); 2361 } 2362 } 2363 2364 struct spdk_bdev_io * 2365 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2366 { 2367 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2368 struct spdk_bdev_io *bdev_io; 2369 2370 if (ch->per_thread_cache_count > 0) { 2371 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2372 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2373 ch->per_thread_cache_count--; 2374 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2375 /* 2376 * Don't try to look for bdev_ios in the global pool if there are 2377 * waiters on bdev_ios - we don't want this caller to jump the line. 
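 *
 * A caller that hits this path (typically surfaced as -ENOMEM from a submit
 * call) is expected to park itself on the wait queue and retry. Minimal
 * sketch, assuming a hypothetical caller context "my_ctx" with an embedded
 * spdk_bdev_io_wait_entry and a my_submit_read() helper:
 *
 *   rc = spdk_bdev_read_blocks(desc, io_ch, buf, offset, num_blocks, cb, ctx);
 *   if (rc == -ENOMEM) {
 *       ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(desc);
 *       ctx->wait_entry.cb_fn = my_submit_read;
 *       ctx->wait_entry.cb_arg = ctx;
 *       spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, io_ch, &ctx->wait_entry);
 *   }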
2378 */ 2379 bdev_io = NULL; 2380 } else { 2381 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2382 } 2383 2384 return bdev_io; 2385 } 2386 2387 void 2388 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2389 { 2390 struct spdk_bdev_mgmt_channel *ch; 2391 2392 assert(bdev_io != NULL); 2393 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2394 2395 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2396 2397 if (bdev_io->internal.buf != NULL) { 2398 bdev_io_put_buf(bdev_io); 2399 } 2400 2401 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2402 ch->per_thread_cache_count++; 2403 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2404 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2405 struct spdk_bdev_io_wait_entry *entry; 2406 2407 entry = TAILQ_FIRST(&ch->io_wait_queue); 2408 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2409 entry->cb_fn(entry->cb_arg); 2410 } 2411 } else { 2412 /* We should never have a full cache with entries on the io wait queue. */ 2413 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2414 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2415 } 2416 } 2417 2418 static bool 2419 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2420 { 2421 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2422 2423 switch (limit) { 2424 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2425 return true; 2426 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2427 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2428 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2429 return false; 2430 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2431 default: 2432 return false; 2433 } 2434 } 2435 2436 static bool 2437 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2438 { 2439 switch (bdev_io->type) { 2440 case SPDK_BDEV_IO_TYPE_NVME_IO: 2441 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2442 case SPDK_BDEV_IO_TYPE_READ: 2443 case SPDK_BDEV_IO_TYPE_WRITE: 2444 return true; 2445 case SPDK_BDEV_IO_TYPE_ZCOPY: 2446 if (bdev_io->u.bdev.zcopy.start) { 2447 return true; 2448 } else { 2449 return false; 2450 } 2451 default: 2452 return false; 2453 } 2454 } 2455 2456 static bool 2457 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2458 { 2459 switch (bdev_io->type) { 2460 case SPDK_BDEV_IO_TYPE_NVME_IO: 2461 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2462 /* Bit 1 (0x2) set for read operation */ 2463 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2464 return true; 2465 } else { 2466 return false; 2467 } 2468 case SPDK_BDEV_IO_TYPE_READ: 2469 return true; 2470 case SPDK_BDEV_IO_TYPE_ZCOPY: 2471 /* Populate to read from disk */ 2472 if (bdev_io->u.bdev.zcopy.populate) { 2473 return true; 2474 } else { 2475 return false; 2476 } 2477 default: 2478 return false; 2479 } 2480 } 2481 2482 static uint64_t 2483 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2484 { 2485 struct spdk_bdev *bdev = bdev_io->bdev; 2486 2487 switch (bdev_io->type) { 2488 case SPDK_BDEV_IO_TYPE_NVME_IO: 2489 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2490 return bdev_io->u.nvme_passthru.nbytes; 2491 case SPDK_BDEV_IO_TYPE_READ: 2492 case SPDK_BDEV_IO_TYPE_WRITE: 2493 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2494 case SPDK_BDEV_IO_TYPE_ZCOPY: 2495 /* Track the data in the start phase only */ 2496 if (bdev_io->u.bdev.zcopy.start) { 2497 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2498 } else { 2499 return 0; 2500 } 2501 default: 2502 return 0; 2503 } 2504 } 2505 2506 static bool 2507 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 
2508 { 2509 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2510 return true; 2511 } else { 2512 return false; 2513 } 2514 } 2515 2516 static bool 2517 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2518 { 2519 if (bdev_is_read_io(io) == false) { 2520 return false; 2521 } 2522 2523 return bdev_qos_rw_queue_io(limit, io); 2524 } 2525 2526 static bool 2527 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2528 { 2529 if (bdev_is_read_io(io) == true) { 2530 return false; 2531 } 2532 2533 return bdev_qos_rw_queue_io(limit, io); 2534 } 2535 2536 static void 2537 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2538 { 2539 limit->remaining_this_timeslice--; 2540 } 2541 2542 static void 2543 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2544 { 2545 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2546 } 2547 2548 static void 2549 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2550 { 2551 if (bdev_is_read_io(io) == false) { 2552 return; 2553 } 2554 2555 return bdev_qos_rw_bps_update_quota(limit, io); 2556 } 2557 2558 static void 2559 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2560 { 2561 if (bdev_is_read_io(io) == true) { 2562 return; 2563 } 2564 2565 return bdev_qos_rw_bps_update_quota(limit, io); 2566 } 2567 2568 static void 2569 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2570 { 2571 int i; 2572 2573 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2574 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2575 qos->rate_limits[i].queue_io = NULL; 2576 qos->rate_limits[i].update_quota = NULL; 2577 continue; 2578 } 2579 2580 switch (i) { 2581 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2582 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2583 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2584 break; 2585 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2586 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2587 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2588 break; 2589 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2590 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2591 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2592 break; 2593 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2594 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2595 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2596 break; 2597 default: 2598 break; 2599 } 2600 } 2601 } 2602 2603 static void 2604 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2605 struct spdk_bdev_io *bdev_io, 2606 enum spdk_bdev_io_status status) 2607 { 2608 bdev_io->internal.in_submit_request = true; 2609 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2610 spdk_bdev_io_complete(bdev_io, status); 2611 bdev_io->internal.in_submit_request = false; 2612 } 2613 2614 static inline void 2615 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2616 { 2617 struct spdk_bdev *bdev = bdev_io->bdev; 2618 struct spdk_io_channel *ch = bdev_ch->channel; 2619 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2620 2621 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2622 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2623 struct spdk_bdev_io *bio_to_abort = 
bdev_io->u.abort.bio_to_abort; 2624 2625 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2626 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2627 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2628 SPDK_BDEV_IO_STATUS_SUCCESS); 2629 return; 2630 } 2631 } 2632 2633 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2634 bdev_io->bdev->split_on_write_unit && 2635 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2636 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2637 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2638 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2639 return; 2640 } 2641 2642 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2643 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2644 bdev_io->internal.in_submit_request = true; 2645 bdev_submit_request(bdev, ch, bdev_io); 2646 bdev_io->internal.in_submit_request = false; 2647 } else { 2648 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2649 } 2650 } 2651 2652 static bool 2653 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2654 { 2655 int i; 2656 2657 if (bdev_qos_io_to_limit(bdev_io) == true) { 2658 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2659 if (!qos->rate_limits[i].queue_io) { 2660 continue; 2661 } 2662 2663 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2664 bdev_io) == true) { 2665 return true; 2666 } 2667 } 2668 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2669 if (!qos->rate_limits[i].update_quota) { 2670 continue; 2671 } 2672 2673 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2674 } 2675 } 2676 2677 return false; 2678 } 2679 2680 static inline void 2681 _bdev_io_do_submit(void *ctx) 2682 { 2683 struct spdk_bdev_io *bdev_io = ctx; 2684 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2685 2686 bdev_io_do_submit(ch, bdev_io); 2687 } 2688 2689 static int 2690 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2691 { 2692 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2693 int submitted_ios = 0; 2694 2695 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2696 if (!bdev_qos_queue_io(qos, bdev_io)) { 2697 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2698 2699 if (bdev_io->internal.io_submit_ch) { 2700 /* Send back the IO to the original thread for the actual processing. 
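 *
 * (QoS-managed IOs are funneled to the QoS thread by bdev_io_submit(); the
 * submitting channel was stashed in internal.io_submit_ch so that, once the
 * rate limiter admits the IO, it is dispatched back on the channel it was
 * originally submitted on.)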
*/ 2701 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2702 bdev_io->internal.io_submit_ch = NULL; 2703 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2704 _bdev_io_do_submit, bdev_io); 2705 } else { 2706 bdev_io_do_submit(ch, bdev_io); 2707 } 2708 2709 submitted_ios++; 2710 } 2711 } 2712 2713 return submitted_ios; 2714 } 2715 2716 static void 2717 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2718 { 2719 int rc; 2720 2721 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2722 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2723 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2724 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2725 &bdev_io->internal.waitq_entry); 2726 if (rc != 0) { 2727 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2728 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2729 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2730 } 2731 } 2732 2733 static bool 2734 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2735 { 2736 uint32_t io_boundary; 2737 struct spdk_bdev *bdev = bdev_io->bdev; 2738 uint32_t max_size = bdev->max_segment_size; 2739 int max_segs = bdev->max_num_segments; 2740 2741 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2742 io_boundary = bdev->write_unit_size; 2743 } else if (bdev->split_on_optimal_io_boundary) { 2744 io_boundary = bdev->optimal_io_boundary; 2745 } else { 2746 io_boundary = 0; 2747 } 2748 2749 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2750 return false; 2751 } 2752 2753 if (io_boundary) { 2754 uint64_t start_stripe, end_stripe; 2755 2756 start_stripe = bdev_io->u.bdev.offset_blocks; 2757 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2758 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
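 *
 * Worked example with illustrative numbers: for io_boundary = 8 blocks,
 * offset_blocks = 13 and num_blocks = 7, start_stripe = 13 >> 3 = 1 and
 * end_stripe = (13 + 7 - 1) >> 3 = 2, so the request crosses a stripe
 * boundary and must be split.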
*/ 2759 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2760 start_stripe >>= spdk_u32log2(io_boundary); 2761 end_stripe >>= spdk_u32log2(io_boundary); 2762 } else { 2763 start_stripe /= io_boundary; 2764 end_stripe /= io_boundary; 2765 } 2766 2767 if (start_stripe != end_stripe) { 2768 return true; 2769 } 2770 } 2771 2772 if (max_segs) { 2773 if (bdev_io->u.bdev.iovcnt > max_segs) { 2774 return true; 2775 } 2776 } 2777 2778 if (max_size) { 2779 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2780 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2781 return true; 2782 } 2783 } 2784 } 2785 2786 return false; 2787 } 2788 2789 static bool 2790 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2791 { 2792 uint32_t num_unmap_segments; 2793 2794 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2795 return false; 2796 } 2797 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2798 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2799 return true; 2800 } 2801 2802 return false; 2803 } 2804 2805 static bool 2806 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2807 { 2808 if (!bdev_io->bdev->max_write_zeroes) { 2809 return false; 2810 } 2811 2812 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2813 return true; 2814 } 2815 2816 return false; 2817 } 2818 2819 static bool 2820 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2821 { 2822 if (bdev_io->bdev->max_copy != 0 && 2823 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2824 return true; 2825 } 2826 2827 return false; 2828 } 2829 2830 static bool 2831 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2832 { 2833 switch (bdev_io->type) { 2834 case SPDK_BDEV_IO_TYPE_READ: 2835 case SPDK_BDEV_IO_TYPE_WRITE: 2836 return bdev_rw_should_split(bdev_io); 2837 case SPDK_BDEV_IO_TYPE_UNMAP: 2838 return bdev_unmap_should_split(bdev_io); 2839 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2840 return bdev_write_zeroes_should_split(bdev_io); 2841 case SPDK_BDEV_IO_TYPE_COPY: 2842 return bdev_copy_should_split(bdev_io); 2843 default: 2844 return false; 2845 } 2846 } 2847 2848 static uint32_t 2849 _to_next_boundary(uint64_t offset, uint32_t boundary) 2850 { 2851 return (boundary - (offset % boundary)); 2852 } 2853 2854 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2855 2856 static void _bdev_rw_split(void *_bdev_io); 2857 2858 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2859 2860 static void 2861 _bdev_unmap_split(void *_bdev_io) 2862 { 2863 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2864 } 2865 2866 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2867 2868 static void 2869 _bdev_write_zeroes_split(void *_bdev_io) 2870 { 2871 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2872 } 2873 2874 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2875 2876 static void 2877 _bdev_copy_split(void *_bdev_io) 2878 { 2879 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2880 } 2881 2882 static int 2883 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2884 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2885 { 2886 int rc; 2887 uint64_t current_offset, current_remaining, current_src_offset; 2888 spdk_bdev_io_wait_cb io_wait_fn; 2889 2890 current_offset = *offset; 2891 current_remaining = *remaining; 2892 2893 bdev_io->u.bdev.split_outstanding++; 2894 2895 io_wait_fn = 
_bdev_rw_split; 2896 switch (bdev_io->type) { 2897 case SPDK_BDEV_IO_TYPE_READ: 2898 assert(bdev_io->u.bdev.accel_sequence == NULL); 2899 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2900 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2901 iov, iovcnt, md_buf, current_offset, 2902 num_blocks, bdev_io->internal.memory_domain, 2903 bdev_io->internal.memory_domain_ctx, NULL, 2904 bdev_io_split_done, bdev_io); 2905 break; 2906 case SPDK_BDEV_IO_TYPE_WRITE: 2907 assert(bdev_io->u.bdev.accel_sequence == NULL); 2908 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2909 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2910 iov, iovcnt, md_buf, current_offset, 2911 num_blocks, bdev_io->internal.memory_domain, 2912 bdev_io->internal.memory_domain_ctx, NULL, 2913 bdev_io_split_done, bdev_io); 2914 break; 2915 case SPDK_BDEV_IO_TYPE_UNMAP: 2916 io_wait_fn = _bdev_unmap_split; 2917 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2918 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2919 current_offset, num_blocks, 2920 bdev_io_split_done, bdev_io); 2921 break; 2922 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2923 io_wait_fn = _bdev_write_zeroes_split; 2924 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2925 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2926 current_offset, num_blocks, 2927 bdev_io_split_done, bdev_io); 2928 break; 2929 case SPDK_BDEV_IO_TYPE_COPY: 2930 io_wait_fn = _bdev_copy_split; 2931 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2932 (current_offset - bdev_io->u.bdev.offset_blocks); 2933 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2934 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2935 current_offset, current_src_offset, num_blocks, 2936 bdev_io_split_done, bdev_io); 2937 break; 2938 default: 2939 assert(false); 2940 rc = -EINVAL; 2941 break; 2942 } 2943 2944 if (rc == 0) { 2945 current_offset += num_blocks; 2946 current_remaining -= num_blocks; 2947 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2948 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2949 *offset = current_offset; 2950 *remaining = current_remaining; 2951 } else { 2952 bdev_io->u.bdev.split_outstanding--; 2953 if (rc == -ENOMEM) { 2954 if (bdev_io->u.bdev.split_outstanding == 0) { 2955 /* No I/O is outstanding. Hence we should wait here. */ 2956 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2957 } 2958 } else { 2959 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2960 if (bdev_io->u.bdev.split_outstanding == 0) { 2961 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2962 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2963 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2964 } 2965 } 2966 } 2967 2968 return rc; 2969 } 2970 2971 static void 2972 _bdev_rw_split(void *_bdev_io) 2973 { 2974 struct iovec *parent_iov, *iov; 2975 struct spdk_bdev_io *bdev_io = _bdev_io; 2976 struct spdk_bdev *bdev = bdev_io->bdev; 2977 uint64_t parent_offset, current_offset, remaining; 2978 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2979 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2980 uint32_t iovcnt, iov_len, child_iovsize; 2981 uint32_t blocklen = bdev->blocklen; 2982 uint32_t io_boundary; 2983 uint32_t max_segment_size = bdev->max_segment_size; 2984 uint32_t max_child_iovcnt = bdev->max_num_segments; 2985 void *md_buf = NULL; 2986 int rc; 2987 2988 max_segment_size = max_segment_size ? 
max_segment_size : UINT32_MAX; 2989 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2990 SPDK_BDEV_IO_NUM_CHILD_IOV; 2991 2992 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2993 io_boundary = bdev->write_unit_size; 2994 } else if (bdev->split_on_optimal_io_boundary) { 2995 io_boundary = bdev->optimal_io_boundary; 2996 } else { 2997 io_boundary = UINT32_MAX; 2998 } 2999 3000 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3001 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 3002 parent_offset = bdev_io->u.bdev.offset_blocks; 3003 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3004 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3005 3006 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3007 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3008 if (parent_iov_offset < parent_iov->iov_len) { 3009 break; 3010 } 3011 parent_iov_offset -= parent_iov->iov_len; 3012 } 3013 3014 child_iovcnt = 0; 3015 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3016 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3017 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3018 to_next_boundary = spdk_min(remaining, to_next_boundary); 3019 to_next_boundary_bytes = to_next_boundary * blocklen; 3020 3021 iov = &bdev_io->child_iov[child_iovcnt]; 3022 iovcnt = 0; 3023 3024 if (bdev_io->u.bdev.md_buf) { 3025 md_buf = (char *)bdev_io->u.bdev.md_buf + 3026 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3027 } 3028 3029 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3030 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3031 iovcnt < child_iovsize) { 3032 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3033 iov_len = parent_iov->iov_len - parent_iov_offset; 3034 3035 iov_len = spdk_min(iov_len, max_segment_size); 3036 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3037 to_next_boundary_bytes -= iov_len; 3038 3039 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3040 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3041 3042 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3043 parent_iov_offset += iov_len; 3044 } else { 3045 parent_iovpos++; 3046 parent_iov_offset = 0; 3047 } 3048 child_iovcnt++; 3049 iovcnt++; 3050 } 3051 3052 if (to_next_boundary_bytes > 0) { 3053 /* We had to stop this child I/O early because we ran out of 3054 * child_iov space or were limited by max_num_segments. 3055 * Ensure the iovs to be aligned with block size and 3056 * then adjust to_next_boundary before starting the 3057 * child I/O. 
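 *
 * Worked example with illustrative numbers: with blocklen = 512 and
 * to_next_boundary_bytes = 700 left untransferred, 700 % 512 = 188, so the
 * bytes already packed into child_iov overshoot a block boundary by
 * 512 - 188 = 324 bytes. Those 324 bytes are trimmed off the tail iovs
 * below, to_next_boundary_bytes grows to 1024, and to_next_boundary is
 * shortened by 1024 / 512 = 2 blocks.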
3058 */ 3059 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 3060 iovcnt == child_iovsize); 3061 to_last_block_bytes = to_next_boundary_bytes % blocklen; 3062 if (to_last_block_bytes != 0) { 3063 uint32_t child_iovpos = child_iovcnt - 1; 3064 /* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV 3065 * so the loop will naturally end 3066 */ 3067 3068 to_last_block_bytes = blocklen - to_last_block_bytes; 3069 to_next_boundary_bytes += to_last_block_bytes; 3070 while (to_last_block_bytes > 0 && iovcnt > 0) { 3071 iov_len = spdk_min(to_last_block_bytes, 3072 bdev_io->child_iov[child_iovpos].iov_len); 3073 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 3074 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 3075 child_iovpos--; 3076 if (--iovcnt == 0) { 3077 /* If the child IO ends up smaller than a block size, just return. 3078 * If the first child IO of any split round is smaller than 3079 * a block size, fail the parent IO with an error. 3080 */ 3081 if (bdev_io->u.bdev.split_outstanding == 0) { 3082 SPDK_ERRLOG("The first child io was less than a block size\n"); 3083 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3084 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 3085 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 3086 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3087 } 3088 3089 return; 3090 } 3091 } 3092 3093 to_last_block_bytes -= iov_len; 3094 3095 if (parent_iov_offset == 0) { 3096 parent_iovpos--; 3097 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 3098 } 3099 parent_iov_offset -= iov_len; 3100 } 3101 3102 assert(to_last_block_bytes == 0); 3103 } 3104 to_next_boundary -= to_next_boundary_bytes / blocklen; 3105 } 3106 3107 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 3108 &current_offset, &remaining); 3109 if (spdk_unlikely(rc)) { 3110 return; 3111 } 3112 } 3113 } 3114 3115 static void 3116 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 3117 { 3118 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 3119 uint32_t num_children_reqs = 0; 3120 int rc; 3121 3122 offset = bdev_io->u.bdev.split_current_offset_blocks; 3123 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3124 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 3125 3126 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3127 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 3128 3129 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 3130 &offset, &remaining); 3131 if (spdk_likely(rc == 0)) { 3132 num_children_reqs++; 3133 } else { 3134 return; 3135 } 3136 } 3137 } 3138 3139 static void 3140 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3141 { 3142 uint64_t offset, write_zeroes_blocks, remaining; 3143 uint32_t num_children_reqs = 0; 3144 int rc; 3145 3146 offset = bdev_io->u.bdev.split_current_offset_blocks; 3147 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3148 3149 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3150 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3151 3152 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3153 &offset, &remaining); 3154 if (spdk_likely(rc == 0)) { 3155 num_children_reqs++; 3156 } else { 3157 return; 3158 } 3159 } 3160 } 3161 3162 static void 3163 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3164 { 3165 uint64_t offset,
copy_blocks, remaining; 3166 uint32_t num_children_reqs = 0; 3167 int rc; 3168 3169 offset = bdev_io->u.bdev.split_current_offset_blocks; 3170 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3171 3172 assert(bdev_io->bdev->max_copy != 0); 3173 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3174 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3175 3176 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3177 &offset, &remaining); 3178 if (spdk_likely(rc == 0)) { 3179 num_children_reqs++; 3180 } else { 3181 return; 3182 } 3183 } 3184 } 3185 3186 static void 3187 parent_bdev_io_complete(void *ctx, int rc) 3188 { 3189 struct spdk_bdev_io *parent_io = ctx; 3190 3191 if (rc) { 3192 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3193 } 3194 3195 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3196 parent_io->internal.caller_ctx); 3197 } 3198 3199 static void 3200 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3201 { 3202 struct spdk_bdev_io *bdev_io = ctx; 3203 3204 /* u.bdev.accel_sequence should have already been cleared at this point */ 3205 assert(bdev_io->u.bdev.accel_sequence == NULL); 3206 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3207 bdev_io->internal.accel_sequence = NULL; 3208 3209 if (spdk_unlikely(status != 0)) { 3210 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3211 } 3212 3213 parent_bdev_io_complete(bdev_io, status); 3214 } 3215 3216 static void 3217 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3218 { 3219 struct spdk_bdev_io *parent_io = cb_arg; 3220 3221 spdk_bdev_free_io(bdev_io); 3222 3223 if (!success) { 3224 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3225 /* If any child I/O failed, stop further splitting process. */ 3226 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3227 parent_io->u.bdev.split_remaining_num_blocks = 0; 3228 } 3229 parent_io->u.bdev.split_outstanding--; 3230 if (parent_io->u.bdev.split_outstanding != 0) { 3231 return; 3232 } 3233 3234 /* 3235 * Parent I/O finishes when all blocks are consumed. 3236 */ 3237 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3238 assert(parent_io->internal.cb != bdev_io_split_done); 3239 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3240 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3241 3242 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3243 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3244 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3245 return; 3246 } else if (parent_io->internal.orig_iovcnt != 0 && 3247 !bdev_io_use_accel_sequence(bdev_io)) { 3248 /* bdev IO will be completed in the callback */ 3249 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3250 return; 3251 } 3252 } 3253 3254 parent_bdev_io_complete(parent_io, 0); 3255 return; 3256 } 3257 3258 /* 3259 * Continue with the splitting process. This function will complete the parent I/O if the 3260 * splitting is done. 
3261 */ 3262 switch (parent_io->type) { 3263 case SPDK_BDEV_IO_TYPE_READ: 3264 case SPDK_BDEV_IO_TYPE_WRITE: 3265 _bdev_rw_split(parent_io); 3266 break; 3267 case SPDK_BDEV_IO_TYPE_UNMAP: 3268 bdev_unmap_split(parent_io); 3269 break; 3270 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3271 bdev_write_zeroes_split(parent_io); 3272 break; 3273 case SPDK_BDEV_IO_TYPE_COPY: 3274 bdev_copy_split(parent_io); 3275 break; 3276 default: 3277 assert(false); 3278 break; 3279 } 3280 } 3281 3282 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3283 bool success); 3284 3285 static void 3286 bdev_io_split(struct spdk_bdev_io *bdev_io) 3287 { 3288 assert(bdev_io_should_split(bdev_io)); 3289 3290 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3291 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3292 bdev_io->u.bdev.split_outstanding = 0; 3293 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3294 3295 switch (bdev_io->type) { 3296 case SPDK_BDEV_IO_TYPE_READ: 3297 case SPDK_BDEV_IO_TYPE_WRITE: 3298 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3299 _bdev_rw_split(bdev_io); 3300 } else { 3301 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3302 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3303 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3304 } 3305 break; 3306 case SPDK_BDEV_IO_TYPE_UNMAP: 3307 bdev_unmap_split(bdev_io); 3308 break; 3309 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3310 bdev_write_zeroes_split(bdev_io); 3311 break; 3312 case SPDK_BDEV_IO_TYPE_COPY: 3313 bdev_copy_split(bdev_io); 3314 break; 3315 default: 3316 assert(false); 3317 break; 3318 } 3319 } 3320 3321 static void 3322 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3323 { 3324 if (!success) { 3325 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3326 return; 3327 } 3328 3329 _bdev_rw_split(bdev_io); 3330 } 3331 3332 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3333 * be inlined, at least on some compilers. 
3334 */ 3335 static inline void 3336 _bdev_io_submit(void *ctx) 3337 { 3338 struct spdk_bdev_io *bdev_io = ctx; 3339 struct spdk_bdev *bdev = bdev_io->bdev; 3340 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3341 3342 if (spdk_likely(bdev_ch->flags == 0)) { 3343 bdev_io_do_submit(bdev_ch, bdev_io); 3344 return; 3345 } 3346 3347 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3348 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3349 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3350 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3351 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 3352 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3353 } else { 3354 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 3355 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3356 } 3357 } else { 3358 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3359 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3360 } 3361 } 3362 3363 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3364 3365 bool 3366 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3367 { 3368 if (range1->length == 0 || range2->length == 0) { 3369 return false; 3370 } 3371 3372 if (range1->offset + range1->length <= range2->offset) { 3373 return false; 3374 } 3375 3376 if (range2->offset + range2->length <= range1->offset) { 3377 return false; 3378 } 3379 3380 return true; 3381 } 3382 3383 static bool 3384 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3385 { 3386 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3387 struct lba_range r; 3388 3389 switch (bdev_io->type) { 3390 case SPDK_BDEV_IO_TYPE_NVME_IO: 3391 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3392 /* Don't try to decode the NVMe command - just assume worst-case and that 3393 * it overlaps a locked range. 3394 */ 3395 return true; 3396 case SPDK_BDEV_IO_TYPE_WRITE: 3397 case SPDK_BDEV_IO_TYPE_UNMAP: 3398 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3399 case SPDK_BDEV_IO_TYPE_ZCOPY: 3400 case SPDK_BDEV_IO_TYPE_COPY: 3401 r.offset = bdev_io->u.bdev.offset_blocks; 3402 r.length = bdev_io->u.bdev.num_blocks; 3403 if (!bdev_lba_range_overlapped(range, &r)) { 3404 /* This I/O doesn't overlap the specified LBA range. */ 3405 return false; 3406 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3407 /* This I/O overlaps, but the I/O is on the same channel that locked this 3408 * range, and the caller_ctx is the same as the locked_ctx. This means 3409 * that this I/O is associated with the lock, and is allowed to execute. 
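 *
 * Overlap example with illustrative numbers: a locked range with
 * offset = 100, length = 50 covers blocks [100, 150). A write from another
 * context at offset_blocks = 140, num_blocks = 20 overlaps it and is parked
 * on io_locked, while one starting at offset_blocks = 150 does not overlap
 * and is submitted normally.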
3410 */ 3411 return false; 3412 } else { 3413 return true; 3414 } 3415 default: 3416 return false; 3417 } 3418 } 3419 3420 void 3421 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3422 { 3423 struct spdk_bdev *bdev = bdev_io->bdev; 3424 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3425 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3426 3427 assert(thread != NULL); 3428 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3429 3430 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3431 struct lba_range *range; 3432 3433 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3434 if (bdev_io_range_is_locked(bdev_io, range)) { 3435 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3436 return; 3437 } 3438 } 3439 } 3440 3441 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3442 3443 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3444 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3445 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3446 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3447 spdk_bdev_get_name(bdev)); 3448 3449 if (bdev_io->internal.split) { 3450 bdev_io_split(bdev_io); 3451 return; 3452 } 3453 3454 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3455 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3456 _bdev_io_submit(bdev_io); 3457 } else { 3458 bdev_io->internal.io_submit_ch = ch; 3459 bdev_io->internal.ch = bdev->internal.qos->ch; 3460 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3461 } 3462 } else { 3463 _bdev_io_submit(bdev_io); 3464 } 3465 } 3466 3467 static inline void 3468 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3469 { 3470 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3471 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3472 * For write operation we need to pull buffers from memory domain before submitting IO. 3473 * Once read operation completes, we need to use memory_domain push functionality to 3474 * update data in original memory domain IO buffer 3475 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3476 bdev_io->u.bdev.memory_domain = NULL; 3477 bdev_io->u.bdev.memory_domain_ctx = NULL; 3478 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3479 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3480 } 3481 3482 static inline void 3483 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3484 { 3485 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3486 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3487 3488 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3489 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3490 bdev_io_complete_unsubmitted(bdev_io); 3491 return; 3492 } 3493 3494 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3495 * support them, but we need to execute an accel sequence and the data buffer is from accel 3496 * memory domain (to avoid doing a push/pull from that domain). 
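 *
 * Concretely: for a write, the data is pulled from the caller's memory
 * domain into the bounce buffer (via spdk_memory_domain_pull_data()) before
 * submission; for a read, the bounce buffer is pushed back with
 * spdk_memory_domain_push_data() on completion, as in
 * bdev_io_push_bounce_data() above.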
3497 */ 3498 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3499 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3500 _bdev_io_ext_use_bounce_buffer(bdev_io); 3501 return; 3502 } 3503 3504 if (needs_exec) { 3505 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3506 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3507 return; 3508 } 3509 /* For reads we'll execute the sequence after the data is read, so, for now, only 3510 * clear out accel_sequence pointer and submit the IO */ 3511 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3512 bdev_io->u.bdev.accel_sequence = NULL; 3513 } 3514 3515 bdev_io_submit(bdev_io); 3516 } 3517 3518 static void 3519 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3520 { 3521 struct spdk_bdev *bdev = bdev_io->bdev; 3522 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3523 struct spdk_io_channel *ch = bdev_ch->channel; 3524 3525 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3526 3527 bdev_io->internal.in_submit_request = true; 3528 bdev_submit_request(bdev, ch, bdev_io); 3529 bdev_io->internal.in_submit_request = false; 3530 } 3531 3532 void 3533 bdev_io_init(struct spdk_bdev_io *bdev_io, 3534 struct spdk_bdev *bdev, void *cb_arg, 3535 spdk_bdev_io_completion_cb cb) 3536 { 3537 bdev_io->bdev = bdev; 3538 bdev_io->internal.caller_ctx = cb_arg; 3539 bdev_io->internal.cb = cb; 3540 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3541 bdev_io->internal.in_submit_request = false; 3542 bdev_io->internal.buf = NULL; 3543 bdev_io->internal.io_submit_ch = NULL; 3544 bdev_io->internal.orig_iovs = NULL; 3545 bdev_io->internal.orig_iovcnt = 0; 3546 bdev_io->internal.orig_md_iov.iov_base = NULL; 3547 bdev_io->internal.error.nvme.cdw0 = 0; 3548 bdev_io->num_retries = 0; 3549 bdev_io->internal.get_buf_cb = NULL; 3550 bdev_io->internal.get_aux_buf_cb = NULL; 3551 bdev_io->internal.memory_domain = NULL; 3552 bdev_io->internal.memory_domain_ctx = NULL; 3553 bdev_io->internal.data_transfer_cpl = NULL; 3554 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3555 bdev_io->internal.accel_sequence = NULL; 3556 bdev_io->internal.has_accel_sequence = false; 3557 } 3558 3559 static bool 3560 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3561 { 3562 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3563 } 3564 3565 bool 3566 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3567 { 3568 bool supported; 3569 3570 supported = bdev_io_type_supported(bdev, io_type); 3571 3572 if (!supported) { 3573 switch (io_type) { 3574 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3575 /* The bdev layer will emulate write zeroes as long as write is supported. 
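 *
 * A caller probing capabilities can therefore rely on the emulation; sketch
 * with hypothetical caller variables:
 *
 *   if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
 *       rc = spdk_bdev_write_zeroes_blocks(desc, io_ch, offset_blocks,
 *                                          num_blocks, cb, cb_arg);
 *   }
 *
 * This reports true even when the backing module only implements WRITE,
 * because the generic bdev layer then issues zeroed writes on its behalf.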
*/ 3576 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3577 break; 3578 default: 3579 break; 3580 } 3581 } 3582 3583 return supported; 3584 } 3585 3586 uint64_t 3587 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3588 { 3589 return bdev_io->internal.submit_tsc; 3590 } 3591 3592 int 3593 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3594 { 3595 if (bdev->fn_table->dump_info_json) { 3596 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3597 } 3598 3599 return 0; 3600 } 3601 3602 static void 3603 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3604 { 3605 uint32_t max_per_timeslice = 0; 3606 int i; 3607 3608 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3609 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3610 qos->rate_limits[i].max_per_timeslice = 0; 3611 continue; 3612 } 3613 3614 max_per_timeslice = qos->rate_limits[i].limit * 3615 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3616 3617 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3618 qos->rate_limits[i].min_per_timeslice); 3619 3620 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3621 } 3622 3623 bdev_qos_set_ops(qos); 3624 } 3625 3626 static int 3627 bdev_channel_poll_qos(void *arg) 3628 { 3629 struct spdk_bdev_qos *qos = arg; 3630 uint64_t now = spdk_get_ticks(); 3631 int i; 3632 3633 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3634 /* We received our callback earlier than expected - return 3635 * immediately and wait to do accounting until at least one 3636 * timeslice has actually expired. This should never happen 3637 * with a well-behaved timer implementation. 3638 */ 3639 return SPDK_POLLER_IDLE; 3640 } 3641 3642 /* Reset for next round of rate limiting */ 3643 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3644 /* We may have allowed the IOs or bytes to slightly overrun in the last 3645 * timeslice. remaining_this_timeslice is signed, so if it's negative 3646 * here, we'll account for the overrun so that the next timeslice will 3647 * be appropriately reduced. 
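 *
 * Worked example with illustrative numbers: with max_per_timeslice = 8192
 * bytes, an I/O of 65536 bytes admitted while 1024 bytes remained leaves
 * remaining_this_timeslice at -64512. The negative balance is kept here,
 * and the refill loop below adds 8192 per elapsed timeslice, so no further
 * I/O is admitted until roughly eight timeslices later when the balance
 * turns positive again.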
3648 */ 3649 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3650 qos->rate_limits[i].remaining_this_timeslice = 0; 3651 } 3652 } 3653 3654 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3655 qos->last_timeslice += qos->timeslice_size; 3656 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3657 qos->rate_limits[i].remaining_this_timeslice += 3658 qos->rate_limits[i].max_per_timeslice; 3659 } 3660 } 3661 3662 return bdev_qos_io_submit(qos->ch, qos); 3663 } 3664 3665 static void 3666 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3667 { 3668 struct spdk_bdev_shared_resource *shared_resource; 3669 struct lba_range *range; 3670 3671 bdev_free_io_stat(ch->stat); 3672 #ifdef SPDK_CONFIG_VTUNE 3673 bdev_free_io_stat(ch->prev_stat); 3674 #endif 3675 3676 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3677 range = TAILQ_FIRST(&ch->locked_ranges); 3678 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3679 free(range); 3680 } 3681 3682 spdk_put_io_channel(ch->channel); 3683 spdk_put_io_channel(ch->accel_channel); 3684 3685 shared_resource = ch->shared_resource; 3686 3687 assert(TAILQ_EMPTY(&ch->io_locked)); 3688 assert(TAILQ_EMPTY(&ch->io_submitted)); 3689 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3690 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3691 assert(ch->io_outstanding == 0); 3692 assert(shared_resource->ref > 0); 3693 shared_resource->ref--; 3694 if (shared_resource->ref == 0) { 3695 assert(shared_resource->io_outstanding == 0); 3696 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3697 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3698 free(shared_resource); 3699 } 3700 } 3701 3702 static void 3703 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3704 { 3705 struct spdk_bdev_qos *qos = bdev->internal.qos; 3706 int i; 3707 3708 assert(spdk_spin_held(&bdev->internal.spinlock)); 3709 3710 /* Rate limiting on this bdev enabled */ 3711 if (qos) { 3712 if (qos->ch == NULL) { 3713 struct spdk_io_channel *io_ch; 3714 3715 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3716 bdev->name, spdk_get_thread()); 3717 3718 /* No qos channel has been selected, so set one up */ 3719 3720 /* Take another reference to ch */ 3721 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3722 assert(io_ch != NULL); 3723 qos->ch = ch; 3724 3725 qos->thread = spdk_io_channel_get_thread(io_ch); 3726 3727 TAILQ_INIT(&qos->queued); 3728 3729 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3730 if (bdev_qos_is_iops_rate_limit(i) == true) { 3731 qos->rate_limits[i].min_per_timeslice = 3732 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3733 } else { 3734 qos->rate_limits[i].min_per_timeslice = 3735 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3736 } 3737 3738 if (qos->rate_limits[i].limit == 0) { 3739 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3740 } 3741 } 3742 bdev_qos_update_max_quota_per_timeslice(qos); 3743 qos->timeslice_size = 3744 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3745 qos->last_timeslice = spdk_get_ticks(); 3746 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3747 qos, 3748 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3749 } 3750 3751 ch->flags |= BDEV_CH_QOS_ENABLED; 3752 } 3753 } 3754 3755 struct poll_timeout_ctx { 3756 struct spdk_bdev_desc *desc; 3757 uint64_t timeout_in_sec; 3758 spdk_bdev_io_timeout_cb cb_fn; 3759 void *cb_arg; 3760 }; 3761 3762 static void 3763 bdev_desc_free(struct spdk_bdev_desc 
*desc) 3764 { 3765 spdk_spin_destroy(&desc->spinlock); 3766 free(desc->media_events_buffer); 3767 free(desc); 3768 } 3769 3770 static void 3771 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3772 { 3773 struct poll_timeout_ctx *ctx = _ctx; 3774 struct spdk_bdev_desc *desc = ctx->desc; 3775 3776 free(ctx); 3777 3778 spdk_spin_lock(&desc->spinlock); 3779 desc->refs--; 3780 if (desc->closed == true && desc->refs == 0) { 3781 spdk_spin_unlock(&desc->spinlock); 3782 bdev_desc_free(desc); 3783 return; 3784 } 3785 spdk_spin_unlock(&desc->spinlock); 3786 } 3787 3788 static void 3789 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3790 struct spdk_io_channel *io_ch, void *_ctx) 3791 { 3792 struct poll_timeout_ctx *ctx = _ctx; 3793 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3794 struct spdk_bdev_desc *desc = ctx->desc; 3795 struct spdk_bdev_io *bdev_io; 3796 uint64_t now; 3797 3798 spdk_spin_lock(&desc->spinlock); 3799 if (desc->closed == true) { 3800 spdk_spin_unlock(&desc->spinlock); 3801 spdk_bdev_for_each_channel_continue(i, -1); 3802 return; 3803 } 3804 spdk_spin_unlock(&desc->spinlock); 3805 3806 now = spdk_get_ticks(); 3807 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3808 /* Exclude any I/O that are generated via splitting. */ 3809 if (bdev_io->internal.cb == bdev_io_split_done) { 3810 continue; 3811 } 3812 3813 /* Once we find an I/O that has not timed out, we can immediately 3814 * exit the loop. 3815 */ 3816 if (now < (bdev_io->internal.submit_tsc + 3817 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3818 goto end; 3819 } 3820 3821 if (bdev_io->internal.desc == desc) { 3822 ctx->cb_fn(ctx->cb_arg, bdev_io); 3823 } 3824 } 3825 3826 end: 3827 spdk_bdev_for_each_channel_continue(i, 0); 3828 } 3829 3830 static int 3831 bdev_poll_timeout_io(void *arg) 3832 { 3833 struct spdk_bdev_desc *desc = arg; 3834 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3835 struct poll_timeout_ctx *ctx; 3836 3837 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3838 if (!ctx) { 3839 SPDK_ERRLOG("failed to allocate memory\n"); 3840 return SPDK_POLLER_BUSY; 3841 } 3842 ctx->desc = desc; 3843 ctx->cb_arg = desc->cb_arg; 3844 ctx->cb_fn = desc->cb_fn; 3845 ctx->timeout_in_sec = desc->timeout_in_sec; 3846 3847 /* Take a ref on the descriptor in case it gets closed while we are checking 3848 * all of the channels. 
3849 */ 3850 spdk_spin_lock(&desc->spinlock); 3851 desc->refs++; 3852 spdk_spin_unlock(&desc->spinlock); 3853 3854 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3855 bdev_channel_poll_timeout_io_done); 3856 3857 return SPDK_POLLER_BUSY; 3858 } 3859 3860 int 3861 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3862 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3863 { 3864 assert(desc->thread == spdk_get_thread()); 3865 3866 spdk_poller_unregister(&desc->io_timeout_poller); 3867 3868 if (timeout_in_sec) { 3869 assert(cb_fn != NULL); 3870 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3871 desc, 3872 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3873 1000); 3874 if (desc->io_timeout_poller == NULL) { 3875 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3876 return -1; 3877 } 3878 } 3879 3880 desc->cb_fn = cb_fn; 3881 desc->cb_arg = cb_arg; 3882 desc->timeout_in_sec = timeout_in_sec; 3883 3884 return 0; 3885 } 3886 3887 static int 3888 bdev_channel_create(void *io_device, void *ctx_buf) 3889 { 3890 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3891 struct spdk_bdev_channel *ch = ctx_buf; 3892 struct spdk_io_channel *mgmt_io_ch; 3893 struct spdk_bdev_mgmt_channel *mgmt_ch; 3894 struct spdk_bdev_shared_resource *shared_resource; 3895 struct lba_range *range; 3896 3897 ch->bdev = bdev; 3898 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3899 if (!ch->channel) { 3900 return -1; 3901 } 3902 3903 ch->accel_channel = spdk_accel_get_io_channel(); 3904 if (!ch->accel_channel) { 3905 spdk_put_io_channel(ch->channel); 3906 return -1; 3907 } 3908 3909 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3910 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3911 3912 assert(ch->histogram == NULL); 3913 if (bdev->internal.histogram_enabled) { 3914 ch->histogram = spdk_histogram_data_alloc(); 3915 if (ch->histogram == NULL) { 3916 SPDK_ERRLOG("Could not allocate histogram\n"); 3917 } 3918 } 3919 3920 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3921 if (!mgmt_io_ch) { 3922 spdk_put_io_channel(ch->channel); 3923 spdk_put_io_channel(ch->accel_channel); 3924 return -1; 3925 } 3926 3927 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3928 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3929 if (shared_resource->shared_ch == ch->channel) { 3930 spdk_put_io_channel(mgmt_io_ch); 3931 shared_resource->ref++; 3932 break; 3933 } 3934 } 3935 3936 if (shared_resource == NULL) { 3937 shared_resource = calloc(1, sizeof(*shared_resource)); 3938 if (shared_resource == NULL) { 3939 spdk_put_io_channel(ch->channel); 3940 spdk_put_io_channel(ch->accel_channel); 3941 spdk_put_io_channel(mgmt_io_ch); 3942 return -1; 3943 } 3944 3945 shared_resource->mgmt_ch = mgmt_ch; 3946 shared_resource->io_outstanding = 0; 3947 TAILQ_INIT(&shared_resource->nomem_io); 3948 shared_resource->nomem_threshold = 0; 3949 shared_resource->shared_ch = ch->channel; 3950 shared_resource->ref = 1; 3951 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3952 } 3953 3954 ch->io_outstanding = 0; 3955 TAILQ_INIT(&ch->queued_resets); 3956 TAILQ_INIT(&ch->locked_ranges); 3957 ch->flags = 0; 3958 ch->shared_resource = shared_resource; 3959 3960 TAILQ_INIT(&ch->io_submitted); 3961 TAILQ_INIT(&ch->io_locked); 3962 TAILQ_INIT(&ch->io_accel_exec); 3963 TAILQ_INIT(&ch->io_memory_domain); 3964 3965 ch->stat = bdev_alloc_io_stat(false); 3966 if (ch->stat == NULL) { 3967 
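		/* The per-channel io_stat allocation failed. bdev_channel_destroy_resource()
		 * is safe to call on a partially constructed channel: it tolerates a NULL
		 * stat pointer and releases the module I/O channel, the accel channel and
		 * the shared_resource reference acquired above, which is why the remaining
		 * error paths in this function reuse it as their single unwind step. */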
bdev_channel_destroy_resource(ch); 3968 return -1; 3969 } 3970 3971 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3972 3973 #ifdef SPDK_CONFIG_VTUNE 3974 { 3975 char *name; 3976 __itt_init_ittlib(NULL, 0); 3977 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3978 if (!name) { 3979 bdev_channel_destroy_resource(ch); 3980 return -1; 3981 } 3982 ch->handle = __itt_string_handle_create(name); 3983 free(name); 3984 ch->start_tsc = spdk_get_ticks(); 3985 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3986 ch->prev_stat = bdev_alloc_io_stat(false); 3987 if (ch->prev_stat == NULL) { 3988 bdev_channel_destroy_resource(ch); 3989 return -1; 3990 } 3991 } 3992 #endif 3993 3994 spdk_spin_lock(&bdev->internal.spinlock); 3995 bdev_enable_qos(bdev, ch); 3996 3997 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3998 struct lba_range *new_range; 3999 4000 new_range = calloc(1, sizeof(*new_range)); 4001 if (new_range == NULL) { 4002 spdk_spin_unlock(&bdev->internal.spinlock); 4003 bdev_channel_destroy_resource(ch); 4004 return -1; 4005 } 4006 new_range->length = range->length; 4007 new_range->offset = range->offset; 4008 new_range->locked_ctx = range->locked_ctx; 4009 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4010 } 4011 4012 spdk_spin_unlock(&bdev->internal.spinlock); 4013 4014 return 0; 4015 } 4016 4017 static int 4018 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4019 void *cb_ctx) 4020 { 4021 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4022 struct spdk_bdev_io *bdev_io; 4023 uint64_t buf_len; 4024 4025 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4026 if (bdev_io->internal.ch == bdev_ch) { 4027 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4028 spdk_iobuf_entry_abort(ch, entry, buf_len); 4029 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4030 } 4031 4032 return 0; 4033 } 4034 4035 /* 4036 * Abort I/O that are waiting on a data buffer. 4037 */ 4038 static void 4039 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4040 { 4041 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4042 bdev_abort_all_buf_io_cb, ch); 4043 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4044 bdev_abort_all_buf_io_cb, ch); 4045 } 4046 4047 /* 4048 * Abort I/O that are queued waiting for submission. These types of I/O are 4049 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4050 */ 4051 static void 4052 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4053 { 4054 struct spdk_bdev_io *bdev_io, *tmp; 4055 4056 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4057 if (bdev_io->internal.ch == ch) { 4058 TAILQ_REMOVE(queue, bdev_io, internal.link); 4059 /* 4060 * spdk_bdev_io_complete() assumes that the completed I/O had 4061 * been submitted to the bdev module. Since in this case it 4062 * hadn't, bump io_outstanding to account for the decrement 4063 * that spdk_bdev_io_complete() will do. 
4064 */ 4065 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4066 bdev_io_increment_outstanding(ch, ch->shared_resource); 4067 } 4068 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4069 } 4070 } 4071 } 4072 4073 static bool 4074 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4075 { 4076 struct spdk_bdev_io *bdev_io; 4077 4078 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4079 if (bdev_io == bio_to_abort) { 4080 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4081 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4082 return true; 4083 } 4084 } 4085 4086 return false; 4087 } 4088 4089 static int 4090 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4091 { 4092 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4093 uint64_t buf_len; 4094 4095 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4096 if (bdev_io == bio_to_abort) { 4097 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4098 spdk_iobuf_entry_abort(ch, entry, buf_len); 4099 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4100 return 1; 4101 } 4102 4103 return 0; 4104 } 4105 4106 static bool 4107 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4108 { 4109 int rc; 4110 4111 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4112 bdev_abort_buf_io_cb, bio_to_abort); 4113 if (rc == 1) { 4114 return true; 4115 } 4116 4117 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4118 bdev_abort_buf_io_cb, bio_to_abort); 4119 return rc == 1; 4120 } 4121 4122 static void 4123 bdev_qos_channel_destroy(void *cb_arg) 4124 { 4125 struct spdk_bdev_qos *qos = cb_arg; 4126 4127 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4128 spdk_poller_unregister(&qos->poller); 4129 4130 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4131 4132 free(qos); 4133 } 4134 4135 static int 4136 bdev_qos_destroy(struct spdk_bdev *bdev) 4137 { 4138 int i; 4139 4140 /* 4141 * Cleanly shutting down the QoS poller is tricky, because 4142 * during the asynchronous operation the user could open 4143 * a new descriptor and create a new channel, spawning 4144 * a new QoS poller. 4145 * 4146 * The strategy is to create a new QoS structure here and swap it 4147 * in. The shutdown path then continues to refer to the old one 4148 * until it completes and then releases it. 4149 */ 4150 struct spdk_bdev_qos *new_qos, *old_qos; 4151 4152 old_qos = bdev->internal.qos; 4153 4154 new_qos = calloc(1, sizeof(*new_qos)); 4155 if (!new_qos) { 4156 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4157 return -ENOMEM; 4158 } 4159 4160 /* Copy the old QoS data into the newly allocated structure */ 4161 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4162 4163 /* Zero out the key parts of the QoS structure */ 4164 new_qos->ch = NULL; 4165 new_qos->thread = NULL; 4166 new_qos->poller = NULL; 4167 TAILQ_INIT(&new_qos->queued); 4168 /* 4169 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4170 * It will be used later for the new QoS structure. 
4171 */ 4172 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4173 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4174 new_qos->rate_limits[i].min_per_timeslice = 0; 4175 new_qos->rate_limits[i].max_per_timeslice = 0; 4176 } 4177 4178 bdev->internal.qos = new_qos; 4179 4180 if (old_qos->thread == NULL) { 4181 free(old_qos); 4182 } else { 4183 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4184 } 4185 4186 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4187 * been destroyed yet. The destruction path will end up waiting for the final 4188 * channel to be put before it releases resources. */ 4189 4190 return 0; 4191 } 4192 4193 void 4194 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4195 { 4196 total->bytes_read += add->bytes_read; 4197 total->num_read_ops += add->num_read_ops; 4198 total->bytes_written += add->bytes_written; 4199 total->num_write_ops += add->num_write_ops; 4200 total->bytes_unmapped += add->bytes_unmapped; 4201 total->num_unmap_ops += add->num_unmap_ops; 4202 total->bytes_copied += add->bytes_copied; 4203 total->num_copy_ops += add->num_copy_ops; 4204 total->read_latency_ticks += add->read_latency_ticks; 4205 total->write_latency_ticks += add->write_latency_ticks; 4206 total->unmap_latency_ticks += add->unmap_latency_ticks; 4207 total->copy_latency_ticks += add->copy_latency_ticks; 4208 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4209 total->max_read_latency_ticks = add->max_read_latency_ticks; 4210 } 4211 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4212 total->min_read_latency_ticks = add->min_read_latency_ticks; 4213 } 4214 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4215 total->max_write_latency_ticks = add->max_write_latency_ticks; 4216 } 4217 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4218 total->min_write_latency_ticks = add->min_write_latency_ticks; 4219 } 4220 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4221 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4222 } 4223 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4224 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4225 } 4226 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4227 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4228 } 4229 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4230 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4231 } 4232 } 4233 4234 static void 4235 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4236 { 4237 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4238 4239 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4240 memcpy(to_stat->io_error, from_stat->io_error, 4241 sizeof(struct spdk_bdev_io_error_stat)); 4242 } 4243 } 4244 4245 void 4246 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4247 { 4248 stat->max_read_latency_ticks = 0; 4249 stat->min_read_latency_ticks = UINT64_MAX; 4250 stat->max_write_latency_ticks = 0; 4251 stat->min_write_latency_ticks = UINT64_MAX; 4252 stat->max_unmap_latency_ticks = 0; 4253 stat->min_unmap_latency_ticks = UINT64_MAX; 4254 stat->max_copy_latency_ticks = 0; 4255 stat->min_copy_latency_ticks = UINT64_MAX; 4256 4257 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4258 return; 4259 } 
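	/* Any mode other than SPDK_BDEV_RESET_STAT_ALL clears only the min/max
	 * latency trackers above. The ALL path below additionally zeroes the
	 * byte/operation counters, the accumulated latency ticks and, when the
	 * optional io_error array is allocated, the per-status error counts. */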
4260 4261 stat->bytes_read = 0; 4262 stat->num_read_ops = 0; 4263 stat->bytes_written = 0; 4264 stat->num_write_ops = 0; 4265 stat->bytes_unmapped = 0; 4266 stat->num_unmap_ops = 0; 4267 stat->bytes_copied = 0; 4268 stat->num_copy_ops = 0; 4269 stat->read_latency_ticks = 0; 4270 stat->write_latency_ticks = 0; 4271 stat->unmap_latency_ticks = 0; 4272 stat->copy_latency_ticks = 0; 4273 4274 if (stat->io_error != NULL) { 4275 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4276 } 4277 } 4278 4279 struct spdk_bdev_io_stat * 4280 bdev_alloc_io_stat(bool io_error_stat) 4281 { 4282 struct spdk_bdev_io_stat *stat; 4283 4284 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4285 if (stat == NULL) { 4286 return NULL; 4287 } 4288 4289 if (io_error_stat) { 4290 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4291 if (stat->io_error == NULL) { 4292 free(stat); 4293 return NULL; 4294 } 4295 } else { 4296 stat->io_error = NULL; 4297 } 4298 4299 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4300 4301 return stat; 4302 } 4303 4304 void 4305 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4306 { 4307 if (stat != NULL) { 4308 free(stat->io_error); 4309 free(stat); 4310 } 4311 } 4312 4313 void 4314 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4315 { 4316 int i; 4317 4318 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4319 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4320 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4321 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4322 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4323 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4324 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4325 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4326 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4327 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4328 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4329 stat->min_read_latency_ticks != UINT64_MAX ? 4330 stat->min_read_latency_ticks : 0); 4331 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4332 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4333 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4334 stat->min_write_latency_ticks != UINT64_MAX ? 4335 stat->min_write_latency_ticks : 0); 4336 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4337 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4338 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4339 stat->min_unmap_latency_ticks != UINT64_MAX ? 4340 stat->min_unmap_latency_ticks : 0); 4341 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4342 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4343 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4344 stat->min_copy_latency_ticks != UINT64_MAX ? 
4345 stat->min_copy_latency_ticks : 0); 4346 4347 if (stat->io_error != NULL) { 4348 spdk_json_write_named_object_begin(w, "io_error"); 4349 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4350 if (stat->io_error->error_status[i] != 0) { 4351 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4352 stat->io_error->error_status[i]); 4353 } 4354 } 4355 spdk_json_write_object_end(w); 4356 } 4357 } 4358 4359 static void 4360 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4361 { 4362 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4363 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4364 4365 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4366 bdev_abort_all_buf_io(mgmt_ch, ch); 4367 } 4368 4369 static void 4370 bdev_channel_destroy(void *io_device, void *ctx_buf) 4371 { 4372 struct spdk_bdev_channel *ch = ctx_buf; 4373 4374 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4375 spdk_get_thread()); 4376 4377 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 4378 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4379 4380 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4381 spdk_spin_lock(&ch->bdev->internal.spinlock); 4382 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4383 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4384 4385 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4386 4387 bdev_channel_abort_queued_ios(ch); 4388 4389 if (ch->histogram) { 4390 spdk_histogram_data_free(ch->histogram); 4391 } 4392 4393 bdev_channel_destroy_resource(ch); 4394 } 4395 4396 /* 4397 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4398 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
4399 */ 4400 static int 4401 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4402 { 4403 struct spdk_bdev_name *tmp; 4404 4405 bdev_name->name = strdup(name); 4406 if (bdev_name->name == NULL) { 4407 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4408 return -ENOMEM; 4409 } 4410 4411 bdev_name->bdev = bdev; 4412 4413 spdk_spin_lock(&g_bdev_mgr.spinlock); 4414 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4415 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4416 4417 if (tmp != NULL) { 4418 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4419 free(bdev_name->name); 4420 return -EEXIST; 4421 } 4422 4423 return 0; 4424 } 4425 4426 static void 4427 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4428 { 4429 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4430 free(bdev_name->name); 4431 } 4432 4433 static void 4434 bdev_name_del(struct spdk_bdev_name *bdev_name) 4435 { 4436 spdk_spin_lock(&g_bdev_mgr.spinlock); 4437 bdev_name_del_unsafe(bdev_name); 4438 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4439 } 4440 4441 int 4442 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4443 { 4444 struct spdk_bdev_alias *tmp; 4445 int ret; 4446 4447 if (alias == NULL) { 4448 SPDK_ERRLOG("Empty alias passed\n"); 4449 return -EINVAL; 4450 } 4451 4452 tmp = calloc(1, sizeof(*tmp)); 4453 if (tmp == NULL) { 4454 SPDK_ERRLOG("Unable to allocate alias\n"); 4455 return -ENOMEM; 4456 } 4457 4458 ret = bdev_name_add(&tmp->alias, bdev, alias); 4459 if (ret != 0) { 4460 free(tmp); 4461 return ret; 4462 } 4463 4464 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4465 4466 return 0; 4467 } 4468 4469 static int 4470 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4471 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4472 { 4473 struct spdk_bdev_alias *tmp; 4474 4475 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4476 if (strcmp(alias, tmp->alias.name) == 0) { 4477 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4478 alias_del_fn(&tmp->alias); 4479 free(tmp); 4480 return 0; 4481 } 4482 } 4483 4484 return -ENOENT; 4485 } 4486 4487 int 4488 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4489 { 4490 int rc; 4491 4492 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4493 if (rc == -ENOENT) { 4494 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4495 } 4496 4497 return rc; 4498 } 4499 4500 void 4501 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4502 { 4503 struct spdk_bdev_alias *p, *tmp; 4504 4505 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4506 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4507 bdev_name_del(&p->alias); 4508 free(p); 4509 } 4510 } 4511 4512 struct spdk_io_channel * 4513 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4514 { 4515 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4516 } 4517 4518 void * 4519 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4520 { 4521 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4522 void *ctx = NULL; 4523 4524 if (bdev->fn_table->get_module_ctx) { 4525 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4526 } 4527 4528 return ctx; 4529 } 4530 4531 const char * 4532 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4533 { 4534 return bdev->module->name; 4535 } 4536 4537 const char * 4538 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4539 { 4540 return bdev->name; 4541 } 4542 4543 const char * 4544 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4545 { 4546 return bdev->product_name; 4547 } 4548 4549 
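/*
 * Illustrative sketch (not part of the build): a bdev module that wants a
 * stable secondary name can register it as an alias and tolerate the name
 * already being taken. The function name and the alias string below are
 * placeholders, not existing SPDK symbols.
 *
 *	static int
 *	example_register_alias(struct spdk_bdev *bdev)
 *	{
 *		int rc;
 *
 *		rc = spdk_bdev_alias_add(bdev, "example-alias");
 *		if (rc == -EEXIST) {
 *			SPDK_NOTICELOG("alias already registered for bdev %s\n",
 *				       spdk_bdev_get_name(bdev));
 *			rc = 0;
 *		}
 *
 *		return rc;
 *	}
 */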
const struct spdk_bdev_aliases_list * 4550 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4551 { 4552 return &bdev->aliases; 4553 } 4554 4555 uint32_t 4556 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4557 { 4558 return bdev->blocklen; 4559 } 4560 4561 uint32_t 4562 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4563 { 4564 return bdev->write_unit_size; 4565 } 4566 4567 uint64_t 4568 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4569 { 4570 return bdev->blockcnt; 4571 } 4572 4573 const char * 4574 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4575 { 4576 return qos_rpc_type[type]; 4577 } 4578 4579 void 4580 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4581 { 4582 int i; 4583 4584 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4585 4586 spdk_spin_lock(&bdev->internal.spinlock); 4587 if (bdev->internal.qos) { 4588 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4589 if (bdev->internal.qos->rate_limits[i].limit != 4590 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4591 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4592 if (bdev_qos_is_iops_rate_limit(i) == false) { 4593 /* Change from Byte to Megabyte which is user visible. */ 4594 limits[i] = limits[i] / 1024 / 1024; 4595 } 4596 } 4597 } 4598 } 4599 spdk_spin_unlock(&bdev->internal.spinlock); 4600 } 4601 4602 size_t 4603 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4604 { 4605 return 1 << bdev->required_alignment; 4606 } 4607 4608 uint32_t 4609 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4610 { 4611 return bdev->optimal_io_boundary; 4612 } 4613 4614 bool 4615 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4616 { 4617 return bdev->write_cache; 4618 } 4619 4620 const struct spdk_uuid * 4621 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4622 { 4623 return &bdev->uuid; 4624 } 4625 4626 uint16_t 4627 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4628 { 4629 return bdev->acwu; 4630 } 4631 4632 uint32_t 4633 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4634 { 4635 return bdev->md_len; 4636 } 4637 4638 bool 4639 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4640 { 4641 return (bdev->md_len != 0) && bdev->md_interleave; 4642 } 4643 4644 bool 4645 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4646 { 4647 return (bdev->md_len != 0) && !bdev->md_interleave; 4648 } 4649 4650 bool 4651 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4652 { 4653 return bdev->zoned; 4654 } 4655 4656 uint32_t 4657 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4658 { 4659 if (spdk_bdev_is_md_interleaved(bdev)) { 4660 return bdev->blocklen - bdev->md_len; 4661 } else { 4662 return bdev->blocklen; 4663 } 4664 } 4665 4666 uint32_t 4667 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4668 { 4669 return bdev->phys_blocklen; 4670 } 4671 4672 static uint32_t 4673 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4674 { 4675 if (!spdk_bdev_is_md_interleaved(bdev)) { 4676 return bdev->blocklen + bdev->md_len; 4677 } else { 4678 return bdev->blocklen; 4679 } 4680 } 4681 4682 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4683 typedef enum spdk_dif_type spdk_dif_type_t; 4684 4685 spdk_dif_type_t 4686 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4687 { 4688 if (bdev->md_len != 0) { 4689 return bdev->dif_type; 4690 } else { 4691 return SPDK_DIF_DISABLE; 4692 } 4693 } 4694 4695 bool 4696 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4697 { 4698 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4699 return bdev->dif_is_head_of_md; 4700 } else { 4701 return false; 4702 } 4703 } 4704 4705 bool 4706 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4707 enum spdk_dif_check_type check_type) 4708 { 4709 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4710 return false; 4711 } 4712 4713 switch (check_type) { 4714 case SPDK_DIF_CHECK_TYPE_REFTAG: 4715 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4716 case SPDK_DIF_CHECK_TYPE_APPTAG: 4717 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4718 case SPDK_DIF_CHECK_TYPE_GUARD: 4719 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4720 default: 4721 return false; 4722 } 4723 } 4724 4725 static uint32_t 4726 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4727 { 4728 uint64_t aligned_length, max_write_blocks; 4729 4730 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4731 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4732 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4733 4734 return max_write_blocks; 4735 } 4736 4737 uint32_t 4738 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4739 { 4740 return bdev->max_copy; 4741 } 4742 4743 uint64_t 4744 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4745 { 4746 return bdev->internal.measured_queue_depth; 4747 } 4748 4749 uint64_t 4750 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4751 { 4752 return bdev->internal.period; 4753 } 4754 4755 uint64_t 4756 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4757 { 4758 return bdev->internal.weighted_io_time; 4759 } 4760 4761 uint64_t 4762 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4763 { 4764 return bdev->internal.io_time; 4765 } 4766 4767 static void bdev_update_qd_sampling_period(void *ctx); 4768 4769 static void 4770 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4771 { 4772 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4773 4774 if (bdev->internal.measured_queue_depth) { 4775 bdev->internal.io_time += bdev->internal.period; 4776 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4777 } 4778 4779 bdev->internal.qd_poll_in_progress = false; 4780 4781 bdev_update_qd_sampling_period(bdev); 4782 } 4783 4784 static void 4785 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4786 struct spdk_io_channel *io_ch, void *_ctx) 4787 { 4788 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4789 4790 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4791 spdk_bdev_for_each_channel_continue(i, 0); 4792 } 4793 4794 static int 4795 bdev_calculate_measured_queue_depth(void *ctx) 4796 { 4797 struct spdk_bdev *bdev = ctx; 4798 4799 bdev->internal.qd_poll_in_progress = true; 4800 bdev->internal.temporary_queue_depth = 0; 4801 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4802 return SPDK_POLLER_BUSY; 4803 } 4804 4805 static void 4806 bdev_update_qd_sampling_period(void *ctx) 4807 { 4808 
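	/* Apply a pending change to the queue depth sampling period. If a sampling
	 * poll is still in flight, defer: _calculate_measured_qd_cpl() calls back
	 * into this function once the per-channel iteration completes, so the new
	 * period (or a period of 0, which unregisters the poller and closes the
	 * internal descriptor) is picked up as soon as it is safe to do so. */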
struct spdk_bdev *bdev = ctx; 4809 4810 if (bdev->internal.period == bdev->internal.new_period) { 4811 return; 4812 } 4813 4814 if (bdev->internal.qd_poll_in_progress) { 4815 return; 4816 } 4817 4818 bdev->internal.period = bdev->internal.new_period; 4819 4820 spdk_poller_unregister(&bdev->internal.qd_poller); 4821 if (bdev->internal.period != 0) { 4822 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4823 bdev, bdev->internal.period); 4824 } else { 4825 spdk_bdev_close(bdev->internal.qd_desc); 4826 bdev->internal.qd_desc = NULL; 4827 } 4828 } 4829 4830 static void 4831 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4832 { 4833 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4834 } 4835 4836 void 4837 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4838 { 4839 int rc; 4840 4841 if (bdev->internal.new_period == period) { 4842 return; 4843 } 4844 4845 bdev->internal.new_period = period; 4846 4847 if (bdev->internal.qd_desc != NULL) { 4848 assert(bdev->internal.period != 0); 4849 4850 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4851 bdev_update_qd_sampling_period, bdev); 4852 return; 4853 } 4854 4855 assert(bdev->internal.period == 0); 4856 4857 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4858 NULL, &bdev->internal.qd_desc); 4859 if (rc != 0) { 4860 return; 4861 } 4862 4863 bdev->internal.period = period; 4864 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4865 bdev, period); 4866 } 4867 4868 struct bdev_get_current_qd_ctx { 4869 uint64_t current_qd; 4870 spdk_bdev_get_current_qd_cb cb_fn; 4871 void *cb_arg; 4872 }; 4873 4874 static void 4875 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4876 { 4877 struct bdev_get_current_qd_ctx *ctx = _ctx; 4878 4879 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4880 4881 free(ctx); 4882 } 4883 4884 static void 4885 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4886 struct spdk_io_channel *io_ch, void *_ctx) 4887 { 4888 struct bdev_get_current_qd_ctx *ctx = _ctx; 4889 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4890 4891 ctx->current_qd += bdev_ch->io_outstanding; 4892 4893 spdk_bdev_for_each_channel_continue(i, 0); 4894 } 4895 4896 void 4897 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4898 void *cb_arg) 4899 { 4900 struct bdev_get_current_qd_ctx *ctx; 4901 4902 assert(cb_fn != NULL); 4903 4904 ctx = calloc(1, sizeof(*ctx)); 4905 if (ctx == NULL) { 4906 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4907 return; 4908 } 4909 4910 ctx->cb_fn = cb_fn; 4911 ctx->cb_arg = cb_arg; 4912 4913 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4914 } 4915 4916 static void 4917 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4918 { 4919 assert(desc->thread == spdk_get_thread()); 4920 4921 spdk_spin_lock(&desc->spinlock); 4922 desc->refs--; 4923 if (!desc->closed) { 4924 spdk_spin_unlock(&desc->spinlock); 4925 desc->callback.event_fn(type, 4926 desc->bdev, 4927 desc->callback.ctx); 4928 return; 4929 } else if (desc->refs == 0) { 4930 /* This descriptor was closed after this event_notify message was sent. 4931 * spdk_bdev_close() could not free the descriptor since this message was 4932 * in flight, so we free it now using bdev_desc_free(). 
4933 */ 4934 spdk_spin_unlock(&desc->spinlock); 4935 bdev_desc_free(desc); 4936 return; 4937 } 4938 spdk_spin_unlock(&desc->spinlock); 4939 } 4940 4941 static void 4942 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 4943 { 4944 spdk_spin_lock(&desc->spinlock); 4945 desc->refs++; 4946 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 4947 spdk_spin_unlock(&desc->spinlock); 4948 } 4949 4950 static void 4951 _resize_notify(void *ctx) 4952 { 4953 struct spdk_bdev_desc *desc = ctx; 4954 4955 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 4956 } 4957 4958 int 4959 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4960 { 4961 struct spdk_bdev_desc *desc; 4962 int ret; 4963 4964 if (size == bdev->blockcnt) { 4965 return 0; 4966 } 4967 4968 spdk_spin_lock(&bdev->internal.spinlock); 4969 4970 /* bdev has open descriptors */ 4971 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4972 bdev->blockcnt > size) { 4973 ret = -EBUSY; 4974 } else { 4975 bdev->blockcnt = size; 4976 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4977 event_notify(desc, _resize_notify); 4978 } 4979 ret = 0; 4980 } 4981 4982 spdk_spin_unlock(&bdev->internal.spinlock); 4983 4984 return ret; 4985 } 4986 4987 /* 4988 * Convert I/O offset and length from bytes to blocks. 4989 * 4990 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4991 */ 4992 static uint64_t 4993 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4994 uint64_t num_bytes, uint64_t *num_blocks) 4995 { 4996 uint32_t block_size = bdev->blocklen; 4997 uint8_t shift_cnt; 4998 4999 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 5000 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5001 shift_cnt = spdk_u32log2(block_size); 5002 *offset_blocks = offset_bytes >> shift_cnt; 5003 *num_blocks = num_bytes >> shift_cnt; 5004 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5005 (num_bytes - (*num_blocks << shift_cnt)); 5006 } else { 5007 *offset_blocks = offset_bytes / block_size; 5008 *num_blocks = num_bytes / block_size; 5009 return (offset_bytes % block_size) | (num_bytes % block_size); 5010 } 5011 } 5012 5013 static bool 5014 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5015 { 5016 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5017 * has been an overflow and hence the offset has been wrapped around */ 5018 if (offset_blocks + num_blocks < offset_blocks) { 5019 return false; 5020 } 5021 5022 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5023 if (offset_blocks + num_blocks > bdev->blockcnt) { 5024 return false; 5025 } 5026 5027 return true; 5028 } 5029 5030 static void 5031 bdev_seek_complete_cb(void *ctx) 5032 { 5033 struct spdk_bdev_io *bdev_io = ctx; 5034 5035 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5036 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5037 } 5038 5039 static int 5040 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5041 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5042 spdk_bdev_io_completion_cb cb, void *cb_arg) 5043 { 5044 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5045 struct spdk_bdev_io *bdev_io; 5046 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5047 5048 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == 
SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5049 5050 /* Check if offset_blocks is valid looking at the validity of one block */ 5051 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5052 return -EINVAL; 5053 } 5054 5055 bdev_io = bdev_channel_get_io(channel); 5056 if (!bdev_io) { 5057 return -ENOMEM; 5058 } 5059 5060 bdev_io->internal.ch = channel; 5061 bdev_io->internal.desc = desc; 5062 bdev_io->type = io_type; 5063 bdev_io->u.bdev.offset_blocks = offset_blocks; 5064 bdev_io->u.bdev.memory_domain = NULL; 5065 bdev_io->u.bdev.memory_domain_ctx = NULL; 5066 bdev_io->u.bdev.accel_sequence = NULL; 5067 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5068 5069 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5070 /* In case bdev doesn't support seek to next data/hole offset, 5071 * it is assumed that only data and no holes are present */ 5072 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5073 bdev_io->u.bdev.seek.offset = offset_blocks; 5074 } else { 5075 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5076 } 5077 5078 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5079 return 0; 5080 } 5081 5082 bdev_io_submit(bdev_io); 5083 return 0; 5084 } 5085 5086 int 5087 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5088 uint64_t offset_blocks, 5089 spdk_bdev_io_completion_cb cb, void *cb_arg) 5090 { 5091 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5092 } 5093 5094 int 5095 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5096 uint64_t offset_blocks, 5097 spdk_bdev_io_completion_cb cb, void *cb_arg) 5098 { 5099 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5100 } 5101 5102 uint64_t 5103 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 5104 { 5105 return bdev_io->u.bdev.seek.offset; 5106 } 5107 5108 static int 5109 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5110 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5111 spdk_bdev_io_completion_cb cb, void *cb_arg) 5112 { 5113 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5114 struct spdk_bdev_io *bdev_io; 5115 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5116 5117 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5118 return -EINVAL; 5119 } 5120 5121 bdev_io = bdev_channel_get_io(channel); 5122 if (!bdev_io) { 5123 return -ENOMEM; 5124 } 5125 5126 bdev_io->internal.ch = channel; 5127 bdev_io->internal.desc = desc; 5128 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5129 bdev_io->u.bdev.iovs = &bdev_io->iov; 5130 bdev_io->u.bdev.iovs[0].iov_base = buf; 5131 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5132 bdev_io->u.bdev.iovcnt = 1; 5133 bdev_io->u.bdev.md_buf = md_buf; 5134 bdev_io->u.bdev.num_blocks = num_blocks; 5135 bdev_io->u.bdev.offset_blocks = offset_blocks; 5136 bdev_io->u.bdev.memory_domain = NULL; 5137 bdev_io->u.bdev.memory_domain_ctx = NULL; 5138 bdev_io->u.bdev.accel_sequence = NULL; 5139 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5140 5141 bdev_io_submit(bdev_io); 5142 return 0; 5143 } 5144 5145 int 5146 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5147 void *buf, uint64_t offset, uint64_t nbytes, 5148 spdk_bdev_io_completion_cb cb, void *cb_arg) 5149 { 5150 uint64_t offset_blocks, num_blocks; 5151 5152 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5153 nbytes, &num_blocks) != 0) { 5154 return -EINVAL; 5155 } 5156 5157 
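	/* Both offset and nbytes must be multiples of the block size. For example,
	 * assuming a 512-byte block size, offset = 4096 and nbytes = 8192 translate
	 * to offset_blocks = 8 and num_blocks = 16; an unaligned value makes
	 * bdev_bytes_to_blocks() return non-zero and the call fails with -EINVAL
	 * above. */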
return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5158 } 5159 5160 int 5161 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5162 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5163 spdk_bdev_io_completion_cb cb, void *cb_arg) 5164 { 5165 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5166 } 5167 5168 int 5169 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5170 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5171 spdk_bdev_io_completion_cb cb, void *cb_arg) 5172 { 5173 struct iovec iov = { 5174 .iov_base = buf, 5175 }; 5176 5177 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5178 return -EINVAL; 5179 } 5180 5181 if (md_buf && !_is_buf_allocated(&iov)) { 5182 return -EINVAL; 5183 } 5184 5185 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5186 cb, cb_arg); 5187 } 5188 5189 int 5190 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5191 struct iovec *iov, int iovcnt, 5192 uint64_t offset, uint64_t nbytes, 5193 spdk_bdev_io_completion_cb cb, void *cb_arg) 5194 { 5195 uint64_t offset_blocks, num_blocks; 5196 5197 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5198 nbytes, &num_blocks) != 0) { 5199 return -EINVAL; 5200 } 5201 5202 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5203 } 5204 5205 static int 5206 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5207 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5208 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5209 struct spdk_accel_sequence *seq, 5210 spdk_bdev_io_completion_cb cb, void *cb_arg) 5211 { 5212 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5213 struct spdk_bdev_io *bdev_io; 5214 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5215 5216 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5217 return -EINVAL; 5218 } 5219 5220 bdev_io = bdev_channel_get_io(channel); 5221 if (!bdev_io) { 5222 return -ENOMEM; 5223 } 5224 5225 bdev_io->internal.ch = channel; 5226 bdev_io->internal.desc = desc; 5227 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5228 bdev_io->u.bdev.iovs = iov; 5229 bdev_io->u.bdev.iovcnt = iovcnt; 5230 bdev_io->u.bdev.md_buf = md_buf; 5231 bdev_io->u.bdev.num_blocks = num_blocks; 5232 bdev_io->u.bdev.offset_blocks = offset_blocks; 5233 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5234 bdev_io->internal.memory_domain = domain; 5235 bdev_io->internal.memory_domain_ctx = domain_ctx; 5236 bdev_io->internal.accel_sequence = seq; 5237 bdev_io->internal.has_accel_sequence = seq != NULL; 5238 bdev_io->u.bdev.memory_domain = domain; 5239 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5240 bdev_io->u.bdev.accel_sequence = seq; 5241 5242 _bdev_io_submit_ext(desc, bdev_io); 5243 5244 return 0; 5245 } 5246 5247 int 5248 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5249 struct iovec *iov, int iovcnt, 5250 uint64_t offset_blocks, uint64_t num_blocks, 5251 spdk_bdev_io_completion_cb cb, void *cb_arg) 5252 { 5253 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5254 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5255 } 5256 5257 int 5258 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5259 struct 
iovec *iov, int iovcnt, void *md_buf, 5260 uint64_t offset_blocks, uint64_t num_blocks, 5261 spdk_bdev_io_completion_cb cb, void *cb_arg) 5262 { 5263 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5264 return -EINVAL; 5265 } 5266 5267 if (md_buf && !_is_buf_allocated(iov)) { 5268 return -EINVAL; 5269 } 5270 5271 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5272 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5273 } 5274 5275 static inline bool 5276 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5277 { 5278 /* 5279 * We check if opts size is at least of size when we first introduced 5280 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5281 * are not checked internal. 5282 */ 5283 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5284 sizeof(opts->metadata) && 5285 opts->size <= sizeof(*opts) && 5286 /* When memory domain is used, the user must provide data buffers */ 5287 (!opts->memory_domain || (iov && iov[0].iov_base)); 5288 } 5289 5290 int 5291 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5292 struct iovec *iov, int iovcnt, 5293 uint64_t offset_blocks, uint64_t num_blocks, 5294 spdk_bdev_io_completion_cb cb, void *cb_arg, 5295 struct spdk_bdev_ext_io_opts *opts) 5296 { 5297 void *md = NULL; 5298 5299 if (opts) { 5300 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5301 return -EINVAL; 5302 } 5303 md = opts->metadata; 5304 } 5305 5306 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5307 return -EINVAL; 5308 } 5309 5310 if (md && !_is_buf_allocated(iov)) { 5311 return -EINVAL; 5312 } 5313 5314 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5315 num_blocks, 5316 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5317 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5318 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5319 cb, cb_arg); 5320 } 5321 5322 static int 5323 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5324 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5325 spdk_bdev_io_completion_cb cb, void *cb_arg) 5326 { 5327 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5328 struct spdk_bdev_io *bdev_io; 5329 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5330 5331 if (!desc->write) { 5332 return -EBADF; 5333 } 5334 5335 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5336 return -EINVAL; 5337 } 5338 5339 bdev_io = bdev_channel_get_io(channel); 5340 if (!bdev_io) { 5341 return -ENOMEM; 5342 } 5343 5344 bdev_io->internal.ch = channel; 5345 bdev_io->internal.desc = desc; 5346 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5347 bdev_io->u.bdev.iovs = &bdev_io->iov; 5348 bdev_io->u.bdev.iovs[0].iov_base = buf; 5349 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5350 bdev_io->u.bdev.iovcnt = 1; 5351 bdev_io->u.bdev.md_buf = md_buf; 5352 bdev_io->u.bdev.num_blocks = num_blocks; 5353 bdev_io->u.bdev.offset_blocks = offset_blocks; 5354 bdev_io->u.bdev.memory_domain = NULL; 5355 bdev_io->u.bdev.memory_domain_ctx = NULL; 5356 bdev_io->u.bdev.accel_sequence = NULL; 5357 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5358 5359 bdev_io_submit(bdev_io); 5360 return 0; 5361 } 5362 5363 int 5364 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5365 void *buf, uint64_t offset, uint64_t nbytes, 5366 spdk_bdev_io_completion_cb cb, void *cb_arg) 5367 { 5368 uint64_t 
offset_blocks, num_blocks; 5369 5370 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5371 nbytes, &num_blocks) != 0) { 5372 return -EINVAL; 5373 } 5374 5375 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5376 } 5377 5378 int 5379 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5380 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5381 spdk_bdev_io_completion_cb cb, void *cb_arg) 5382 { 5383 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5384 cb, cb_arg); 5385 } 5386 5387 int 5388 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5389 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5390 spdk_bdev_io_completion_cb cb, void *cb_arg) 5391 { 5392 struct iovec iov = { 5393 .iov_base = buf, 5394 }; 5395 5396 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5397 return -EINVAL; 5398 } 5399 5400 if (md_buf && !_is_buf_allocated(&iov)) { 5401 return -EINVAL; 5402 } 5403 5404 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5405 cb, cb_arg); 5406 } 5407 5408 static int 5409 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5410 struct iovec *iov, int iovcnt, void *md_buf, 5411 uint64_t offset_blocks, uint64_t num_blocks, 5412 struct spdk_memory_domain *domain, void *domain_ctx, 5413 struct spdk_accel_sequence *seq, 5414 spdk_bdev_io_completion_cb cb, void *cb_arg) 5415 { 5416 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5417 struct spdk_bdev_io *bdev_io; 5418 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5419 5420 if (!desc->write) { 5421 return -EBADF; 5422 } 5423 5424 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5425 return -EINVAL; 5426 } 5427 5428 bdev_io = bdev_channel_get_io(channel); 5429 if (!bdev_io) { 5430 return -ENOMEM; 5431 } 5432 5433 bdev_io->internal.ch = channel; 5434 bdev_io->internal.desc = desc; 5435 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5436 bdev_io->u.bdev.iovs = iov; 5437 bdev_io->u.bdev.iovcnt = iovcnt; 5438 bdev_io->u.bdev.md_buf = md_buf; 5439 bdev_io->u.bdev.num_blocks = num_blocks; 5440 bdev_io->u.bdev.offset_blocks = offset_blocks; 5441 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5442 bdev_io->internal.memory_domain = domain; 5443 bdev_io->internal.memory_domain_ctx = domain_ctx; 5444 bdev_io->internal.accel_sequence = seq; 5445 bdev_io->internal.has_accel_sequence = seq != NULL; 5446 bdev_io->u.bdev.memory_domain = domain; 5447 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5448 bdev_io->u.bdev.accel_sequence = seq; 5449 5450 _bdev_io_submit_ext(desc, bdev_io); 5451 5452 return 0; 5453 } 5454 5455 int 5456 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5457 struct iovec *iov, int iovcnt, 5458 uint64_t offset, uint64_t len, 5459 spdk_bdev_io_completion_cb cb, void *cb_arg) 5460 { 5461 uint64_t offset_blocks, num_blocks; 5462 5463 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5464 len, &num_blocks) != 0) { 5465 return -EINVAL; 5466 } 5467 5468 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5469 } 5470 5471 int 5472 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5473 struct iovec *iov, int iovcnt, 5474 uint64_t offset_blocks, uint64_t num_blocks, 5475 spdk_bdev_io_completion_cb cb, void *cb_arg) 5476 { 
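	/* Thin wrapper: forward to bdev_writev_blocks_with_md() with no separate
	 * metadata buffer, no memory domain and no accel sequence. Callers that
	 * need those use the _with_md or _ext variants below. */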
5477 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5478 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5479 } 5480 5481 int 5482 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5483 struct iovec *iov, int iovcnt, void *md_buf, 5484 uint64_t offset_blocks, uint64_t num_blocks, 5485 spdk_bdev_io_completion_cb cb, void *cb_arg) 5486 { 5487 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5488 return -EINVAL; 5489 } 5490 5491 if (md_buf && !_is_buf_allocated(iov)) { 5492 return -EINVAL; 5493 } 5494 5495 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5496 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5497 } 5498 5499 int 5500 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5501 struct iovec *iov, int iovcnt, 5502 uint64_t offset_blocks, uint64_t num_blocks, 5503 spdk_bdev_io_completion_cb cb, void *cb_arg, 5504 struct spdk_bdev_ext_io_opts *opts) 5505 { 5506 void *md = NULL; 5507 5508 if (opts) { 5509 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5510 return -EINVAL; 5511 } 5512 md = opts->metadata; 5513 } 5514 5515 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5516 return -EINVAL; 5517 } 5518 5519 if (md && !_is_buf_allocated(iov)) { 5520 return -EINVAL; 5521 } 5522 5523 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5524 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5525 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5526 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5527 cb, cb_arg); 5528 } 5529 5530 static void 5531 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5532 { 5533 struct spdk_bdev_io *parent_io = cb_arg; 5534 struct spdk_bdev *bdev = parent_io->bdev; 5535 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5536 int i, rc = 0; 5537 5538 if (!success) { 5539 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5540 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5541 spdk_bdev_free_io(bdev_io); 5542 return; 5543 } 5544 5545 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5546 rc = memcmp(read_buf, 5547 parent_io->u.bdev.iovs[i].iov_base, 5548 parent_io->u.bdev.iovs[i].iov_len); 5549 if (rc) { 5550 break; 5551 } 5552 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5553 } 5554 5555 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5556 rc = memcmp(bdev_io->u.bdev.md_buf, 5557 parent_io->u.bdev.md_buf, 5558 spdk_bdev_get_md_size(bdev)); 5559 } 5560 5561 spdk_bdev_free_io(bdev_io); 5562 5563 if (rc == 0) { 5564 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5565 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5566 } else { 5567 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5568 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5569 } 5570 } 5571 5572 static void 5573 bdev_compare_do_read(void *_bdev_io) 5574 { 5575 struct spdk_bdev_io *bdev_io = _bdev_io; 5576 int rc; 5577 5578 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5579 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5580 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5581 bdev_compare_do_read_done, bdev_io); 5582 5583 if (rc == -ENOMEM) { 5584 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5585 } else if (rc != 0) { 5586 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5587 
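			/* Errors other than -ENOMEM are not retried; complete the
			 * compare request itself as failed right away. */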
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5588 } 5589 } 5590 5591 static int 5592 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5593 struct iovec *iov, int iovcnt, void *md_buf, 5594 uint64_t offset_blocks, uint64_t num_blocks, 5595 spdk_bdev_io_completion_cb cb, void *cb_arg) 5596 { 5597 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5598 struct spdk_bdev_io *bdev_io; 5599 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5600 5601 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5602 return -EINVAL; 5603 } 5604 5605 bdev_io = bdev_channel_get_io(channel); 5606 if (!bdev_io) { 5607 return -ENOMEM; 5608 } 5609 5610 bdev_io->internal.ch = channel; 5611 bdev_io->internal.desc = desc; 5612 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5613 bdev_io->u.bdev.iovs = iov; 5614 bdev_io->u.bdev.iovcnt = iovcnt; 5615 bdev_io->u.bdev.md_buf = md_buf; 5616 bdev_io->u.bdev.num_blocks = num_blocks; 5617 bdev_io->u.bdev.offset_blocks = offset_blocks; 5618 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5619 bdev_io->u.bdev.memory_domain = NULL; 5620 bdev_io->u.bdev.memory_domain_ctx = NULL; 5621 bdev_io->u.bdev.accel_sequence = NULL; 5622 5623 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5624 bdev_io_submit(bdev_io); 5625 return 0; 5626 } 5627 5628 bdev_compare_do_read(bdev_io); 5629 5630 return 0; 5631 } 5632 5633 int 5634 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5635 struct iovec *iov, int iovcnt, 5636 uint64_t offset_blocks, uint64_t num_blocks, 5637 spdk_bdev_io_completion_cb cb, void *cb_arg) 5638 { 5639 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5640 num_blocks, cb, cb_arg); 5641 } 5642 5643 int 5644 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5645 struct iovec *iov, int iovcnt, void *md_buf, 5646 uint64_t offset_blocks, uint64_t num_blocks, 5647 spdk_bdev_io_completion_cb cb, void *cb_arg) 5648 { 5649 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5650 return -EINVAL; 5651 } 5652 5653 if (md_buf && !_is_buf_allocated(iov)) { 5654 return -EINVAL; 5655 } 5656 5657 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5658 num_blocks, cb, cb_arg); 5659 } 5660 5661 static int 5662 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5663 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5664 spdk_bdev_io_completion_cb cb, void *cb_arg) 5665 { 5666 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5667 struct spdk_bdev_io *bdev_io; 5668 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5669 5670 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5671 return -EINVAL; 5672 } 5673 5674 bdev_io = bdev_channel_get_io(channel); 5675 if (!bdev_io) { 5676 return -ENOMEM; 5677 } 5678 5679 bdev_io->internal.ch = channel; 5680 bdev_io->internal.desc = desc; 5681 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5682 bdev_io->u.bdev.iovs = &bdev_io->iov; 5683 bdev_io->u.bdev.iovs[0].iov_base = buf; 5684 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5685 bdev_io->u.bdev.iovcnt = 1; 5686 bdev_io->u.bdev.md_buf = md_buf; 5687 bdev_io->u.bdev.num_blocks = num_blocks; 5688 bdev_io->u.bdev.offset_blocks = offset_blocks; 5689 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5690 bdev_io->u.bdev.memory_domain = NULL; 5691 bdev_io->u.bdev.memory_domain_ctx = 
NULL; 5692 bdev_io->u.bdev.accel_sequence = NULL; 5693 5694 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5695 bdev_io_submit(bdev_io); 5696 return 0; 5697 } 5698 5699 bdev_compare_do_read(bdev_io); 5700 5701 return 0; 5702 } 5703 5704 int 5705 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5706 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5707 spdk_bdev_io_completion_cb cb, void *cb_arg) 5708 { 5709 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5710 cb, cb_arg); 5711 } 5712 5713 int 5714 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5715 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5716 spdk_bdev_io_completion_cb cb, void *cb_arg) 5717 { 5718 struct iovec iov = { 5719 .iov_base = buf, 5720 }; 5721 5722 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5723 return -EINVAL; 5724 } 5725 5726 if (md_buf && !_is_buf_allocated(&iov)) { 5727 return -EINVAL; 5728 } 5729 5730 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5731 cb, cb_arg); 5732 } 5733 5734 static void 5735 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 5736 { 5737 struct spdk_bdev_io *bdev_io = ctx; 5738 5739 if (unlock_status) { 5740 SPDK_ERRLOG("LBA range unlock failed\n"); 5741 } 5742 5743 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5744 false, bdev_io->internal.caller_ctx); 5745 } 5746 5747 static void 5748 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5749 { 5750 bdev_io->internal.status = status; 5751 5752 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5753 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5754 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5755 } 5756 5757 static void 5758 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5759 { 5760 struct spdk_bdev_io *parent_io = cb_arg; 5761 5762 if (!success) { 5763 SPDK_ERRLOG("Compare and write operation failed\n"); 5764 } 5765 5766 spdk_bdev_free_io(bdev_io); 5767 5768 bdev_comparev_and_writev_blocks_unlock(parent_io, 5769 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5770 } 5771 5772 static void 5773 bdev_compare_and_write_do_write(void *_bdev_io) 5774 { 5775 struct spdk_bdev_io *bdev_io = _bdev_io; 5776 int rc; 5777 5778 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5779 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5780 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5781 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5782 bdev_compare_and_write_do_write_done, bdev_io); 5783 5784 5785 if (rc == -ENOMEM) { 5786 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5787 } else if (rc != 0) { 5788 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5789 } 5790 } 5791 5792 static void 5793 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5794 { 5795 struct spdk_bdev_io *parent_io = cb_arg; 5796 5797 spdk_bdev_free_io(bdev_io); 5798 5799 if (!success) { 5800 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5801 return; 5802 } 5803 5804 bdev_compare_and_write_do_write(parent_io); 5805 } 5806 5807 static void 5808 bdev_compare_and_write_do_compare(void *_bdev_io) 5809 { 5810 struct spdk_bdev_io *bdev_io = _bdev_io; 5811 int rc; 5812 5813 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5814 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5815 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5816 bdev_compare_and_write_do_compare_done, bdev_io); 5817 5818 if (rc == -ENOMEM) { 5819 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5820 } else if (rc != 0) { 5821 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5822 } 5823 } 5824 5825 static void 5826 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 5827 { 5828 struct spdk_bdev_io *bdev_io = ctx; 5829 5830 if (status) { 5831 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5832 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5833 return; 5834 } 5835 5836 bdev_compare_and_write_do_compare(bdev_io); 5837 } 5838 5839 int 5840 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5841 struct iovec *compare_iov, int compare_iovcnt, 5842 struct iovec *write_iov, int write_iovcnt, 5843 uint64_t offset_blocks, uint64_t num_blocks, 5844 spdk_bdev_io_completion_cb cb, void *cb_arg) 5845 { 5846 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5847 struct spdk_bdev_io *bdev_io; 5848 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5849 5850 if (!desc->write) { 5851 return -EBADF; 5852 } 5853 5854 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5855 return -EINVAL; 5856 } 5857 5858 if (num_blocks > bdev->acwu) { 5859 return -EINVAL; 5860 } 5861 5862 bdev_io = bdev_channel_get_io(channel); 5863 if (!bdev_io) { 5864 return -ENOMEM; 5865 } 5866 5867 bdev_io->internal.ch = channel; 5868 bdev_io->internal.desc = desc; 5869 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5870 bdev_io->u.bdev.iovs = compare_iov; 5871 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5872 bdev_io->u.bdev.fused_iovs = write_iov; 5873 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5874 bdev_io->u.bdev.md_buf = NULL; 5875 bdev_io->u.bdev.num_blocks = num_blocks; 5876 bdev_io->u.bdev.offset_blocks = offset_blocks; 5877 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5878 
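/* Editor's note: illustrative caller-side sketch, not part of this function.
 * A fused compare-and-write compares `cmp_iov` against the on-disk data and,
 * only if it matches, writes `wr_iov` to the same blocks; num_blocks must not
 * exceed the bdev's atomic compare-and-write unit (the `acwu` check above).
 * `caw_done`, `my_desc`, `my_ch`, `cmp_iov` and `wr_iov` are hypothetical
 * placeholders.
 *
 *   static void
 *   caw_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *   {
 *           // `success` is false both on a miscompare and on an I/O error;
 *           // spdk_bdev_io_get_nvme_fused_status() can distinguish the two
 *           // before the I/O is released.
 *           spdk_bdev_free_io(bdev_io);
 *   }
 *
 *   rc = spdk_bdev_comparev_and_writev_blocks(my_desc, my_ch,
 *                                             cmp_iov, cmp_iovcnt,
 *                                             wr_iov, wr_iovcnt,
 *                                             offset_blocks, 1,
 *                                             caw_done, NULL);
 */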
bdev_io->u.bdev.memory_domain = NULL; 5879 bdev_io->u.bdev.memory_domain_ctx = NULL; 5880 bdev_io->u.bdev.accel_sequence = NULL; 5881 5882 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5883 bdev_io_submit(bdev_io); 5884 return 0; 5885 } 5886 5887 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5888 bdev_comparev_and_writev_blocks_locked, bdev_io); 5889 } 5890 5891 int 5892 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5893 struct iovec *iov, int iovcnt, 5894 uint64_t offset_blocks, uint64_t num_blocks, 5895 bool populate, 5896 spdk_bdev_io_completion_cb cb, void *cb_arg) 5897 { 5898 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5899 struct spdk_bdev_io *bdev_io; 5900 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5901 5902 if (!desc->write) { 5903 return -EBADF; 5904 } 5905 5906 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5907 return -EINVAL; 5908 } 5909 5910 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5911 return -ENOTSUP; 5912 } 5913 5914 bdev_io = bdev_channel_get_io(channel); 5915 if (!bdev_io) { 5916 return -ENOMEM; 5917 } 5918 5919 bdev_io->internal.ch = channel; 5920 bdev_io->internal.desc = desc; 5921 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5922 bdev_io->u.bdev.num_blocks = num_blocks; 5923 bdev_io->u.bdev.offset_blocks = offset_blocks; 5924 bdev_io->u.bdev.iovs = iov; 5925 bdev_io->u.bdev.iovcnt = iovcnt; 5926 bdev_io->u.bdev.md_buf = NULL; 5927 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5928 bdev_io->u.bdev.zcopy.commit = 0; 5929 bdev_io->u.bdev.zcopy.start = 1; 5930 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5931 bdev_io->u.bdev.memory_domain = NULL; 5932 bdev_io->u.bdev.memory_domain_ctx = NULL; 5933 bdev_io->u.bdev.accel_sequence = NULL; 5934 5935 bdev_io_submit(bdev_io); 5936 5937 return 0; 5938 } 5939 5940 int 5941 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5942 spdk_bdev_io_completion_cb cb, void *cb_arg) 5943 { 5944 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5945 return -EINVAL; 5946 } 5947 5948 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0;
5949 bdev_io->u.bdev.zcopy.start = 0;
5950 bdev_io->internal.caller_ctx = cb_arg;
5951 bdev_io->internal.cb = cb;
5952 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
5953
5954 bdev_io_submit(bdev_io);
5955
5956 return 0;
5957 }
5958
5959 int
5960 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5961 uint64_t offset, uint64_t len,
5962 spdk_bdev_io_completion_cb cb, void *cb_arg)
5963 {
5964 uint64_t offset_blocks, num_blocks;
5965
5966 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5967 len, &num_blocks) != 0) {
5968 return -EINVAL;
5969 }
5970
5971 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
5972 }
5973
5974 int
5975 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5976 uint64_t offset_blocks, uint64_t num_blocks,
5977 spdk_bdev_io_completion_cb cb, void *cb_arg)
5978 {
5979 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5980 struct spdk_bdev_io *bdev_io;
5981 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5982
5983 if (!desc->write) {
5984 return -EBADF;
5985 }
5986
5987 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5988 return -EINVAL;
5989 }
5990
5991 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
5992 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
5993 return -ENOTSUP;
5994 }
5995
5996 bdev_io = bdev_channel_get_io(channel);
5997
5998 if (!bdev_io) {
5999 return -ENOMEM;
6000 }
6001
6002 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
6003 bdev_io->internal.ch = channel;
6004 bdev_io->internal.desc = desc;
6005 bdev_io->u.bdev.offset_blocks = offset_blocks;
6006 bdev_io->u.bdev.num_blocks = num_blocks;
6007 bdev_io_init(bdev_io, bdev, cb_arg, cb);
6008 bdev_io->u.bdev.memory_domain = NULL;
6009 bdev_io->u.bdev.memory_domain_ctx = NULL;
6010 bdev_io->u.bdev.accel_sequence = NULL;
6011
6012 /* If the write_zeroes size is large and should be split, use the generic split
6013 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not.
6014 *
6015 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
6016 * or emulate it using a regular write request otherwise.
6017 */ 6018 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6019 bdev_io->internal.split) { 6020 bdev_io_submit(bdev_io); 6021 return 0; 6022 } 6023 6024 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6025 6026 return bdev_write_zero_buffer(bdev_io); 6027 } 6028 6029 int 6030 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6031 uint64_t offset, uint64_t nbytes, 6032 spdk_bdev_io_completion_cb cb, void *cb_arg) 6033 { 6034 uint64_t offset_blocks, num_blocks; 6035 6036 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6037 nbytes, &num_blocks) != 0) { 6038 return -EINVAL; 6039 } 6040 6041 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6042 } 6043 6044 int 6045 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6046 uint64_t offset_blocks, uint64_t num_blocks, 6047 spdk_bdev_io_completion_cb cb, void *cb_arg) 6048 { 6049 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6050 struct spdk_bdev_io *bdev_io; 6051 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6052 6053 if (!desc->write) { 6054 return -EBADF; 6055 } 6056 6057 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6058 return -EINVAL; 6059 } 6060 6061 if (num_blocks == 0) { 6062 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 6063 return -EINVAL; 6064 } 6065 6066 bdev_io = bdev_channel_get_io(channel); 6067 if (!bdev_io) { 6068 return -ENOMEM; 6069 } 6070 6071 bdev_io->internal.ch = channel; 6072 bdev_io->internal.desc = desc; 6073 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6074 6075 bdev_io->u.bdev.iovs = &bdev_io->iov; 6076 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6077 bdev_io->u.bdev.iovs[0].iov_len = 0; 6078 bdev_io->u.bdev.iovcnt = 1; 6079 6080 bdev_io->u.bdev.offset_blocks = offset_blocks; 6081 bdev_io->u.bdev.num_blocks = num_blocks; 6082 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6083 bdev_io->u.bdev.memory_domain = NULL; 6084 bdev_io->u.bdev.memory_domain_ctx = NULL; 6085 bdev_io->u.bdev.accel_sequence = NULL; 6086 6087 bdev_io_submit(bdev_io); 6088 return 0; 6089 } 6090 6091 int 6092 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6093 uint64_t offset, uint64_t length, 6094 spdk_bdev_io_completion_cb cb, void *cb_arg) 6095 { 6096 uint64_t offset_blocks, num_blocks; 6097 6098 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6099 length, &num_blocks) != 0) { 6100 return -EINVAL; 6101 } 6102 6103 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6104 } 6105 6106 int 6107 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6108 uint64_t offset_blocks, uint64_t num_blocks, 6109 spdk_bdev_io_completion_cb cb, void *cb_arg) 6110 { 6111 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6112 struct spdk_bdev_io *bdev_io; 6113 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6114 6115 if (!desc->write) { 6116 return -EBADF; 6117 } 6118 6119 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6120 return -EINVAL; 6121 } 6122 6123 bdev_io = bdev_channel_get_io(channel); 6124 if (!bdev_io) { 6125 return -ENOMEM; 6126 } 6127 6128 bdev_io->internal.ch = channel; 6129 bdev_io->internal.desc = desc; 6130 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6131 bdev_io->u.bdev.iovs = NULL; 6132 bdev_io->u.bdev.iovcnt = 0; 6133 bdev_io->u.bdev.offset_blocks = offset_blocks; 6134 bdev_io->u.bdev.num_blocks = num_blocks; 6135 
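/* Editor's note: caller-side sketch for the block-based helpers defined
 * nearby, spdk_bdev_unmap_blocks() and spdk_bdev_flush_blocks(). `my_desc`
 * and `my_ch` are assumed to be globals held by the caller, and `unmap_done`
 * and `flush_done` are hypothetical placeholders; error handling is elided.
 *
 *   static void
 *   flush_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *   {
 *           spdk_bdev_free_io(bdev_io);
 *   }
 *
 *   static void
 *   unmap_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *   {
 *           spdk_bdev_free_io(bdev_io);
 *           // Once the unmap completes, flush the same 128-block range.
 *           spdk_bdev_flush_blocks(my_desc, my_ch, 0, 128, flush_done, NULL);
 *   }
 *
 *   // Unmap the first 128 blocks; the flush is chained from the completion.
 *   spdk_bdev_unmap_blocks(my_desc, my_ch, 0, 128, unmap_done, NULL);
 */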
bdev_io->u.bdev.memory_domain = NULL;
6136 bdev_io->u.bdev.memory_domain_ctx = NULL;
6137 bdev_io->u.bdev.accel_sequence = NULL;
6138 bdev_io_init(bdev_io, bdev, cb_arg, cb);
6139
6140 bdev_io_submit(bdev_io);
6141 return 0;
6142 }
6143
6144 static int bdev_reset_poll_for_outstanding_io(void *ctx);
6145
6146 static void
6147 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
6148 {
6149 struct spdk_bdev_channel *ch = _ctx;
6150 struct spdk_bdev_io *bdev_io;
6151
6152 bdev_io = TAILQ_FIRST(&ch->queued_resets);
6153
6154 if (status == -EBUSY) {
6155 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
6156 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
6157 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
6158 } else {
6159 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6160
6161 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) {
6162 /* If outstanding IOs are still present and reset_io_drain_timeout
6163 * seconds passed, start the reset. */
6164 bdev_io_submit_reset(bdev_io);
6165 } else {
6166 /* We still have in-progress memory domain pull/push operations or we are
6167 * executing an accel sequence. Since we cannot abort either of those
6168 * operations, fail the reset request. */
6169 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
6170 }
6171 }
6172 } else {
6173 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6174 SPDK_DEBUGLOG(bdev,
6175 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
6176 ch->bdev->name);
6177 /* Mark the completion status as SUCCESS and complete the reset. */
6178 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
6179 }
6180 }
6181
6182 static void
6183 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6184 struct spdk_io_channel *io_ch, void *_ctx)
6185 {
6186 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch);
6187 int status = 0;
6188
6189 if (cur_ch->io_outstanding > 0 ||
6190 !TAILQ_EMPTY(&cur_ch->io_memory_domain) ||
6191 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) {
6192 /* If a channel has outstanding I/O, set status to -EBUSY. This will stop
6193 * further iteration over the rest of the channels and pass a non-zero status
6194 * to the callback function.
*/ 6195 status = -EBUSY; 6196 } 6197 spdk_bdev_for_each_channel_continue(i, status); 6198 } 6199 6200 static int 6201 bdev_reset_poll_for_outstanding_io(void *ctx) 6202 { 6203 struct spdk_bdev_channel *ch = ctx; 6204 struct spdk_bdev_io *bdev_io; 6205 6206 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6207 6208 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6209 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6210 bdev_reset_check_outstanding_io_done); 6211 6212 return SPDK_POLLER_BUSY; 6213 } 6214 6215 static void 6216 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6217 { 6218 struct spdk_bdev_channel *ch = _ctx; 6219 struct spdk_bdev_io *bdev_io; 6220 6221 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6222 6223 if (bdev->reset_io_drain_timeout == 0) { 6224 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6225 6226 bdev_io_submit_reset(bdev_io); 6227 return; 6228 } 6229 6230 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6231 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6232 6233 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6234 * submit the reset to the underlying module only if outstanding I/O 6235 * remain after reset_io_drain_timeout seconds have passed. */ 6236 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6237 bdev_reset_check_outstanding_io_done); 6238 } 6239 6240 static void 6241 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6242 struct spdk_io_channel *ch, void *_ctx) 6243 { 6244 struct spdk_bdev_channel *channel; 6245 struct spdk_bdev_mgmt_channel *mgmt_channel; 6246 struct spdk_bdev_shared_resource *shared_resource; 6247 bdev_io_tailq_t tmp_queued; 6248 6249 TAILQ_INIT(&tmp_queued); 6250 6251 channel = __io_ch_to_bdev_ch(ch); 6252 shared_resource = channel->shared_resource; 6253 mgmt_channel = shared_resource->mgmt_ch; 6254 6255 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6256 6257 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6258 /* The QoS object is always valid and readable while 6259 * the channel flag is set, so the lock here should not 6260 * be necessary. We're not in the fast path though, so 6261 * just take it anyway. */ 6262 spdk_spin_lock(&channel->bdev->internal.spinlock); 6263 if (channel->bdev->internal.qos->ch == channel) { 6264 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 6265 } 6266 spdk_spin_unlock(&channel->bdev->internal.spinlock); 6267 } 6268 6269 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6270 bdev_abort_all_buf_io(mgmt_channel, channel); 6271 bdev_abort_all_queued_io(&tmp_queued, channel); 6272 6273 spdk_bdev_for_each_channel_continue(i, 0); 6274 } 6275 6276 static void 6277 bdev_start_reset(void *ctx) 6278 { 6279 struct spdk_bdev_channel *ch = ctx; 6280 6281 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6282 bdev_reset_freeze_channel_done); 6283 } 6284 6285 static void 6286 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6287 { 6288 struct spdk_bdev *bdev = ch->bdev; 6289 6290 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6291 6292 spdk_spin_lock(&bdev->internal.spinlock); 6293 if (bdev->internal.reset_in_progress == NULL) { 6294 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6295 /* 6296 * Take a channel reference for the target bdev for the life of this 6297 * reset. 
This guards against the channel getting destroyed while 6298 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6299 * progress. We will release the reference when this reset is 6300 * completed. 6301 */ 6302 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6303 bdev_start_reset(ch); 6304 } 6305 spdk_spin_unlock(&bdev->internal.spinlock); 6306 } 6307 6308 int 6309 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6310 spdk_bdev_io_completion_cb cb, void *cb_arg) 6311 { 6312 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6313 struct spdk_bdev_io *bdev_io; 6314 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6315 6316 bdev_io = bdev_channel_get_io(channel); 6317 if (!bdev_io) { 6318 return -ENOMEM; 6319 } 6320 6321 bdev_io->internal.ch = channel; 6322 bdev_io->internal.desc = desc; 6323 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6324 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6325 bdev_io->u.reset.ch_ref = NULL; 6326 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6327 6328 spdk_spin_lock(&bdev->internal.spinlock); 6329 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6330 spdk_spin_unlock(&bdev->internal.spinlock); 6331 6332 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6333 internal.ch_link); 6334 6335 bdev_channel_start_reset(channel); 6336 6337 return 0; 6338 } 6339 6340 void 6341 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6342 struct spdk_bdev_io_stat *stat) 6343 { 6344 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6345 6346 bdev_get_io_stat(stat, channel->stat); 6347 } 6348 6349 static void 6350 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6351 { 6352 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6353 6354 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6355 bdev_iostat_ctx->cb_arg, 0); 6356 free(bdev_iostat_ctx); 6357 } 6358 6359 static void 6360 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6361 struct spdk_io_channel *ch, void *_ctx) 6362 { 6363 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6364 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6365 6366 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6367 spdk_bdev_for_each_channel_continue(i, 0); 6368 } 6369 6370 void 6371 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6372 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6373 { 6374 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6375 6376 assert(bdev != NULL); 6377 assert(stat != NULL); 6378 assert(cb != NULL); 6379 6380 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6381 if (bdev_iostat_ctx == NULL) { 6382 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6383 cb(bdev, stat, cb_arg, -ENOMEM); 6384 return; 6385 } 6386 6387 bdev_iostat_ctx->stat = stat; 6388 bdev_iostat_ctx->cb = cb; 6389 bdev_iostat_ctx->cb_arg = cb_arg; 6390 6391 /* Start with the statistics from previously deleted channels. */ 6392 spdk_spin_lock(&bdev->internal.spinlock); 6393 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6394 spdk_spin_unlock(&bdev->internal.spinlock); 6395 6396 /* Then iterate and add the statistics from each existing channel. 
*/ 6397 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6398 bdev_get_device_stat_done); 6399 } 6400 6401 struct bdev_iostat_reset_ctx { 6402 enum spdk_bdev_reset_stat_mode mode; 6403 bdev_reset_device_stat_cb cb; 6404 void *cb_arg; 6405 }; 6406 6407 static void 6408 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6409 { 6410 struct bdev_iostat_reset_ctx *ctx = _ctx; 6411 6412 ctx->cb(bdev, ctx->cb_arg, 0); 6413 6414 free(ctx); 6415 } 6416 6417 static void 6418 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6419 struct spdk_io_channel *ch, void *_ctx) 6420 { 6421 struct bdev_iostat_reset_ctx *ctx = _ctx; 6422 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6423 6424 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6425 6426 spdk_bdev_for_each_channel_continue(i, 0); 6427 } 6428 6429 void 6430 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6431 bdev_reset_device_stat_cb cb, void *cb_arg) 6432 { 6433 struct bdev_iostat_reset_ctx *ctx; 6434 6435 assert(bdev != NULL); 6436 assert(cb != NULL); 6437 6438 ctx = calloc(1, sizeof(*ctx)); 6439 if (ctx == NULL) { 6440 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6441 cb(bdev, cb_arg, -ENOMEM); 6442 return; 6443 } 6444 6445 ctx->mode = mode; 6446 ctx->cb = cb; 6447 ctx->cb_arg = cb_arg; 6448 6449 spdk_spin_lock(&bdev->internal.spinlock); 6450 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6451 spdk_spin_unlock(&bdev->internal.spinlock); 6452 6453 spdk_bdev_for_each_channel(bdev, 6454 bdev_reset_each_channel_stat, 6455 ctx, 6456 bdev_reset_device_stat_done); 6457 } 6458 6459 int 6460 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6461 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6462 spdk_bdev_io_completion_cb cb, void *cb_arg) 6463 { 6464 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6465 struct spdk_bdev_io *bdev_io; 6466 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6467 6468 if (!desc->write) { 6469 return -EBADF; 6470 } 6471 6472 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6473 return -ENOTSUP; 6474 } 6475 6476 bdev_io = bdev_channel_get_io(channel); 6477 if (!bdev_io) { 6478 return -ENOMEM; 6479 } 6480 6481 bdev_io->internal.ch = channel; 6482 bdev_io->internal.desc = desc; 6483 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6484 bdev_io->u.nvme_passthru.cmd = *cmd; 6485 bdev_io->u.nvme_passthru.buf = buf; 6486 bdev_io->u.nvme_passthru.nbytes = nbytes; 6487 bdev_io->u.nvme_passthru.md_buf = NULL; 6488 bdev_io->u.nvme_passthru.md_len = 0; 6489 6490 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6491 6492 bdev_io_submit(bdev_io); 6493 return 0; 6494 } 6495 6496 int 6497 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6498 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6499 spdk_bdev_io_completion_cb cb, void *cb_arg) 6500 { 6501 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6502 struct spdk_bdev_io *bdev_io; 6503 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6504 6505 if (!desc->write) { 6506 /* 6507 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6508 * to easily determine if the command is a read or write, but for now just 6509 * do not allow io_passthru with a read-only descriptor. 
6510 */ 6511 return -EBADF; 6512 } 6513 6514 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6515 return -ENOTSUP; 6516 } 6517 6518 bdev_io = bdev_channel_get_io(channel); 6519 if (!bdev_io) { 6520 return -ENOMEM; 6521 } 6522 6523 bdev_io->internal.ch = channel; 6524 bdev_io->internal.desc = desc; 6525 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6526 bdev_io->u.nvme_passthru.cmd = *cmd; 6527 bdev_io->u.nvme_passthru.buf = buf; 6528 bdev_io->u.nvme_passthru.nbytes = nbytes; 6529 bdev_io->u.nvme_passthru.md_buf = NULL; 6530 bdev_io->u.nvme_passthru.md_len = 0; 6531 6532 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6533 6534 bdev_io_submit(bdev_io); 6535 return 0; 6536 } 6537 6538 int 6539 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6540 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6541 spdk_bdev_io_completion_cb cb, void *cb_arg) 6542 { 6543 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6544 struct spdk_bdev_io *bdev_io; 6545 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6546 6547 if (!desc->write) { 6548 /* 6549 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6550 * to easily determine if the command is a read or write, but for now just 6551 * do not allow io_passthru with a read-only descriptor. 6552 */ 6553 return -EBADF; 6554 } 6555 6556 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6557 return -ENOTSUP; 6558 } 6559 6560 bdev_io = bdev_channel_get_io(channel); 6561 if (!bdev_io) { 6562 return -ENOMEM; 6563 } 6564 6565 bdev_io->internal.ch = channel; 6566 bdev_io->internal.desc = desc; 6567 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6568 bdev_io->u.nvme_passthru.cmd = *cmd; 6569 bdev_io->u.nvme_passthru.buf = buf; 6570 bdev_io->u.nvme_passthru.nbytes = nbytes; 6571 bdev_io->u.nvme_passthru.md_buf = md_buf; 6572 bdev_io->u.nvme_passthru.md_len = md_len; 6573 6574 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6575 6576 bdev_io_submit(bdev_io); 6577 return 0; 6578 } 6579 6580 static void bdev_abort_retry(void *ctx); 6581 static void bdev_abort(struct spdk_bdev_io *parent_io); 6582 6583 static void 6584 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6585 { 6586 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6587 struct spdk_bdev_io *parent_io = cb_arg; 6588 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6589 6590 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6591 6592 spdk_bdev_free_io(bdev_io); 6593 6594 if (!success) { 6595 /* Check if the target I/O completed in the meantime. */ 6596 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6597 if (tmp_io == bio_to_abort) { 6598 break; 6599 } 6600 } 6601 6602 /* If the target I/O still exists, set the parent to failed. 
*/ 6603 if (tmp_io != NULL) { 6604 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6605 } 6606 } 6607 6608 parent_io->u.bdev.split_outstanding--; 6609 if (parent_io->u.bdev.split_outstanding == 0) { 6610 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6611 bdev_abort_retry(parent_io); 6612 } else { 6613 bdev_io_complete(parent_io); 6614 } 6615 } 6616 } 6617 6618 static int 6619 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6620 struct spdk_bdev_io *bio_to_abort, 6621 spdk_bdev_io_completion_cb cb, void *cb_arg) 6622 { 6623 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6624 struct spdk_bdev_io *bdev_io; 6625 6626 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6627 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6628 /* TODO: Abort reset or abort request. */ 6629 return -ENOTSUP; 6630 } 6631 6632 bdev_io = bdev_channel_get_io(channel); 6633 if (bdev_io == NULL) { 6634 return -ENOMEM; 6635 } 6636 6637 bdev_io->internal.ch = channel; 6638 bdev_io->internal.desc = desc; 6639 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6640 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6641 6642 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6643 assert(bdev_io_should_split(bio_to_abort)); 6644 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6645 6646 /* Parent abort request is not submitted directly, but to manage its 6647 * execution add it to the submitted list here. 6648 */ 6649 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6650 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6651 6652 bdev_abort(bdev_io); 6653 6654 return 0; 6655 } 6656 6657 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6658 6659 /* Submit the abort request to the underlying bdev module. */ 6660 bdev_io_submit(bdev_io); 6661 6662 return 0; 6663 } 6664 6665 static bool 6666 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6667 { 6668 struct spdk_bdev_io *iter; 6669 6670 TAILQ_FOREACH(iter, tailq, internal.link) { 6671 if (iter == bdev_io) { 6672 return true; 6673 } 6674 } 6675 6676 return false; 6677 } 6678 6679 static uint32_t 6680 _bdev_abort(struct spdk_bdev_io *parent_io) 6681 { 6682 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6683 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6684 void *bio_cb_arg; 6685 struct spdk_bdev_io *bio_to_abort; 6686 uint32_t matched_ios; 6687 int rc; 6688 6689 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6690 6691 /* matched_ios is returned and will be kept by the caller. 6692 * 6693 * This function will be used for two cases, 1) the same cb_arg is used for 6694 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6695 * Incrementing split_outstanding directly here may confuse readers especially 6696 * for the 1st case. 6697 * 6698 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6699 * works as expected. 6700 */ 6701 matched_ios = 0; 6702 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6703 6704 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6705 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6706 continue; 6707 } 6708 6709 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6710 /* Any I/O which was submitted after this abort command should be excluded. 
*/
6711 continue;
6712 }
6713
6714 /* We can't abort a request that's being pushed/pulled or executed by accel */
6715 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) ||
6716 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) {
6717 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6718 break;
6719 }
6720
6721 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
6722 if (rc != 0) {
6723 if (rc == -ENOMEM) {
6724 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
6725 } else {
6726 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6727 }
6728 break;
6729 }
6730 matched_ios++;
6731 }
6732
6733 return matched_ios;
6734 }
6735
6736 static void
6737 bdev_abort_retry(void *ctx)
6738 {
6739 struct spdk_bdev_io *parent_io = ctx;
6740 uint32_t matched_ios;
6741
6742 matched_ios = _bdev_abort(parent_io);
6743
6744 if (matched_ios == 0) {
6745 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
6746 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
6747 } else {
6748 /* For retry, the case that no target I/O was found is success
6749 * because it means target I/Os completed in the meantime.
6750 */
6751 bdev_io_complete(parent_io);
6752 }
6753 return;
6754 }
6755
6756 /* Use split_outstanding to manage the progress of aborting I/Os. */
6757 parent_io->u.bdev.split_outstanding = matched_ios;
6758 }
6759
6760 static void
6761 bdev_abort(struct spdk_bdev_io *parent_io)
6762 {
6763 uint32_t matched_ios;
6764
6765 matched_ios = _bdev_abort(parent_io);
6766
6767 if (matched_ios == 0) {
6768 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
6769 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
6770 } else {
6771 /* The case where no target I/O was found is a failure. */
6772 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6773 bdev_io_complete(parent_io);
6774 }
6775 return;
6776 }
6777
6778 /* Use split_outstanding to manage the progress of aborting I/Os. */
6779 parent_io->u.bdev.split_outstanding = matched_ios;
6780 }
6781
6782 int
6783 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6784 void *bio_cb_arg,
6785 spdk_bdev_io_completion_cb cb, void *cb_arg)
6786 {
6787 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6788 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6789 struct spdk_bdev_io *bdev_io;
6790
6791 if (bio_cb_arg == NULL) {
6792 return -EINVAL;
6793 }
6794
6795 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
6796 return -ENOTSUP;
6797 }
6798
6799 bdev_io = bdev_channel_get_io(channel);
6800 if (bdev_io == NULL) {
6801 return -ENOMEM;
6802 }
6803
6804 bdev_io->internal.ch = channel;
6805 bdev_io->internal.desc = desc;
6806 bdev_io->internal.submit_tsc = spdk_get_ticks();
6807 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
6808 bdev_io_init(bdev_io, bdev, cb_arg, cb);
6809
6810 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;
6811
6812 /* Parent abort request is not submitted directly, but to manage its execution,
6813 * add it to the submitted list here.
6814 */ 6815 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6816 6817 bdev_abort(bdev_io); 6818 6819 return 0; 6820 } 6821 6822 int 6823 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6824 struct spdk_bdev_io_wait_entry *entry) 6825 { 6826 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6827 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6828 6829 if (bdev != entry->bdev) { 6830 SPDK_ERRLOG("bdevs do not match\n"); 6831 return -EINVAL; 6832 } 6833 6834 if (mgmt_ch->per_thread_cache_count > 0) { 6835 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6836 return -EINVAL; 6837 } 6838 6839 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6840 return 0; 6841 } 6842 6843 static inline void 6844 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6845 { 6846 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6847 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6848 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6849 uint32_t blocklen = bdev_io->bdev->blocklen; 6850 6851 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6852 switch (bdev_io->type) { 6853 case SPDK_BDEV_IO_TYPE_READ: 6854 io_stat->bytes_read += num_blocks * blocklen; 6855 io_stat->num_read_ops++; 6856 io_stat->read_latency_ticks += tsc_diff; 6857 if (io_stat->max_read_latency_ticks < tsc_diff) { 6858 io_stat->max_read_latency_ticks = tsc_diff; 6859 } 6860 if (io_stat->min_read_latency_ticks > tsc_diff) { 6861 io_stat->min_read_latency_ticks = tsc_diff; 6862 } 6863 break; 6864 case SPDK_BDEV_IO_TYPE_WRITE: 6865 io_stat->bytes_written += num_blocks * blocklen; 6866 io_stat->num_write_ops++; 6867 io_stat->write_latency_ticks += tsc_diff; 6868 if (io_stat->max_write_latency_ticks < tsc_diff) { 6869 io_stat->max_write_latency_ticks = tsc_diff; 6870 } 6871 if (io_stat->min_write_latency_ticks > tsc_diff) { 6872 io_stat->min_write_latency_ticks = tsc_diff; 6873 } 6874 break; 6875 case SPDK_BDEV_IO_TYPE_UNMAP: 6876 io_stat->bytes_unmapped += num_blocks * blocklen; 6877 io_stat->num_unmap_ops++; 6878 io_stat->unmap_latency_ticks += tsc_diff; 6879 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6880 io_stat->max_unmap_latency_ticks = tsc_diff; 6881 } 6882 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6883 io_stat->min_unmap_latency_ticks = tsc_diff; 6884 } 6885 break; 6886 case SPDK_BDEV_IO_TYPE_ZCOPY: 6887 /* Track the data in the start phase only */ 6888 if (bdev_io->u.bdev.zcopy.start) { 6889 if (bdev_io->u.bdev.zcopy.populate) { 6890 io_stat->bytes_read += num_blocks * blocklen; 6891 io_stat->num_read_ops++; 6892 io_stat->read_latency_ticks += tsc_diff; 6893 if (io_stat->max_read_latency_ticks < tsc_diff) { 6894 io_stat->max_read_latency_ticks = tsc_diff; 6895 } 6896 if (io_stat->min_read_latency_ticks > tsc_diff) { 6897 io_stat->min_read_latency_ticks = tsc_diff; 6898 } 6899 } else { 6900 io_stat->bytes_written += num_blocks * blocklen; 6901 io_stat->num_write_ops++; 6902 io_stat->write_latency_ticks += tsc_diff; 6903 if (io_stat->max_write_latency_ticks < tsc_diff) { 6904 io_stat->max_write_latency_ticks = tsc_diff; 6905 } 6906 if (io_stat->min_write_latency_ticks > tsc_diff) { 6907 io_stat->min_write_latency_ticks = tsc_diff; 6908 } 6909 } 6910 } 6911 break; 6912 case SPDK_BDEV_IO_TYPE_COPY: 6913 io_stat->bytes_copied += num_blocks * blocklen; 6914 io_stat->num_copy_ops++; 6915 bdev_io->internal.ch->stat->copy_latency_ticks += 
tsc_diff; 6916 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6917 io_stat->max_copy_latency_ticks = tsc_diff; 6918 } 6919 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6920 io_stat->min_copy_latency_ticks = tsc_diff; 6921 } 6922 break; 6923 default: 6924 break; 6925 } 6926 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6927 io_stat = bdev_io->bdev->internal.stat; 6928 assert(io_stat->io_error != NULL); 6929 6930 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6931 io_stat->io_error->error_status[-io_status - 1]++; 6932 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6933 } 6934 6935 #ifdef SPDK_CONFIG_VTUNE 6936 uint64_t now_tsc = spdk_get_ticks(); 6937 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6938 uint64_t data[5]; 6939 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6940 6941 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6942 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6943 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6944 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6945 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6946 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6947 6948 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6949 __itt_metadata_u64, 5, data); 6950 6951 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6952 bdev_io->internal.ch->start_tsc = now_tsc; 6953 } 6954 #endif 6955 } 6956 6957 static inline void 6958 _bdev_io_complete(void *ctx) 6959 { 6960 struct spdk_bdev_io *bdev_io = ctx; 6961 6962 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 6963 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 6964 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 6965 } 6966 6967 assert(bdev_io->internal.cb != NULL); 6968 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6969 6970 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6971 bdev_io->internal.caller_ctx); 6972 } 6973 6974 static inline void 6975 bdev_io_complete(void *ctx) 6976 { 6977 struct spdk_bdev_io *bdev_io = ctx; 6978 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6979 uint64_t tsc, tsc_diff; 6980 6981 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6982 /* 6983 * Defer completion to avoid potential infinite recursion if the 6984 * user's completion callback issues a new I/O. 6985 */ 6986 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6987 bdev_io_complete, bdev_io); 6988 return; 6989 } 6990 6991 tsc = spdk_get_ticks(); 6992 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6993 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6994 bdev_io->internal.caller_ctx); 6995 6996 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 6997 6998 if (bdev_io->internal.ch->histogram) { 6999 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 7000 } 7001 7002 bdev_io_update_io_stat(bdev_io, tsc_diff); 7003 _bdev_io_complete(bdev_io); 7004 } 7005 7006 /* The difference between this function and bdev_io_complete() is that this should be called to 7007 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7008 * io_submitted list and don't have submit_tsc updated. 
7009 */ 7010 static inline void 7011 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7012 { 7013 /* Since the IO hasn't been submitted it's bound to be failed */ 7014 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7015 7016 /* At this point we don't know if the IO is completed from submission context or not, but, 7017 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7018 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7019 _bdev_io_complete, bdev_io); 7020 } 7021 7022 static void bdev_destroy_cb(void *io_device); 7023 7024 static void 7025 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7026 { 7027 struct spdk_bdev_io *bdev_io = _ctx; 7028 7029 if (bdev_io->u.reset.ch_ref != NULL) { 7030 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7031 bdev_io->u.reset.ch_ref = NULL; 7032 } 7033 7034 bdev_io_complete(bdev_io); 7035 7036 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7037 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7038 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7039 } 7040 } 7041 7042 static void 7043 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7044 struct spdk_io_channel *_ch, void *_ctx) 7045 { 7046 struct spdk_bdev_io *bdev_io = _ctx; 7047 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7048 struct spdk_bdev_io *queued_reset; 7049 7050 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7051 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7052 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7053 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7054 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7055 } 7056 7057 spdk_bdev_for_each_channel_continue(i, 0); 7058 } 7059 7060 static void 7061 bdev_io_complete_sequence_cb(void *ctx, int status) 7062 { 7063 struct spdk_bdev_io *bdev_io = ctx; 7064 7065 /* u.bdev.accel_sequence should have already been cleared at this point */ 7066 assert(bdev_io->u.bdev.accel_sequence == NULL); 7067 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7068 bdev_io->internal.accel_sequence = NULL; 7069 7070 if (spdk_unlikely(status != 0)) { 7071 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7072 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7073 } 7074 7075 bdev_io_complete(bdev_io); 7076 } 7077 7078 void 7079 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7080 { 7081 struct spdk_bdev *bdev = bdev_io->bdev; 7082 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7083 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7084 7085 if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) { 7086 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7087 spdk_bdev_get_module_name(bdev), 7088 bdev_io_status_get_string(bdev_io->internal.status)); 7089 assert(false); 7090 } 7091 bdev_io->internal.status = status; 7092 7093 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7094 bool unlock_channels = false; 7095 7096 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7097 SPDK_ERRLOG("NOMEM returned for reset\n"); 7098 } 7099 spdk_spin_lock(&bdev->internal.spinlock); 7100 if (bdev_io == bdev->internal.reset_in_progress) { 7101 bdev->internal.reset_in_progress = NULL; 7102 unlock_channels = true; 7103 } 7104 spdk_spin_unlock(&bdev->internal.spinlock); 7105 7106 if (unlock_channels) { 7107 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 
7108 bdev_reset_complete); 7109 return; 7110 } 7111 } else { 7112 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7113 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7114 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7115 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7116 return; 7117 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0 && 7118 !bdev_io_use_accel_sequence(bdev_io))) { 7119 _bdev_io_push_bounce_data_buffer(bdev_io, 7120 _bdev_io_complete_push_bounce_done); 7121 /* bdev IO will be completed in the callback */ 7122 return; 7123 } 7124 } 7125 7126 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7127 return; 7128 } 7129 } 7130 7131 bdev_io_complete(bdev_io); 7132 } 7133 7134 void 7135 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7136 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7137 { 7138 enum spdk_bdev_io_status status; 7139 7140 if (sc == SPDK_SCSI_STATUS_GOOD) { 7141 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7142 } else { 7143 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7144 bdev_io->internal.error.scsi.sc = sc; 7145 bdev_io->internal.error.scsi.sk = sk; 7146 bdev_io->internal.error.scsi.asc = asc; 7147 bdev_io->internal.error.scsi.ascq = ascq; 7148 } 7149 7150 spdk_bdev_io_complete(bdev_io, status); 7151 } 7152 7153 void 7154 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7155 int *sc, int *sk, int *asc, int *ascq) 7156 { 7157 assert(sc != NULL); 7158 assert(sk != NULL); 7159 assert(asc != NULL); 7160 assert(ascq != NULL); 7161 7162 switch (bdev_io->internal.status) { 7163 case SPDK_BDEV_IO_STATUS_SUCCESS: 7164 *sc = SPDK_SCSI_STATUS_GOOD; 7165 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7166 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7167 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7168 break; 7169 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7170 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7171 break; 7172 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7173 *sc = bdev_io->internal.error.scsi.sc; 7174 *sk = bdev_io->internal.error.scsi.sk; 7175 *asc = bdev_io->internal.error.scsi.asc; 7176 *ascq = bdev_io->internal.error.scsi.ascq; 7177 break; 7178 default: 7179 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7180 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7181 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7182 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7183 break; 7184 } 7185 } 7186 7187 void 7188 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7189 { 7190 enum spdk_bdev_io_status status; 7191 7192 if (aio_result == 0) { 7193 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7194 } else { 7195 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7196 } 7197 7198 bdev_io->internal.error.aio_result = aio_result; 7199 7200 spdk_bdev_io_complete(bdev_io, status); 7201 } 7202 7203 void 7204 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7205 { 7206 assert(aio_result != NULL); 7207 7208 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7209 *aio_result = bdev_io->internal.error.aio_result; 7210 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7211 *aio_result = 0; 7212 } else { 7213 *aio_result = -EIO; 7214 } 7215 } 7216 7217 void 7218 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7219 { 7220 enum spdk_bdev_io_status status; 7221 7222 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 7223 status = 
SPDK_BDEV_IO_STATUS_SUCCESS; 7224 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7225 status = SPDK_BDEV_IO_STATUS_ABORTED; 7226 } else { 7227 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7228 } 7229 7230 bdev_io->internal.error.nvme.cdw0 = cdw0; 7231 bdev_io->internal.error.nvme.sct = sct; 7232 bdev_io->internal.error.nvme.sc = sc; 7233 7234 spdk_bdev_io_complete(bdev_io, status); 7235 } 7236 7237 void 7238 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7239 { 7240 assert(sct != NULL); 7241 assert(sc != NULL); 7242 assert(cdw0 != NULL); 7243 7244 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7245 *sct = SPDK_NVME_SCT_GENERIC; 7246 *sc = SPDK_NVME_SC_SUCCESS; 7247 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7248 *cdw0 = 0; 7249 } else { 7250 *cdw0 = 1U; 7251 } 7252 return; 7253 } 7254 7255 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7256 *sct = bdev_io->internal.error.nvme.sct; 7257 *sc = bdev_io->internal.error.nvme.sc; 7258 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7259 *sct = SPDK_NVME_SCT_GENERIC; 7260 *sc = SPDK_NVME_SC_SUCCESS; 7261 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7262 *sct = SPDK_NVME_SCT_GENERIC; 7263 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7264 } else { 7265 *sct = SPDK_NVME_SCT_GENERIC; 7266 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7267 } 7268 7269 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7270 } 7271 7272 void 7273 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7274 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7275 { 7276 assert(first_sct != NULL); 7277 assert(first_sc != NULL); 7278 assert(second_sct != NULL); 7279 assert(second_sc != NULL); 7280 assert(cdw0 != NULL); 7281 7282 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7283 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7284 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7285 *first_sct = bdev_io->internal.error.nvme.sct; 7286 *first_sc = bdev_io->internal.error.nvme.sc; 7287 *second_sct = SPDK_NVME_SCT_GENERIC; 7288 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7289 } else { 7290 *first_sct = SPDK_NVME_SCT_GENERIC; 7291 *first_sc = SPDK_NVME_SC_SUCCESS; 7292 *second_sct = bdev_io->internal.error.nvme.sct; 7293 *second_sc = bdev_io->internal.error.nvme.sc; 7294 } 7295 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7296 *first_sct = SPDK_NVME_SCT_GENERIC; 7297 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7298 *second_sct = SPDK_NVME_SCT_GENERIC; 7299 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7300 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7301 *first_sct = SPDK_NVME_SCT_GENERIC; 7302 *first_sc = SPDK_NVME_SC_SUCCESS; 7303 *second_sct = SPDK_NVME_SCT_GENERIC; 7304 *second_sc = SPDK_NVME_SC_SUCCESS; 7305 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7306 *first_sct = SPDK_NVME_SCT_GENERIC; 7307 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7308 *second_sct = SPDK_NVME_SCT_GENERIC; 7309 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7310 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7311 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7312 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7313 *second_sct = SPDK_NVME_SCT_GENERIC; 7314 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7315 } else { 
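/* Editor's note: sketch of how a completion callback might consume these
 * status getters; illustrative only, `io_done` is a hypothetical placeholder.
 *
 *   static void
 *   io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *   {
 *           uint32_t cdw0;
 *           int sct, sc;
 *
 *           // Valid for any outcome; a successful I/O reports
 *           // SPDK_NVME_SCT_GENERIC / SPDK_NVME_SC_SUCCESS.
 *           spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc);
 *
 *           // Map (sct, sc) onto the transport's completion status here,
 *           // then release the I/O.
 *           spdk_bdev_free_io(bdev_io);
 *   }
 */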
7316 *first_sct = SPDK_NVME_SCT_GENERIC; 7317 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7318 *second_sct = SPDK_NVME_SCT_GENERIC; 7319 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7320 } 7321 7322 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7323 } 7324 7325 struct spdk_thread * 7326 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7327 { 7328 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7329 } 7330 7331 struct spdk_io_channel * 7332 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7333 { 7334 return bdev_io->internal.ch->channel; 7335 } 7336 7337 static int 7338 bdev_register(struct spdk_bdev *bdev) 7339 { 7340 char *bdev_name; 7341 char uuid[SPDK_UUID_STRING_LEN]; 7342 struct spdk_iobuf_opts iobuf_opts; 7343 int ret, i; 7344 7345 assert(bdev->module != NULL); 7346 7347 if (!bdev->name) { 7348 SPDK_ERRLOG("Bdev name is NULL\n"); 7349 return -EINVAL; 7350 } 7351 7352 if (!strlen(bdev->name)) { 7353 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7354 return -EINVAL; 7355 } 7356 7357 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7358 if (bdev->fn_table->accel_sequence_supported == NULL) { 7359 continue; 7360 } 7361 if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7362 (enum spdk_bdev_io_type)i)) { 7363 continue; 7364 } 7365 7366 if (spdk_bdev_is_md_separate(bdev)) { 7367 SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with " 7368 "accel sequence support\n"); 7369 return -EINVAL; 7370 } 7371 } 7372 7373 /* Users often register their own I/O devices using the bdev name. In 7374 * order to avoid conflicts, prepend bdev_. */ 7375 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7376 if (!bdev_name) { 7377 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7378 return -ENOMEM; 7379 } 7380 7381 bdev->internal.stat = bdev_alloc_io_stat(true); 7382 if (!bdev->internal.stat) { 7383 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7384 free(bdev_name); 7385 return -ENOMEM; 7386 } 7387 7388 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7389 bdev->internal.measured_queue_depth = UINT64_MAX; 7390 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7391 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7392 bdev->internal.qd_poller = NULL; 7393 bdev->internal.qos = NULL; 7394 7395 TAILQ_INIT(&bdev->internal.open_descs); 7396 TAILQ_INIT(&bdev->internal.locked_ranges); 7397 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7398 TAILQ_INIT(&bdev->aliases); 7399 7400 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7401 if (ret != 0) { 7402 bdev_free_io_stat(bdev->internal.stat); 7403 free(bdev_name); 7404 return ret; 7405 } 7406 7407 /* UUID may be specified by the user or defined by bdev itself. 7408 * Otherwise it will be generated here, so this field will never be empty. 
*/ 7409 if (spdk_uuid_is_null(&bdev->uuid)) { 7410 spdk_uuid_generate(&bdev->uuid); 7411 } 7412 7413 /* Add the UUID alias only if it's different than the name */ 7414 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7415 if (strcmp(bdev->name, uuid) != 0) { 7416 ret = spdk_bdev_alias_add(bdev, uuid); 7417 if (ret != 0) { 7418 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7419 bdev_name_del(&bdev->internal.bdev_name); 7420 bdev_free_io_stat(bdev->internal.stat); 7421 free(bdev_name); 7422 return ret; 7423 } 7424 } 7425 7426 if (spdk_bdev_get_buf_align(bdev) > 1) { 7427 if (bdev->split_on_optimal_io_boundary) { 7428 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 7429 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 7430 } else { 7431 bdev->split_on_optimal_io_boundary = true; 7432 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 7433 } 7434 } 7435 7436 /* If the user didn't specify a write unit size, set it to one. */ 7437 if (bdev->write_unit_size == 0) { 7438 bdev->write_unit_size = 1; 7439 } 7440 7441 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7442 if (bdev->acwu == 0) { 7443 bdev->acwu = bdev->write_unit_size; 7444 } 7445 7446 if (bdev->phys_blocklen == 0) { 7447 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7448 } 7449 7450 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7451 spdk_iobuf_get_opts(&iobuf_opts); 7452 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7453 } 7454 7455 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7456 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7457 } 7458 7459 bdev->internal.reset_in_progress = NULL; 7460 bdev->internal.qd_poll_in_progress = false; 7461 bdev->internal.period = 0; 7462 bdev->internal.new_period = 0; 7463 7464 spdk_io_device_register(__bdev_to_io_dev(bdev), 7465 bdev_channel_create, bdev_channel_destroy, 7466 sizeof(struct spdk_bdev_channel), 7467 bdev_name); 7468 7469 free(bdev_name); 7470 7471 spdk_spin_init(&bdev->internal.spinlock); 7472 7473 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7474 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7475 7476 return 0; 7477 } 7478 7479 static void 7480 bdev_destroy_cb(void *io_device) 7481 { 7482 int rc; 7483 struct spdk_bdev *bdev; 7484 spdk_bdev_unregister_cb cb_fn; 7485 void *cb_arg; 7486 7487 bdev = __bdev_from_io_dev(io_device); 7488 7489 if (bdev->internal.unregister_td != spdk_get_thread()) { 7490 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7491 return; 7492 } 7493 7494 cb_fn = bdev->internal.unregister_cb; 7495 cb_arg = bdev->internal.unregister_ctx; 7496 7497 spdk_spin_destroy(&bdev->internal.spinlock); 7498 free(bdev->internal.qos); 7499 bdev_free_io_stat(bdev->internal.stat); 7500 7501 rc = bdev->fn_table->destruct(bdev->ctxt); 7502 if (rc < 0) { 7503 SPDK_ERRLOG("destruct failed\n"); 7504 } 7505 if (rc <= 0 && cb_fn != NULL) { 7506 cb_fn(cb_arg, rc); 7507 } 7508 } 7509 7510 void 7511 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7512 { 7513 if (bdev->internal.unregister_cb != NULL) { 7514 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7515 } 7516 } 7517 7518 static void 7519 _remove_notify(void *arg) 7520 { 7521 struct spdk_bdev_desc *desc = arg; 7522 7523 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7524 } 7525 7526 /* returns: 0 - bdev removed and 
ready to be destructed. 7527 * -EBUSY - bdev can't be destructed yet. */ 7528 static int 7529 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7530 { 7531 struct spdk_bdev_desc *desc, *tmp; 7532 int rc = 0; 7533 char uuid[SPDK_UUID_STRING_LEN]; 7534 7535 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7536 assert(spdk_spin_held(&bdev->internal.spinlock)); 7537 7538 /* Notify each descriptor about hotremoval */ 7539 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7540 rc = -EBUSY; 7541 /* 7542 * Defer invocation of the event_cb to a separate message that will 7543 * run later on its thread. This ensures this context unwinds and 7544 * we don't recursively unregister this bdev again if the event_cb 7545 * immediately closes its descriptor. 7546 */ 7547 event_notify(desc, _remove_notify); 7548 } 7549 7550 /* If there are no descriptors, proceed removing the bdev */ 7551 if (rc == 0) { 7552 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7553 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7554 7555 /* Delete the name and the UUID alias */ 7556 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7557 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7558 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7559 7560 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7561 7562 if (bdev->internal.reset_in_progress != NULL) { 7563 /* If reset is in progress, let the completion callback for reset 7564 * unregister the bdev. 7565 */ 7566 rc = -EBUSY; 7567 } 7568 } 7569 7570 return rc; 7571 } 7572 7573 static void 7574 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7575 struct spdk_io_channel *io_ch, void *_ctx) 7576 { 7577 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7578 7579 bdev_channel_abort_queued_ios(bdev_ch); 7580 spdk_bdev_for_each_channel_continue(i, 0); 7581 } 7582 7583 static void 7584 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7585 { 7586 int rc; 7587 7588 spdk_spin_lock(&g_bdev_mgr.spinlock); 7589 spdk_spin_lock(&bdev->internal.spinlock); 7590 /* 7591 * Set the status to REMOVING after completing to abort channels. Otherwise, 7592 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7593 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7594 * may fail. 7595 */ 7596 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7597 rc = bdev_unregister_unsafe(bdev); 7598 spdk_spin_unlock(&bdev->internal.spinlock); 7599 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7600 7601 if (rc == 0) { 7602 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7603 } 7604 } 7605 7606 void 7607 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7608 { 7609 struct spdk_thread *thread; 7610 7611 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7612 7613 thread = spdk_get_thread(); 7614 if (!thread) { 7615 /* The user called this from a non-SPDK thread. 
*/ 7616 if (cb_fn != NULL) { 7617 cb_fn(cb_arg, -ENOTSUP); 7618 } 7619 return; 7620 } 7621 7622 spdk_spin_lock(&g_bdev_mgr.spinlock); 7623 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7624 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7625 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7626 if (cb_fn) { 7627 cb_fn(cb_arg, -EBUSY); 7628 } 7629 return; 7630 } 7631 7632 spdk_spin_lock(&bdev->internal.spinlock); 7633 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7634 bdev->internal.unregister_cb = cb_fn; 7635 bdev->internal.unregister_ctx = cb_arg; 7636 bdev->internal.unregister_td = thread; 7637 spdk_spin_unlock(&bdev->internal.spinlock); 7638 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7639 7640 spdk_bdev_set_qd_sampling_period(bdev, 0); 7641 7642 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7643 bdev_unregister); 7644 } 7645 7646 int 7647 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7648 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7649 { 7650 struct spdk_bdev_desc *desc; 7651 struct spdk_bdev *bdev; 7652 int rc; 7653 7654 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7655 if (rc != 0) { 7656 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7657 return rc; 7658 } 7659 7660 bdev = spdk_bdev_desc_get_bdev(desc); 7661 7662 if (bdev->module != module) { 7663 spdk_bdev_close(desc); 7664 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7665 bdev_name); 7666 return -ENODEV; 7667 } 7668 7669 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7670 7671 spdk_bdev_close(desc); 7672 7673 return 0; 7674 } 7675 7676 static int 7677 bdev_start_qos(struct spdk_bdev *bdev) 7678 { 7679 struct set_qos_limit_ctx *ctx; 7680 7681 /* Enable QoS */ 7682 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7683 ctx = calloc(1, sizeof(*ctx)); 7684 if (ctx == NULL) { 7685 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7686 return -ENOMEM; 7687 } 7688 ctx->bdev = bdev; 7689 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7690 } 7691 7692 return 0; 7693 } 7694 7695 static void 7696 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7697 struct spdk_bdev *bdev) 7698 { 7699 enum spdk_bdev_claim_type type; 7700 const char *typename, *modname; 7701 extern struct spdk_log_flag SPDK_LOG_bdev; 7702 7703 assert(spdk_spin_held(&bdev->internal.spinlock)); 7704 7705 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7706 return; 7707 } 7708 7709 type = bdev->internal.claim_type; 7710 typename = spdk_bdev_claim_get_name(type); 7711 7712 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7713 modname = bdev->internal.claim.v1.module->name; 7714 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7715 bdev->name, detail, typename, modname); 7716 return; 7717 } 7718 7719 if (claim_type_is_v2(type)) { 7720 struct spdk_bdev_module_claim *claim; 7721 7722 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7723 modname = claim->module->name; 7724 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7725 bdev->name, detail, typename, modname); 7726 } 7727 return; 7728 } 7729 7730 assert(false); 7731 } 7732 7733 static int 7734 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7735 { 7736 struct spdk_thread *thread; 7737 int rc = 0; 7738 7739 thread = spdk_get_thread(); 7740 if (!thread) { 7741 
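		/* A bdev descriptor is bound to the SPDK thread that opens it (see
		 * desc->thread below), so an open attempted from a non-SPDK thread
		 * cannot be honored. */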
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7742 return -ENOTSUP; 7743 } 7744 7745 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7746 spdk_get_thread()); 7747 7748 desc->bdev = bdev; 7749 desc->thread = thread; 7750 desc->write = write; 7751 7752 spdk_spin_lock(&bdev->internal.spinlock); 7753 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7754 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7755 spdk_spin_unlock(&bdev->internal.spinlock); 7756 return -ENODEV; 7757 } 7758 7759 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7760 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7761 spdk_spin_unlock(&bdev->internal.spinlock); 7762 return -EPERM; 7763 } 7764 7765 rc = bdev_start_qos(bdev); 7766 if (rc != 0) { 7767 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7768 spdk_spin_unlock(&bdev->internal.spinlock); 7769 return rc; 7770 } 7771 7772 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7773 7774 spdk_spin_unlock(&bdev->internal.spinlock); 7775 7776 return 0; 7777 } 7778 7779 static int 7780 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7781 struct spdk_bdev_desc **_desc) 7782 { 7783 struct spdk_bdev_desc *desc; 7784 unsigned int i; 7785 7786 desc = calloc(1, sizeof(*desc)); 7787 if (desc == NULL) { 7788 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7789 return -ENOMEM; 7790 } 7791 7792 TAILQ_INIT(&desc->pending_media_events); 7793 TAILQ_INIT(&desc->free_media_events); 7794 7795 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7796 desc->callback.event_fn = event_cb; 7797 desc->callback.ctx = event_ctx; 7798 spdk_spin_init(&desc->spinlock); 7799 7800 if (bdev->media_events) { 7801 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7802 sizeof(*desc->media_events_buffer)); 7803 if (desc->media_events_buffer == NULL) { 7804 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7805 bdev_desc_free(desc); 7806 return -ENOMEM; 7807 } 7808 7809 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 7810 TAILQ_INSERT_TAIL(&desc->free_media_events, 7811 &desc->media_events_buffer[i], tailq); 7812 } 7813 } 7814 7815 if (bdev->fn_table->accel_sequence_supported != NULL) { 7816 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7817 desc->accel_sequence_supported[i] = 7818 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7819 (enum spdk_bdev_io_type)i); 7820 } 7821 } 7822 7823 *_desc = desc; 7824 7825 return 0; 7826 } 7827 7828 static int 7829 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7830 void *event_ctx, struct spdk_bdev_desc **_desc) 7831 { 7832 struct spdk_bdev_desc *desc; 7833 struct spdk_bdev *bdev; 7834 int rc; 7835 7836 bdev = bdev_get_by_name(bdev_name); 7837 7838 if (bdev == NULL) { 7839 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7840 return -ENODEV; 7841 } 7842 7843 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7844 if (rc != 0) { 7845 return rc; 7846 } 7847 7848 rc = bdev_open(bdev, write, desc); 7849 if (rc != 0) { 7850 bdev_desc_free(desc); 7851 desc = NULL; 7852 } 7853 7854 *_desc = desc; 7855 7856 return rc; 7857 } 7858 7859 int 7860 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7861 void *event_ctx, struct spdk_bdev_desc **_desc) 7862 { 7863 int rc; 7864 7865 if (event_cb == NULL) { 7866 SPDK_ERRLOG("Missing event callback function\n"); 7867 return 
-EINVAL; 7868 } 7869 7870 spdk_spin_lock(&g_bdev_mgr.spinlock); 7871 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 7872 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7873 7874 return rc; 7875 } 7876 7877 struct spdk_bdev_open_async_ctx { 7878 char *bdev_name; 7879 spdk_bdev_event_cb_t event_cb; 7880 void *event_ctx; 7881 bool write; 7882 int rc; 7883 spdk_bdev_open_async_cb_t cb_fn; 7884 void *cb_arg; 7885 struct spdk_bdev_desc *desc; 7886 struct spdk_bdev_open_async_opts opts; 7887 uint64_t start_ticks; 7888 struct spdk_thread *orig_thread; 7889 struct spdk_poller *poller; 7890 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 7891 }; 7892 7893 static void 7894 bdev_open_async_done(void *arg) 7895 { 7896 struct spdk_bdev_open_async_ctx *ctx = arg; 7897 7898 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 7899 7900 free(ctx->bdev_name); 7901 free(ctx); 7902 } 7903 7904 static void 7905 bdev_open_async_cancel(void *arg) 7906 { 7907 struct spdk_bdev_open_async_ctx *ctx = arg; 7908 7909 assert(ctx->rc == -ESHUTDOWN); 7910 7911 spdk_poller_unregister(&ctx->poller); 7912 7913 bdev_open_async_done(ctx); 7914 } 7915 7916 /* This is called when the bdev library finishes at shutdown. */ 7917 static void 7918 bdev_open_async_fini(void) 7919 { 7920 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 7921 7922 spdk_spin_lock(&g_bdev_mgr.spinlock); 7923 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 7924 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 7925 /* 7926 * We have to move to ctx->orig_thread to unregister ctx->poller. 7927 * However, there is a chance that ctx->poller is executed before 7928 * message is executed, which could result in bdev_open_async_done() 7929 * being called twice. To avoid such race condition, set ctx->rc to 7930 * -ESHUTDOWN. 7931 */ 7932 ctx->rc = -ESHUTDOWN; 7933 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 7934 } 7935 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7936 } 7937 7938 static int bdev_open_async(void *arg); 7939 7940 static void 7941 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 7942 { 7943 uint64_t timeout_ticks; 7944 7945 if (ctx->rc == -ESHUTDOWN) { 7946 /* This context is being canceled. Do nothing. */ 7947 return; 7948 } 7949 7950 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 7951 &ctx->desc); 7952 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 7953 goto exit; 7954 } 7955 7956 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 7957 if (spdk_get_ticks() >= timeout_ticks) { 7958 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 7959 ctx->rc = -ETIMEDOUT; 7960 goto exit; 7961 } 7962 7963 return; 7964 7965 exit: 7966 spdk_poller_unregister(&ctx->poller); 7967 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 7968 7969 /* Completion callback is processed after stack unwinding. 
*/ 7970 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 7971 } 7972 7973 static int 7974 bdev_open_async(void *arg) 7975 { 7976 struct spdk_bdev_open_async_ctx *ctx = arg; 7977 7978 spdk_spin_lock(&g_bdev_mgr.spinlock); 7979 7980 _bdev_open_async(ctx); 7981 7982 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7983 7984 return SPDK_POLLER_BUSY; 7985 } 7986 7987 static void 7988 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 7989 struct spdk_bdev_open_async_opts *opts_src, 7990 size_t size) 7991 { 7992 assert(opts); 7993 assert(opts_src); 7994 7995 opts->size = size; 7996 7997 #define SET_FIELD(field) \ 7998 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 7999 opts->field = opts_src->field; \ 8000 } \ 8001 8002 SET_FIELD(timeout_ms); 8003 8004 /* Do not remove this statement, you should always update this statement when you adding a new field, 8005 * and do not forget to add the SET_FIELD statement for your added field. */ 8006 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8007 8008 #undef SET_FIELD 8009 } 8010 8011 static void 8012 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8013 { 8014 assert(opts); 8015 8016 opts->size = size; 8017 8018 #define SET_FIELD(field, value) \ 8019 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8020 opts->field = value; \ 8021 } \ 8022 8023 SET_FIELD(timeout_ms, 0); 8024 8025 #undef SET_FIELD 8026 } 8027 8028 int 8029 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8030 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8031 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8032 { 8033 struct spdk_bdev_open_async_ctx *ctx; 8034 8035 if (event_cb == NULL) { 8036 SPDK_ERRLOG("Missing event callback function\n"); 8037 return -EINVAL; 8038 } 8039 8040 if (open_cb == NULL) { 8041 SPDK_ERRLOG("Missing open callback function\n"); 8042 return -EINVAL; 8043 } 8044 8045 if (opts != NULL && opts->size == 0) { 8046 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8047 return -EINVAL; 8048 } 8049 8050 ctx = calloc(1, sizeof(*ctx)); 8051 if (ctx == NULL) { 8052 SPDK_ERRLOG("Failed to allocate open context\n"); 8053 return -ENOMEM; 8054 } 8055 8056 ctx->bdev_name = strdup(bdev_name); 8057 if (ctx->bdev_name == NULL) { 8058 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8059 free(ctx); 8060 return -ENOMEM; 8061 } 8062 8063 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8064 if (ctx->poller == NULL) { 8065 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8066 free(ctx->bdev_name); 8067 free(ctx); 8068 return -ENOMEM; 8069 } 8070 8071 ctx->cb_fn = open_cb; 8072 ctx->cb_arg = open_cb_arg; 8073 ctx->write = write; 8074 ctx->event_cb = event_cb; 8075 ctx->event_ctx = event_ctx; 8076 ctx->orig_thread = spdk_get_thread(); 8077 ctx->start_ticks = spdk_get_ticks(); 8078 8079 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8080 if (opts != NULL) { 8081 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8082 } 8083 8084 spdk_spin_lock(&g_bdev_mgr.spinlock); 8085 8086 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8087 _bdev_open_async(ctx); 8088 8089 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8090 8091 return 0; 8092 } 8093 8094 static void 8095 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8096 { 8097 int rc; 8098 8099 
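	/* Detach the descriptor from the bdev: release any v2 claims it holds,
	 * stop QoS when the last descriptor is closed, and finish a deferred
	 * unregister if one is pending. */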
spdk_spin_lock(&bdev->internal.spinlock); 8100 spdk_spin_lock(&desc->spinlock); 8101 8102 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8103 8104 desc->closed = true; 8105 8106 if (desc->claim != NULL) { 8107 bdev_desc_release_claims(desc); 8108 } 8109 8110 if (0 == desc->refs) { 8111 spdk_spin_unlock(&desc->spinlock); 8112 bdev_desc_free(desc); 8113 } else { 8114 spdk_spin_unlock(&desc->spinlock); 8115 } 8116 8117 /* If no more descriptors, kill QoS channel */ 8118 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8119 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8120 bdev->name, spdk_get_thread()); 8121 8122 if (bdev_qos_destroy(bdev)) { 8123 /* There isn't anything we can do to recover here. Just let the 8124 * old QoS poller keep running. The QoS handling won't change 8125 * cores when the user allocates a new channel, but it won't break. */ 8126 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8127 } 8128 } 8129 8130 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8131 rc = bdev_unregister_unsafe(bdev); 8132 spdk_spin_unlock(&bdev->internal.spinlock); 8133 8134 if (rc == 0) { 8135 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8136 } 8137 } else { 8138 spdk_spin_unlock(&bdev->internal.spinlock); 8139 } 8140 } 8141 8142 void 8143 spdk_bdev_close(struct spdk_bdev_desc *desc) 8144 { 8145 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8146 8147 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8148 spdk_get_thread()); 8149 8150 assert(desc->thread == spdk_get_thread()); 8151 8152 spdk_poller_unregister(&desc->io_timeout_poller); 8153 8154 spdk_spin_lock(&g_bdev_mgr.spinlock); 8155 8156 bdev_close(bdev, desc); 8157 8158 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8159 } 8160 8161 static void 8162 bdev_register_finished(void *arg) 8163 { 8164 struct spdk_bdev_desc *desc = arg; 8165 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8166 8167 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8168 8169 spdk_spin_lock(&g_bdev_mgr.spinlock); 8170 8171 bdev_close(bdev, desc); 8172 8173 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8174 } 8175 8176 int 8177 spdk_bdev_register(struct spdk_bdev *bdev) 8178 { 8179 struct spdk_bdev_desc *desc; 8180 struct spdk_thread *thread = spdk_get_thread(); 8181 int rc; 8182 8183 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8184 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 8185 thread ? 
spdk_thread_get_name(thread) : "null"); 8186 return -EINVAL; 8187 } 8188 8189 rc = bdev_register(bdev); 8190 if (rc != 0) { 8191 return rc; 8192 } 8193 8194 /* A descriptor is opened to prevent bdev deletion during examination */ 8195 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8196 if (rc != 0) { 8197 spdk_bdev_unregister(bdev, NULL, NULL); 8198 return rc; 8199 } 8200 8201 rc = bdev_open(bdev, false, desc); 8202 if (rc != 0) { 8203 bdev_desc_free(desc); 8204 spdk_bdev_unregister(bdev, NULL, NULL); 8205 return rc; 8206 } 8207 8208 /* Examine configuration before initializing I/O */ 8209 bdev_examine(bdev); 8210 8211 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8212 if (rc != 0) { 8213 bdev_close(bdev, desc); 8214 spdk_bdev_unregister(bdev, NULL, NULL); 8215 } 8216 8217 return rc; 8218 } 8219 8220 int 8221 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8222 struct spdk_bdev_module *module) 8223 { 8224 spdk_spin_lock(&bdev->internal.spinlock); 8225 8226 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8227 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8228 spdk_spin_unlock(&bdev->internal.spinlock); 8229 return -EPERM; 8230 } 8231 8232 if (desc && !desc->write) { 8233 desc->write = true; 8234 } 8235 8236 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8237 bdev->internal.claim.v1.module = module; 8238 8239 spdk_spin_unlock(&bdev->internal.spinlock); 8240 return 0; 8241 } 8242 8243 void 8244 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8245 { 8246 spdk_spin_lock(&bdev->internal.spinlock); 8247 8248 assert(bdev->internal.claim.v1.module != NULL); 8249 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8250 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8251 bdev->internal.claim.v1.module = NULL; 8252 8253 spdk_spin_unlock(&bdev->internal.spinlock); 8254 } 8255 8256 /* 8257 * Start claims v2 8258 */ 8259 8260 const char * 8261 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8262 { 8263 switch (type) { 8264 case SPDK_BDEV_CLAIM_NONE: 8265 return "not_claimed"; 8266 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8267 return "exclusive_write"; 8268 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8269 return "read_many_write_one"; 8270 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8271 return "read_many_write_none"; 8272 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8273 return "read_many_write_many"; 8274 default: 8275 break; 8276 } 8277 return "invalid_claim"; 8278 } 8279 8280 static bool 8281 claim_type_is_v2(enum spdk_bdev_claim_type type) 8282 { 8283 switch (type) { 8284 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8285 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8286 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8287 return true; 8288 default: 8289 break; 8290 } 8291 return false; 8292 } 8293 8294 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
*/ 8295 static bool 8296 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8297 { 8298 switch (type) { 8299 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8300 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8301 return true; 8302 default: 8303 break; 8304 } 8305 return false; 8306 } 8307 8308 void 8309 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8310 { 8311 if (opts == NULL) { 8312 SPDK_ERRLOG("opts should not be NULL\n"); 8313 assert(opts != NULL); 8314 return; 8315 } 8316 if (size == 0) { 8317 SPDK_ERRLOG("size should not be zero\n"); 8318 assert(size != 0); 8319 return; 8320 } 8321 8322 memset(opts, 0, size); 8323 opts->opts_size = size; 8324 8325 #define FIELD_OK(field) \ 8326 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8327 8328 #define SET_FIELD(field, value) \ 8329 if (FIELD_OK(field)) { \ 8330 opts->field = value; \ 8331 } \ 8332 8333 SET_FIELD(shared_claim_key, 0); 8334 8335 #undef FIELD_OK 8336 #undef SET_FIELD 8337 } 8338 8339 static int 8340 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8341 { 8342 if (src->opts_size == 0) { 8343 SPDK_ERRLOG("size should not be zero\n"); 8344 return -1; 8345 } 8346 8347 memset(dst, 0, sizeof(*dst)); 8348 dst->opts_size = src->opts_size; 8349 8350 #define FIELD_OK(field) \ 8351 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8352 8353 #define SET_FIELD(field) \ 8354 if (FIELD_OK(field)) { \ 8355 dst->field = src->field; \ 8356 } \ 8357 8358 if (FIELD_OK(name)) { 8359 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8360 } 8361 8362 SET_FIELD(shared_claim_key); 8363 8364 /* You should not remove this statement, but need to update the assert statement 8365 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8366 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8367 8368 #undef FIELD_OK 8369 #undef SET_FIELD 8370 return 0; 8371 } 8372 8373 /* Returns 0 if a read-write-once claim can be taken. */ 8374 static int 8375 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8376 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8377 { 8378 struct spdk_bdev *bdev = desc->bdev; 8379 struct spdk_bdev_desc *open_desc; 8380 8381 assert(spdk_spin_held(&bdev->internal.spinlock)); 8382 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8383 8384 if (opts->shared_claim_key != 0) { 8385 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8386 bdev->name); 8387 return -EINVAL; 8388 } 8389 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8390 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8391 return -EPERM; 8392 } 8393 if (desc->claim != NULL) { 8394 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8395 bdev->name, desc->claim->module->name); 8396 return -EPERM; 8397 } 8398 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8399 if (desc != open_desc && open_desc->write) { 8400 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8401 "another descriptor is open for writing\n", 8402 bdev->name); 8403 return -EPERM; 8404 } 8405 } 8406 8407 return 0; 8408 } 8409 8410 /* Returns 0 if a read-only-many claim can be taken. 
 */
static int
claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
	assert(desc->claim == NULL);

	if (desc->write) {
		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
			    bdev->name);
		return -EINVAL;
	}
	if (opts->shared_claim_key != 0) {
		SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
		return -EINVAL;
	}
	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
					       "another descriptor is open for writing\n",
					       bdev->name);
				return -EPERM;
			}
		}
	}

	return 0;
}

/* Returns 0 if a read-write-many claim can be taken. */
static int
claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
	assert(desc->claim == NULL);

	if (opts->shared_claim_key == 0) {
		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
			    bdev->name);
		return -EINVAL;
	}
	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc == desc) {
				continue;
			}
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
					       "another descriptor is open for writing without a "
					       "claim\n", bdev->name);
				return -EPERM;
			}
		}
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
			return -EPERM;
		}
		break;
	default:
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		return -EBUSY;
	}

	return 0;
}

/* Updates desc and its bdev with a v2 claim.
*/ 8491 static int 8492 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8493 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8494 { 8495 struct spdk_bdev *bdev = desc->bdev; 8496 struct spdk_bdev_module_claim *claim; 8497 8498 assert(spdk_spin_held(&bdev->internal.spinlock)); 8499 assert(claim_type_is_v2(type)); 8500 assert(desc->claim == NULL); 8501 8502 claim = calloc(1, sizeof(*desc->claim)); 8503 if (claim == NULL) { 8504 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8505 return -ENOMEM; 8506 } 8507 claim->module = module; 8508 claim->desc = desc; 8509 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8510 memcpy(claim->name, opts->name, sizeof(claim->name)); 8511 desc->claim = claim; 8512 8513 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8514 bdev->internal.claim_type = type; 8515 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8516 bdev->internal.claim.v2.key = opts->shared_claim_key; 8517 } 8518 assert(type == bdev->internal.claim_type); 8519 8520 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8521 8522 if (!desc->write && claim_type_promotes_to_write(type)) { 8523 desc->write = true; 8524 } 8525 8526 return 0; 8527 } 8528 8529 int 8530 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8531 struct spdk_bdev_claim_opts *_opts, 8532 struct spdk_bdev_module *module) 8533 { 8534 struct spdk_bdev *bdev; 8535 struct spdk_bdev_claim_opts opts; 8536 int rc = 0; 8537 8538 if (desc == NULL) { 8539 SPDK_ERRLOG("descriptor must not be NULL\n"); 8540 return -EINVAL; 8541 } 8542 8543 bdev = desc->bdev; 8544 8545 if (_opts == NULL) { 8546 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8547 } else if (claim_opts_copy(_opts, &opts) != 0) { 8548 return -EINVAL; 8549 } 8550 8551 spdk_spin_lock(&bdev->internal.spinlock); 8552 8553 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8554 bdev->internal.claim_type != type) { 8555 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8556 spdk_spin_unlock(&bdev->internal.spinlock); 8557 return -EPERM; 8558 } 8559 8560 if (claim_type_is_v2(type) && desc->claim != NULL) { 8561 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8562 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8563 spdk_spin_unlock(&bdev->internal.spinlock); 8564 return -EPERM; 8565 } 8566 8567 switch (type) { 8568 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8569 spdk_spin_unlock(&bdev->internal.spinlock); 8570 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8571 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8572 rc = claim_verify_rwo(desc, type, &opts, module); 8573 break; 8574 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8575 rc = claim_verify_rom(desc, type, &opts, module); 8576 break; 8577 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8578 rc = claim_verify_rwm(desc, type, &opts, module); 8579 break; 8580 default: 8581 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8582 rc = -ENOTSUP; 8583 } 8584 8585 if (rc == 0) { 8586 rc = claim_bdev(desc, type, &opts, module); 8587 } 8588 8589 spdk_spin_unlock(&bdev->internal.spinlock); 8590 return rc; 8591 } 8592 8593 static void 8594 claim_reset(struct spdk_bdev *bdev) 8595 { 8596 assert(spdk_spin_held(&bdev->internal.spinlock)); 8597 assert(claim_type_is_v2(bdev->internal.claim_type)); 8598 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8599 8600 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8601 
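	/* With the claim union zeroed above and claim_type dropped back to
	 * SPDK_BDEV_CLAIM_NONE below, the bdev returns to the unclaimed state. */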
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8602 } 8603 8604 static void 8605 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8606 { 8607 struct spdk_bdev *bdev = desc->bdev; 8608 8609 assert(spdk_spin_held(&bdev->internal.spinlock)); 8610 assert(claim_type_is_v2(bdev->internal.claim_type)); 8611 8612 if (bdev->internal.examine_in_progress == 0) { 8613 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8614 free(desc->claim); 8615 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8616 claim_reset(bdev); 8617 } 8618 } else { 8619 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8620 desc->claim->module = NULL; 8621 desc->claim->desc = NULL; 8622 } 8623 desc->claim = NULL; 8624 } 8625 8626 /* 8627 * End claims v2 8628 */ 8629 8630 struct spdk_bdev * 8631 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8632 { 8633 assert(desc != NULL); 8634 return desc->bdev; 8635 } 8636 8637 int 8638 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8639 { 8640 struct spdk_bdev *bdev, *tmp; 8641 struct spdk_bdev_desc *desc; 8642 int rc = 0; 8643 8644 assert(fn != NULL); 8645 8646 spdk_spin_lock(&g_bdev_mgr.spinlock); 8647 bdev = spdk_bdev_first(); 8648 while (bdev != NULL) { 8649 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8650 if (rc != 0) { 8651 break; 8652 } 8653 rc = bdev_open(bdev, false, desc); 8654 if (rc != 0) { 8655 bdev_desc_free(desc); 8656 if (rc == -ENODEV) { 8657 /* Ignore the error and move to the next bdev. */ 8658 rc = 0; 8659 bdev = spdk_bdev_next(bdev); 8660 continue; 8661 } 8662 break; 8663 } 8664 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8665 8666 rc = fn(ctx, bdev); 8667 8668 spdk_spin_lock(&g_bdev_mgr.spinlock); 8669 tmp = spdk_bdev_next(bdev); 8670 bdev_close(bdev, desc); 8671 if (rc != 0) { 8672 break; 8673 } 8674 bdev = tmp; 8675 } 8676 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8677 8678 return rc; 8679 } 8680 8681 int 8682 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8683 { 8684 struct spdk_bdev *bdev, *tmp; 8685 struct spdk_bdev_desc *desc; 8686 int rc = 0; 8687 8688 assert(fn != NULL); 8689 8690 spdk_spin_lock(&g_bdev_mgr.spinlock); 8691 bdev = spdk_bdev_first_leaf(); 8692 while (bdev != NULL) { 8693 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8694 if (rc != 0) { 8695 break; 8696 } 8697 rc = bdev_open(bdev, false, desc); 8698 if (rc != 0) { 8699 bdev_desc_free(desc); 8700 if (rc == -ENODEV) { 8701 /* Ignore the error and move to the next bdev. 
*/ 8702 rc = 0; 8703 bdev = spdk_bdev_next_leaf(bdev); 8704 continue; 8705 } 8706 break; 8707 } 8708 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8709 8710 rc = fn(ctx, bdev); 8711 8712 spdk_spin_lock(&g_bdev_mgr.spinlock); 8713 tmp = spdk_bdev_next_leaf(bdev); 8714 bdev_close(bdev, desc); 8715 if (rc != 0) { 8716 break; 8717 } 8718 bdev = tmp; 8719 } 8720 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8721 8722 return rc; 8723 } 8724 8725 void 8726 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8727 { 8728 struct iovec *iovs; 8729 int iovcnt; 8730 8731 if (bdev_io == NULL) { 8732 return; 8733 } 8734 8735 switch (bdev_io->type) { 8736 case SPDK_BDEV_IO_TYPE_READ: 8737 case SPDK_BDEV_IO_TYPE_WRITE: 8738 case SPDK_BDEV_IO_TYPE_ZCOPY: 8739 iovs = bdev_io->u.bdev.iovs; 8740 iovcnt = bdev_io->u.bdev.iovcnt; 8741 break; 8742 default: 8743 iovs = NULL; 8744 iovcnt = 0; 8745 break; 8746 } 8747 8748 if (iovp) { 8749 *iovp = iovs; 8750 } 8751 if (iovcntp) { 8752 *iovcntp = iovcnt; 8753 } 8754 } 8755 8756 void * 8757 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8758 { 8759 if (bdev_io == NULL) { 8760 return NULL; 8761 } 8762 8763 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8764 return NULL; 8765 } 8766 8767 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8768 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8769 return bdev_io->u.bdev.md_buf; 8770 } 8771 8772 return NULL; 8773 } 8774 8775 void * 8776 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 8777 { 8778 if (bdev_io == NULL) { 8779 assert(false); 8780 return NULL; 8781 } 8782 8783 return bdev_io->internal.caller_ctx; 8784 } 8785 8786 void 8787 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 8788 { 8789 8790 if (spdk_bdev_module_list_find(bdev_module->name)) { 8791 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 8792 assert(false); 8793 } 8794 8795 spdk_spin_init(&bdev_module->internal.spinlock); 8796 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 8797 8798 /* 8799 * Modules with examine callbacks must be initialized first, so they are 8800 * ready to handle examine callbacks from later modules that will 8801 * register physical bdevs. 
8802 */ 8803 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8804 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8805 } else { 8806 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8807 } 8808 } 8809 8810 struct spdk_bdev_module * 8811 spdk_bdev_module_list_find(const char *name) 8812 { 8813 struct spdk_bdev_module *bdev_module; 8814 8815 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8816 if (strcmp(name, bdev_module->name) == 0) { 8817 break; 8818 } 8819 } 8820 8821 return bdev_module; 8822 } 8823 8824 static int 8825 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 8826 { 8827 uint64_t num_blocks; 8828 void *md_buf = NULL; 8829 8830 num_blocks = bdev_io->u.bdev.num_blocks; 8831 8832 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8833 md_buf = (char *)g_bdev_mgr.zero_buffer + 8834 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8835 } 8836 8837 return bdev_write_blocks_with_md(bdev_io->internal.desc, 8838 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8839 g_bdev_mgr.zero_buffer, md_buf, 8840 bdev_io->u.bdev.offset_blocks, num_blocks, 8841 bdev_write_zero_buffer_done, bdev_io); 8842 } 8843 8844 static void 8845 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8846 { 8847 struct spdk_bdev_io *parent_io = cb_arg; 8848 8849 spdk_bdev_free_io(bdev_io); 8850 8851 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 8852 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 8853 } 8854 8855 static void 8856 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8857 { 8858 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8859 ctx->bdev->internal.qos_mod_in_progress = false; 8860 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8861 8862 if (ctx->cb_fn) { 8863 ctx->cb_fn(ctx->cb_arg, status); 8864 } 8865 free(ctx); 8866 } 8867 8868 static void 8869 bdev_disable_qos_done(void *cb_arg) 8870 { 8871 struct set_qos_limit_ctx *ctx = cb_arg; 8872 struct spdk_bdev *bdev = ctx->bdev; 8873 struct spdk_bdev_io *bdev_io; 8874 struct spdk_bdev_qos *qos; 8875 8876 spdk_spin_lock(&bdev->internal.spinlock); 8877 qos = bdev->internal.qos; 8878 bdev->internal.qos = NULL; 8879 spdk_spin_unlock(&bdev->internal.spinlock); 8880 8881 while (!TAILQ_EMPTY(&qos->queued)) { 8882 /* Send queued I/O back to their original thread for resubmission. */ 8883 bdev_io = TAILQ_FIRST(&qos->queued); 8884 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8885 8886 if (bdev_io->internal.io_submit_ch) { 8887 /* 8888 * Channel was changed when sending it to the QoS thread - change it back 8889 * before sending it back to the original thread. 
8890 */ 8891 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8892 bdev_io->internal.io_submit_ch = NULL; 8893 } 8894 8895 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8896 _bdev_io_submit, bdev_io); 8897 } 8898 8899 if (qos->thread != NULL) { 8900 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8901 spdk_poller_unregister(&qos->poller); 8902 } 8903 8904 free(qos); 8905 8906 bdev_set_qos_limit_done(ctx, 0); 8907 } 8908 8909 static void 8910 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8911 { 8912 struct set_qos_limit_ctx *ctx = _ctx; 8913 struct spdk_thread *thread; 8914 8915 spdk_spin_lock(&bdev->internal.spinlock); 8916 thread = bdev->internal.qos->thread; 8917 spdk_spin_unlock(&bdev->internal.spinlock); 8918 8919 if (thread != NULL) { 8920 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8921 } else { 8922 bdev_disable_qos_done(ctx); 8923 } 8924 } 8925 8926 static void 8927 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8928 struct spdk_io_channel *ch, void *_ctx) 8929 { 8930 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8931 8932 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8933 8934 spdk_bdev_for_each_channel_continue(i, 0); 8935 } 8936 8937 static void 8938 bdev_update_qos_rate_limit_msg(void *cb_arg) 8939 { 8940 struct set_qos_limit_ctx *ctx = cb_arg; 8941 struct spdk_bdev *bdev = ctx->bdev; 8942 8943 spdk_spin_lock(&bdev->internal.spinlock); 8944 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 8945 spdk_spin_unlock(&bdev->internal.spinlock); 8946 8947 bdev_set_qos_limit_done(ctx, 0); 8948 } 8949 8950 static void 8951 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8952 struct spdk_io_channel *ch, void *_ctx) 8953 { 8954 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8955 8956 spdk_spin_lock(&bdev->internal.spinlock); 8957 bdev_enable_qos(bdev, bdev_ch); 8958 spdk_spin_unlock(&bdev->internal.spinlock); 8959 spdk_bdev_for_each_channel_continue(i, 0); 8960 } 8961 8962 static void 8963 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 8964 { 8965 struct set_qos_limit_ctx *ctx = _ctx; 8966 8967 bdev_set_qos_limit_done(ctx, status); 8968 } 8969 8970 static void 8971 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 8972 { 8973 int i; 8974 8975 assert(bdev->internal.qos != NULL); 8976 8977 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8978 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8979 bdev->internal.qos->rate_limits[i].limit = limits[i]; 8980 8981 if (limits[i] == 0) { 8982 bdev->internal.qos->rate_limits[i].limit = 8983 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 8984 } 8985 } 8986 } 8987 } 8988 8989 void 8990 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 8991 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 8992 { 8993 struct set_qos_limit_ctx *ctx; 8994 uint32_t limit_set_complement; 8995 uint64_t min_limit_per_sec; 8996 int i; 8997 bool disable_rate_limit = true; 8998 8999 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9000 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9001 continue; 9002 } 9003 9004 if (limits[i] > 0) { 9005 disable_rate_limit = false; 9006 } 9007 9008 if (bdev_qos_is_iops_rate_limit(i) == true) { 9009 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9010 } else { 9011 /* Change from megabyte to byte rate limit */ 9012 limits[i] = limits[i] * 1024 * 1024; 9013 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 
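			/* Illustration using the defaults defined at the top of this file:
			 * a 10 MB/s request becomes 10 * 1024 * 1024 bytes/sec here, which
			 * is already a multiple of SPDK_BDEV_QOS_MIN_BYTES_PER_SEC and is
			 * left unchanged by the rounding below, while an IOPS request of
			 * 1500 would be rounded up to 2000. */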
9014 } 9015 9016 limit_set_complement = limits[i] % min_limit_per_sec; 9017 if (limit_set_complement) { 9018 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9019 limits[i], min_limit_per_sec); 9020 limits[i] += min_limit_per_sec - limit_set_complement; 9021 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9022 } 9023 } 9024 9025 ctx = calloc(1, sizeof(*ctx)); 9026 if (ctx == NULL) { 9027 cb_fn(cb_arg, -ENOMEM); 9028 return; 9029 } 9030 9031 ctx->cb_fn = cb_fn; 9032 ctx->cb_arg = cb_arg; 9033 ctx->bdev = bdev; 9034 9035 spdk_spin_lock(&bdev->internal.spinlock); 9036 if (bdev->internal.qos_mod_in_progress) { 9037 spdk_spin_unlock(&bdev->internal.spinlock); 9038 free(ctx); 9039 cb_fn(cb_arg, -EAGAIN); 9040 return; 9041 } 9042 bdev->internal.qos_mod_in_progress = true; 9043 9044 if (disable_rate_limit == true && bdev->internal.qos) { 9045 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9046 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9047 (bdev->internal.qos->rate_limits[i].limit > 0 && 9048 bdev->internal.qos->rate_limits[i].limit != 9049 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9050 disable_rate_limit = false; 9051 break; 9052 } 9053 } 9054 } 9055 9056 if (disable_rate_limit == false) { 9057 if (bdev->internal.qos == NULL) { 9058 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9059 if (!bdev->internal.qos) { 9060 spdk_spin_unlock(&bdev->internal.spinlock); 9061 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9062 bdev_set_qos_limit_done(ctx, -ENOMEM); 9063 return; 9064 } 9065 } 9066 9067 if (bdev->internal.qos->thread == NULL) { 9068 /* Enabling */ 9069 bdev_set_qos_rate_limits(bdev, limits); 9070 9071 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9072 bdev_enable_qos_done); 9073 } else { 9074 /* Updating */ 9075 bdev_set_qos_rate_limits(bdev, limits); 9076 9077 spdk_thread_send_msg(bdev->internal.qos->thread, 9078 bdev_update_qos_rate_limit_msg, ctx); 9079 } 9080 } else { 9081 if (bdev->internal.qos != NULL) { 9082 bdev_set_qos_rate_limits(bdev, limits); 9083 9084 /* Disabling */ 9085 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9086 bdev_disable_qos_msg_done); 9087 } else { 9088 spdk_spin_unlock(&bdev->internal.spinlock); 9089 bdev_set_qos_limit_done(ctx, 0); 9090 return; 9091 } 9092 } 9093 9094 spdk_spin_unlock(&bdev->internal.spinlock); 9095 } 9096 9097 struct spdk_bdev_histogram_ctx { 9098 spdk_bdev_histogram_status_cb cb_fn; 9099 void *cb_arg; 9100 struct spdk_bdev *bdev; 9101 int status; 9102 }; 9103 9104 static void 9105 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9106 { 9107 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9108 9109 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9110 ctx->bdev->internal.histogram_in_progress = false; 9111 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9112 ctx->cb_fn(ctx->cb_arg, ctx->status); 9113 free(ctx); 9114 } 9115 9116 static void 9117 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9118 struct spdk_io_channel *_ch, void *_ctx) 9119 { 9120 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9121 9122 if (ch->histogram != NULL) { 9123 spdk_histogram_data_free(ch->histogram); 9124 ch->histogram = NULL; 9125 } 9126 spdk_bdev_for_each_channel_continue(i, 0); 9127 } 9128 9129 static void 9130 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9131 { 9132 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9133 9134 if (status 
!= 0) { 9135 ctx->status = status; 9136 ctx->bdev->internal.histogram_enabled = false; 9137 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9138 bdev_histogram_disable_channel_cb); 9139 } else { 9140 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9141 ctx->bdev->internal.histogram_in_progress = false; 9142 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9143 ctx->cb_fn(ctx->cb_arg, ctx->status); 9144 free(ctx); 9145 } 9146 } 9147 9148 static void 9149 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9150 struct spdk_io_channel *_ch, void *_ctx) 9151 { 9152 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9153 int status = 0; 9154 9155 if (ch->histogram == NULL) { 9156 ch->histogram = spdk_histogram_data_alloc(); 9157 if (ch->histogram == NULL) { 9158 status = -ENOMEM; 9159 } 9160 } 9161 9162 spdk_bdev_for_each_channel_continue(i, status); 9163 } 9164 9165 void 9166 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9167 void *cb_arg, bool enable) 9168 { 9169 struct spdk_bdev_histogram_ctx *ctx; 9170 9171 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9172 if (ctx == NULL) { 9173 cb_fn(cb_arg, -ENOMEM); 9174 return; 9175 } 9176 9177 ctx->bdev = bdev; 9178 ctx->status = 0; 9179 ctx->cb_fn = cb_fn; 9180 ctx->cb_arg = cb_arg; 9181 9182 spdk_spin_lock(&bdev->internal.spinlock); 9183 if (bdev->internal.histogram_in_progress) { 9184 spdk_spin_unlock(&bdev->internal.spinlock); 9185 free(ctx); 9186 cb_fn(cb_arg, -EAGAIN); 9187 return; 9188 } 9189 9190 bdev->internal.histogram_in_progress = true; 9191 spdk_spin_unlock(&bdev->internal.spinlock); 9192 9193 bdev->internal.histogram_enabled = enable; 9194 9195 if (enable) { 9196 /* Allocate histogram for each channel */ 9197 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9198 bdev_histogram_enable_channel_cb); 9199 } else { 9200 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9201 bdev_histogram_disable_channel_cb); 9202 } 9203 } 9204 9205 struct spdk_bdev_histogram_data_ctx { 9206 spdk_bdev_histogram_data_cb cb_fn; 9207 void *cb_arg; 9208 struct spdk_bdev *bdev; 9209 /** merged histogram data from all channels */ 9210 struct spdk_histogram_data *histogram; 9211 }; 9212 9213 static void 9214 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9215 { 9216 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9217 9218 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9219 free(ctx); 9220 } 9221 9222 static void 9223 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9224 struct spdk_io_channel *_ch, void *_ctx) 9225 { 9226 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9227 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9228 int status = 0; 9229 9230 if (ch->histogram == NULL) { 9231 status = -EFAULT; 9232 } else { 9233 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9234 } 9235 9236 spdk_bdev_for_each_channel_continue(i, status); 9237 } 9238 9239 void 9240 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9241 spdk_bdev_histogram_data_cb cb_fn, 9242 void *cb_arg) 9243 { 9244 struct spdk_bdev_histogram_data_ctx *ctx; 9245 9246 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9247 if (ctx == NULL) { 9248 cb_fn(cb_arg, -ENOMEM, NULL); 9249 return; 9250 } 9251 9252 ctx->bdev = bdev; 9253 ctx->cb_fn = cb_fn; 9254 ctx->cb_arg = cb_arg; 9255 9256 ctx->histogram = histogram; 
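	/* The caller-provided histogram acts as the accumulator: each channel's
	 * histogram is merged into it as the per-channel iteration below runs. */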
9257 9258 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9259 bdev_histogram_get_channel_cb); 9260 } 9261 9262 void 9263 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9264 void *cb_arg) 9265 { 9266 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9267 int status = 0; 9268 9269 assert(cb_fn != NULL); 9270 9271 if (bdev_ch->histogram == NULL) { 9272 status = -EFAULT; 9273 } 9274 cb_fn(cb_arg, status, bdev_ch->histogram); 9275 } 9276 9277 size_t 9278 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9279 size_t max_events) 9280 { 9281 struct media_event_entry *entry; 9282 size_t num_events = 0; 9283 9284 for (; num_events < max_events; ++num_events) { 9285 entry = TAILQ_FIRST(&desc->pending_media_events); 9286 if (entry == NULL) { 9287 break; 9288 } 9289 9290 events[num_events] = entry->event; 9291 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9292 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9293 } 9294 9295 return num_events; 9296 } 9297 9298 int 9299 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9300 size_t num_events) 9301 { 9302 struct spdk_bdev_desc *desc; 9303 struct media_event_entry *entry; 9304 size_t event_id; 9305 int rc = 0; 9306 9307 assert(bdev->media_events); 9308 9309 spdk_spin_lock(&bdev->internal.spinlock); 9310 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9311 if (desc->write) { 9312 break; 9313 } 9314 } 9315 9316 if (desc == NULL || desc->media_events_buffer == NULL) { 9317 rc = -ENODEV; 9318 goto out; 9319 } 9320 9321 for (event_id = 0; event_id < num_events; ++event_id) { 9322 entry = TAILQ_FIRST(&desc->free_media_events); 9323 if (entry == NULL) { 9324 break; 9325 } 9326 9327 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9328 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9329 entry->event = events[event_id]; 9330 } 9331 9332 rc = event_id; 9333 out: 9334 spdk_spin_unlock(&bdev->internal.spinlock); 9335 return rc; 9336 } 9337 9338 static void 9339 _media_management_notify(void *arg) 9340 { 9341 struct spdk_bdev_desc *desc = arg; 9342 9343 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9344 } 9345 9346 void 9347 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9348 { 9349 struct spdk_bdev_desc *desc; 9350 9351 spdk_spin_lock(&bdev->internal.spinlock); 9352 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9353 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9354 event_notify(desc, _media_management_notify); 9355 } 9356 } 9357 spdk_spin_unlock(&bdev->internal.spinlock); 9358 } 9359 9360 struct locked_lba_range_ctx { 9361 struct lba_range range; 9362 struct lba_range *current_range; 9363 struct lba_range *owner_range; 9364 struct spdk_poller *poller; 9365 lock_range_cb cb_fn; 9366 void *cb_arg; 9367 }; 9368 9369 static void 9370 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9371 { 9372 struct locked_lba_range_ctx *ctx = _ctx; 9373 9374 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9375 free(ctx); 9376 } 9377 9378 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9379 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9380 9381 static void 9382 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9383 { 9384 struct locked_lba_range_ctx *ctx = _ctx; 9385 9386 if (status == -ENOMEM) { 9387 /* One of the channels could not allocate a 

static void
bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
        struct locked_lba_range_ctx *ctx = _ctx;

        if (status == -ENOMEM) {
                /* One of the channels could not allocate a range object.
                 * So we have to go back and clean up any ranges that were
                 * allocated successfully before we return error status to
                 * the caller.  We can reuse the unlock function to do that
                 * clean up.
                 */
                spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
                                           bdev_lock_error_cleanup_cb);
                return;
        }

        /* All channels have locked this range and no I/O overlapping the range
         * is outstanding!  Set the owner_ch for the range object for the
         * locking channel, so that this channel will know that it is allowed
         * to write to this range.
         */
        if (ctx->owner_range != NULL) {
                ctx->owner_range->owner_ch = ctx->range.owner_ch;
        }

        ctx->cb_fn(&ctx->range, ctx->cb_arg, status);

        /* Don't free the ctx here.  Its range is in the bdev's global list of
         * locked ranges still, and will be removed and freed when this range
         * is later unlocked.
         */
}

static int
bdev_lock_lba_range_check_io(void *_i)
{
        struct spdk_bdev_channel_iter *i = _i;
        struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i);
        struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
        struct locked_lba_range_ctx *ctx = i->ctx;
        struct lba_range *range = ctx->current_range;
        struct spdk_bdev_io *bdev_io;

        spdk_poller_unregister(&ctx->poller);

        /* The range is now in the locked_ranges, so no new I/O can be submitted to this
         * range.  But we need to wait until any outstanding I/O overlapping with this
         * range has completed.
         */
        TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
                if (bdev_io_range_is_locked(bdev_io, range)) {
                        ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
                        return SPDK_POLLER_BUSY;
                }
        }

        spdk_bdev_for_each_channel_continue(i, 0);
        return SPDK_POLLER_BUSY;
}
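
/* Per-channel step of the lock: allocate this channel's copy of the range,
 * add it to the channel's locked_ranges list and then wait, via
 * bdev_lock_lba_range_check_io(), for any overlapping submitted I/O to drain.
 */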

static void
bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
                                struct spdk_io_channel *_ch, void *_ctx)
{
        struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
        struct locked_lba_range_ctx *ctx = _ctx;
        struct lba_range *range;

        TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
                if (range->length == ctx->range.length &&
                    range->offset == ctx->range.offset &&
                    range->locked_ctx == ctx->range.locked_ctx) {
                        /* This range already exists on this channel, so don't add
                         * it again.  This can happen when a new channel is created
                         * while the for_each_channel operation is in progress.
                         * Do not check for outstanding I/O in that case, since the
                         * range was locked before any I/O could be submitted to the
                         * new channel.
                         */
                        spdk_bdev_for_each_channel_continue(i, 0);
                        return;
                }
        }

        range = calloc(1, sizeof(*range));
        if (range == NULL) {
                spdk_bdev_for_each_channel_continue(i, -ENOMEM);
                return;
        }

        range->length = ctx->range.length;
        range->offset = ctx->range.offset;
        range->locked_ctx = ctx->range.locked_ctx;
        ctx->current_range = range;
        if (ctx->range.owner_ch == ch) {
                /* This is the range object for the channel that will hold
                 * the lock.  Store it in the ctx object so that we can easily
                 * set its owner_ch after the lock is finally acquired.
                 */
                ctx->owner_range = range;
        }
        TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
        bdev_lock_lba_range_check_io(i);
}

static void
bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
{
        assert(spdk_get_thread() == ctx->range.owner_thread);
        assert(ctx->range.owner_ch == NULL ||
               spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread);

        /* We will add a copy of this range to each channel now. */
        spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx,
                                   bdev_lock_lba_range_cb);
}

static bool
bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
{
        struct lba_range *r;

        TAILQ_FOREACH(r, tailq, tailq) {
                if (bdev_lba_range_overlapped(range, r)) {
                        return true;
                }
        }
        return false;
}

static int
_bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch,
                     uint64_t offset, uint64_t length,
                     lock_range_cb cb_fn, void *cb_arg)
{
        struct locked_lba_range_ctx *ctx;

        ctx = calloc(1, sizeof(*ctx));
        if (ctx == NULL) {
                return -ENOMEM;
        }

        ctx->range.offset = offset;
        ctx->range.length = length;
        ctx->range.owner_thread = spdk_get_thread();
        ctx->range.owner_ch = ch;
        ctx->range.locked_ctx = cb_arg;
        ctx->range.bdev = bdev;
        ctx->cb_fn = cb_fn;
        ctx->cb_arg = cb_arg;

        spdk_spin_lock(&bdev->internal.spinlock);
        if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
                /* There is an active lock overlapping with this range.
                 * Put it on the pending list until this range no
                 * longer overlaps with another.
                 */
                TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
        } else {
                TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
                bdev_lock_lba_range_ctx(bdev, ctx);
        }
        spdk_spin_unlock(&bdev->internal.spinlock);
        return 0;
}

static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
                    uint64_t offset, uint64_t length,
                    lock_range_cb cb_fn, void *cb_arg)
{
        struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
        struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);

        if (cb_arg == NULL) {
                SPDK_ERRLOG("cb_arg must not be NULL\n");
                return -EINVAL;
        }

        return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg);
}

static void
bdev_lock_lba_range_ctx_msg(void *_ctx)
{
        struct locked_lba_range_ctx *ctx = _ctx;

        bdev_lock_lba_range_ctx(ctx->range.bdev, ctx);
}
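
/* Completion of the per-channel unlock iteration.  Any pending lock request
 * that overlapped the range just released is promoted to locked_ranges here
 * (provided it does not overlap another active lock) and restarted on its
 * owner thread.
 */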

static void
bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
        struct locked_lba_range_ctx *ctx = _ctx;
        struct locked_lba_range_ctx *pending_ctx;
        struct lba_range *range, *tmp;

        spdk_spin_lock(&bdev->internal.spinlock);
        /* Check if there are any pending locked ranges that overlap with this range
         * that was just unlocked.  If there are, check that each one does not overlap
         * with any other locked range before calling bdev_lock_lba_range_ctx, which
         * starts the lock process.
         */
        TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
                if (bdev_lba_range_overlapped(range, &ctx->range) &&
                    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
                        TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
                        pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
                        TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
                        spdk_thread_send_msg(pending_ctx->range.owner_thread,
                                             bdev_lock_lba_range_ctx_msg, pending_ctx);
                }
        }
        spdk_spin_unlock(&bdev->internal.spinlock);

        ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
        free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
                                  struct spdk_io_channel *_ch, void *_ctx)
{
        struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
        struct locked_lba_range_ctx *ctx = _ctx;
        TAILQ_HEAD(, spdk_bdev_io) io_locked;
        struct spdk_bdev_io *bdev_io;
        struct lba_range *range;

        TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
                if (ctx->range.offset == range->offset &&
                    ctx->range.length == range->length &&
                    ctx->range.locked_ctx == range->locked_ctx) {
                        TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
                        free(range);
                        break;
                }
        }

        /* Note: we should almost always be able to assert that the range specified
         * was found.  But there are some very rare corner cases where a new channel
         * gets created simultaneously with a range unlock, where this function
         * would execute on that new channel and wouldn't have the range.
         * We also use this to clean up range allocations when a later allocation
         * fails in the locking path.
         * So we can't actually assert() here.
         */

        /* Swap the locked I/O into a temporary list, and then try to submit them again.
         * We could hyper-optimize this to only resubmit locked I/O that overlap
         * with the range that was just unlocked, but this isn't a performance path so
         * we go for simplicity here.
         */
        TAILQ_INIT(&io_locked);
        TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
        while (!TAILQ_EMPTY(&io_locked)) {
                bdev_io = TAILQ_FIRST(&io_locked);
                TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
                bdev_io_submit(bdev_io);
        }

        spdk_bdev_for_each_channel_continue(i, 0);
}
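
/* Unlock entry point shared by bdev_unlock_lba_range() and the unquiesce path.
 * The range is removed from the bdev's global list under the spinlock before
 * the per-channel cleanup starts, so newly created channels cannot inherit it.
 */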

static int
_bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length,
                       lock_range_cb cb_fn, void *cb_arg)
{
        struct locked_lba_range_ctx *ctx;
        struct lba_range *range;

        spdk_spin_lock(&bdev->internal.spinlock);
        /* To start the unlock process, we find the range in the bdev's locked_ranges
         * and remove it.  This ensures new channels don't inherit the locked range.
         * Then we will send a message to each channel to remove the range from its
         * per-channel list.
         */
        TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
                if (range->offset == offset && range->length == length &&
                    (range->owner_ch == NULL || range->locked_ctx == cb_arg)) {
                        break;
                }
        }
        if (range == NULL) {
                assert(false);
                spdk_spin_unlock(&bdev->internal.spinlock);
                return -EINVAL;
        }
        TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
        ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
        spdk_spin_unlock(&bdev->internal.spinlock);

        ctx->cb_fn = cb_fn;
        ctx->cb_arg = cb_arg;

        spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
                                   bdev_unlock_lba_range_cb);
        return 0;
}

static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
                      uint64_t offset, uint64_t length,
                      lock_range_cb cb_fn, void *cb_arg)
{
        struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
        struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
        struct lba_range *range;
        bool range_found = false;

        /* Let's make sure the specified channel actually has a lock on
         * the specified range.  Note that the range must match exactly.
         */
        TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
                if (range->offset == offset && range->length == length &&
                    range->owner_ch == ch && range->locked_ctx == cb_arg) {
                        range_found = true;
                        break;
                }
        }

        if (!range_found) {
                return -EINVAL;
        }

        return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg);
}

struct bdev_quiesce_ctx {
        spdk_bdev_quiesce_cb cb_fn;
        void *cb_arg;
};

static void
bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status)
{
        struct bdev_quiesce_ctx *quiesce_ctx = ctx;

        if (quiesce_ctx->cb_fn != NULL) {
                quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
        }

        free(quiesce_ctx);
}

static void
bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status)
{
        struct bdev_quiesce_ctx *quiesce_ctx = ctx;
        struct spdk_bdev_module *module = range->bdev->module;

        if (status != 0) {
                if (quiesce_ctx->cb_fn != NULL) {
                        quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
                }
                free(quiesce_ctx);
                return;
        }

        spdk_spin_lock(&module->internal.spinlock);
        TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module);
        spdk_spin_unlock(&module->internal.spinlock);

        if (quiesce_ctx->cb_fn != NULL) {
                quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
                quiesce_ctx->cb_fn = NULL;
                quiesce_ctx->cb_arg = NULL;
        }
        /* quiesce_ctx will be freed on unquiesce */
}
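
/* Quiesce is built on top of the LBA range lock: quiescing locks the requested
 * range (the whole bdev for spdk_bdev_quiesce()) with a NULL owner channel, so
 * overlapping I/O submitted on any channel is held until the matching
 * unquiesce call.  A minimal calling sketch for a bdev module (the callback
 * names and module pointer below are hypothetical):
 *
 *	static void quiesce_done(void *cb_arg, int status) { ... resume ... }
 *
 *	rc = spdk_bdev_quiesce(bdev, &my_bdev_module, quiesce_done, cb_arg);
 *	...
 *	rc = spdk_bdev_unquiesce(bdev, &my_bdev_module, unquiesce_done, cb_arg);
 */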

static int
_spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
                   uint64_t offset, uint64_t length,
                   spdk_bdev_quiesce_cb cb_fn, void *cb_arg,
                   bool unquiesce)
{
        struct bdev_quiesce_ctx *quiesce_ctx;
        int rc;

        if (module != bdev->module) {
                SPDK_ERRLOG("Bdev does not belong to specified module.\n");
                return -EINVAL;
        }

        if (!bdev_io_valid_blocks(bdev, offset, length)) {
                return -EINVAL;
        }

        if (unquiesce) {
                struct lba_range *range;

                /* Make sure the specified range is actually quiesced in the specified module and
                 * then remove it from the list. Note that the range must match exactly.
                 */
                spdk_spin_lock(&module->internal.spinlock);
                TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) {
                        if (range->bdev == bdev && range->offset == offset && range->length == length) {
                                TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module);
                                break;
                        }
                }
                spdk_spin_unlock(&module->internal.spinlock);

                if (range == NULL) {
                        SPDK_ERRLOG("The range to unquiesce was not found.\n");
                        return -EINVAL;
                }

                quiesce_ctx = range->locked_ctx;
                quiesce_ctx->cb_fn = cb_fn;
                quiesce_ctx->cb_arg = cb_arg;

                rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx);
        } else {
                quiesce_ctx = malloc(sizeof(*quiesce_ctx));
                if (quiesce_ctx == NULL) {
                        return -ENOMEM;
                }

                quiesce_ctx->cb_fn = cb_fn;
                quiesce_ctx->cb_arg = cb_arg;

                rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx);
                if (rc != 0) {
                        free(quiesce_ctx);
                }
        }

        return rc;
}

int
spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
                  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
{
        return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false);
}

int
spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
                    spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
{
        return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true);
}

int
spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
                        uint64_t offset, uint64_t length,
                        spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
{
        return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false);
}

int
spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
                          uint64_t offset, uint64_t length,
                          spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
{
        return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true);
}

int
spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
                             int array_size)
{
        if (!bdev) {
                return -EINVAL;
        }

        if (bdev->fn_table->get_memory_domains) {
                return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
        }

        return 0;
}

struct spdk_bdev_for_each_io_ctx {
        void *ctx;
        spdk_bdev_io_fn fn;
        spdk_bdev_for_each_io_cb cb;
};

static void
bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
                         struct spdk_io_channel *io_ch, void *_ctx)
{
        struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
        struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
        struct spdk_bdev_io *bdev_io;
        int rc = 0;

        TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
                rc = ctx->fn(ctx->ctx, bdev_io);
                if (rc != 0) {
                        break;
                }
        }

        spdk_bdev_for_each_channel_continue(i, rc);
}

static void
bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
{
        struct spdk_bdev_for_each_io_ctx *ctx = _ctx;

        ctx->cb(ctx->ctx, status);

        free(ctx);
}
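
/* Iterate every I/O currently submitted to any channel of the bdev.  fn runs
 * on each channel's thread for that channel's I/O; a non-zero return from fn
 * stops the iteration and that status is passed to cb.
 */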

void
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
                           spdk_bdev_for_each_io_cb cb)
{
        struct spdk_bdev_for_each_io_ctx *ctx;

        assert(fn != NULL && cb != NULL);

        ctx = calloc(1, sizeof(*ctx));
        if (ctx == NULL) {
                SPDK_ERRLOG("Failed to allocate context.\n");
                cb(_ctx, -ENOMEM);
                return;
        }

        ctx->ctx = _ctx;
        ctx->fn = fn;
        ctx->cb = cb;

        spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
                                   bdev_for_each_io_done);
}

void
spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
{
        spdk_for_each_channel_continue(iter->i, status);
}

static struct spdk_bdev *
io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
{
        void *io_device = spdk_io_channel_iter_get_io_device(i);

        return __bdev_from_io_dev(io_device);
}

static void
bdev_each_channel_msg(struct spdk_io_channel_iter *i)
{
        struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
        struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
        struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);

        iter->i = i;
        iter->fn(iter, bdev, ch, iter->ctx);
}

static void
bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
{
        struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
        struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);

        iter->i = i;
        iter->cpl(bdev, iter->ctx, status);

        free(iter);
}

void
spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn,
                           void *ctx, spdk_bdev_for_each_channel_done cpl)
{
        struct spdk_bdev_channel_iter *iter;

        assert(bdev != NULL && fn != NULL && ctx != NULL);

        iter = calloc(1, sizeof(struct spdk_bdev_channel_iter));
        if (iter == NULL) {
                SPDK_ERRLOG("Unable to allocate iterator\n");
                assert(false);
                return;
        }

        iter->fn = fn;
        iter->cpl = cpl;
        iter->ctx = ctx;

        spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg,
                              iter, bdev_each_channel_cpl);
}
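
/* Copy emulation for bdevs that do not support SPDK_BDEV_IO_TYPE_COPY:
 * spdk_bdev_copy_blocks() below obtains a bounce buffer, reads the source
 * range into it (bdev_copy_do_read) and then writes it to the destination
 * (bdev_copy_do_write).  -ENOMEM from the nested read/write is retried through
 * the bdev I/O wait queue.
 */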

static void
bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
        struct spdk_bdev_io *parent_io = cb_arg;

        spdk_bdev_free_io(bdev_io);

        /* Check return status of write */
        parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
        parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
}

static void
bdev_copy_do_write(void *_bdev_io)
{
        struct spdk_bdev_io *bdev_io = _bdev_io;
        int rc;

        /* Write blocks */
        rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc,
                                            spdk_io_channel_from_ctx(bdev_io->internal.ch),
                                            bdev_io->u.bdev.iovs[0].iov_base,
                                            bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks,
                                            bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io);

        if (rc == -ENOMEM) {
                bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write);
        } else if (rc != 0) {
                bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
                bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
        }
}

static void
bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
        struct spdk_bdev_io *parent_io = cb_arg;

        spdk_bdev_free_io(bdev_io);

        /* Check return status of read */
        if (!success) {
                parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
                parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
                return;
        }

        /* Do write */
        bdev_copy_do_write(parent_io);
}

static void
bdev_copy_do_read(void *_bdev_io)
{
        struct spdk_bdev_io *bdev_io = _bdev_io;
        int rc;

        /* Read blocks */
        rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc,
                                           spdk_io_channel_from_ctx(bdev_io->internal.ch),
                                           bdev_io->u.bdev.iovs[0].iov_base,
                                           bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks,
                                           bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io);

        if (rc == -ENOMEM) {
                bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read);
        } else if (rc != 0) {
                bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
                bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
        }
}

static void
bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
{
        if (!success) {
                bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
                bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
                return;
        }

        bdev_copy_do_read(bdev_io);
}
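
/* A minimal calling sketch for the copy API (the descriptor, channel and
 * completion callback names below are assumed to exist in the caller):
 *
 *	rc = spdk_bdev_copy_blocks(desc, io_ch, dst_lba, src_lba, num_blocks,
 *				   copy_done_cb, cb_arg);
 *	if (rc == -ENOMEM) {
 *		... queue a retry with spdk_bdev_queue_io_wait() ...
 *	}
 */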

int
spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
                      uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
                      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
        struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
        struct spdk_bdev_io *bdev_io;
        struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

        if (!desc->write) {
                return -EBADF;
        }

        if (num_blocks == 0) {
                SPDK_ERRLOG("Can't copy 0 blocks\n");
                return -EINVAL;
        }

        if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
            !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
                SPDK_DEBUGLOG(bdev,
                              "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n",
                              dst_offset_blocks, src_offset_blocks, num_blocks);
                return -EINVAL;
        }

        bdev_io = bdev_channel_get_io(channel);
        if (!bdev_io) {
                return -ENOMEM;
        }

        bdev_io->internal.ch = channel;
        bdev_io->internal.desc = desc;
        bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;

        bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
        bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
        bdev_io->u.bdev.num_blocks = num_blocks;
        bdev_io->u.bdev.memory_domain = NULL;
        bdev_io->u.bdev.memory_domain_ctx = NULL;
        bdev_io->u.bdev.iovs = NULL;
        bdev_io->u.bdev.iovcnt = 0;
        bdev_io->u.bdev.md_buf = NULL;
        bdev_io->u.bdev.accel_sequence = NULL;
        bdev_io_init(bdev_io, bdev, cb_arg, cb);

        if (dst_offset_blocks == src_offset_blocks) {
                bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
                bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);

                return 0;
        }

        /* If the copy size is large and should be split, use the generic split logic
         * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not.
         *
         * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or
         * emulate it using regular read and write requests otherwise.
         */
        if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) ||
            bdev_io->internal.split) {
                bdev_io_submit(bdev_io);
                return 0;
        }

        spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev));

        return 0;
}

SPDK_LOG_REGISTER_COMPONENT(bdev)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
        struct spdk_trace_tpoint_opts opts[] = {
                {
                        "BDEV_IO_START", TRACE_BDEV_IO_START,
                        OWNER_BDEV, OBJECT_BDEV_IO, 1,
                        {
                                { "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
                                { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
                                { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
                                { "len", SPDK_TRACE_ARG_TYPE_INT, 8 },
                                { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }
                        }
                },
                {
                        "BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
                        OWNER_BDEV, OBJECT_BDEV_IO, 0,
                        {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
                },
                {
                        "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
                        OWNER_BDEV, OBJECT_NONE, 1,
                        {
                                { "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
                                { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8 }
                        }
                },
                {
                        "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
                        OWNER_BDEV, OBJECT_NONE, 0,
                        {
                                { "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
                                { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8 }
                        }
                },
        };

        spdk_trace_register_owner(OWNER_BDEV, 'b');
        spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
        spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
        spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
        spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
}
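
/* The tracepoints registered above only emit records when tracing is enabled
 * in the application and the "bdev" tracepoint group is turned on at runtime
 * (for example via the application's tracepoint group mask option).
 */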