/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE 256
#define SPDK_BDEV_AUTO_EXAMINE true
#define BUF_SMALL_POOL_SIZE 8191
#define BUF_LARGE_POOL_SIZE 1023
#define BUF_SMALL_CACHE_SIZE 128
#define BUF_LARGE_CACHE_SIZE 16
#define NOMEM_THRESHOLD_COUNT 8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)
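
/*
 * Illustrative arithmetic for the SPDK_BDEV_QOS_* values above (not used by the
 * code directly): with a 1000 usec timeslice, an rw_ios_per_sec limit of 10000
 * allows at most 10 I/O per timeslice, and an rw_mbytes_per_sec limit of 100
 * allows roughly 104858 bytes per timeslice, subject to the per-timeslice
 * minimums defined above.
 */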

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "v23.09", 0);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	struct spdk_bdev *bdev;
	uint64_t offset;
	uint64_t length;
	void *locked_ctx;
	struct spdk_thread *owner_thread;
	struct spdk_bdev_channel *owner_ch;
	TAILQ_ENTRY(lba_range) tailq;
	TAILQ_ENTRY(lba_range) tailq_module;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
};

static spdk_bdev_init_cb g_init_cb_fn = NULL;
static void *g_init_cb_arg = NULL;

static spdk_bdev_fini_cb g_fini_cb_fn = NULL;
static void *g_fini_cb_arg = NULL;
static struct spdk_thread *g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};
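
/*
 * Note: the rate_limits[] array in struct spdk_bdev_qos below and the
 * qos_rpc_type[] strings above follow the same ordering of limit types:
 * rw_ios_per_sec, rw_mbytes_per_sec, r_mbytes_per_sec, w_mbytes_per_sec.
 */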

struct spdk_bdev_qos {
	/** Rate limits, one entry per rate limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * will queue their I/O awaiting retry here. This makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS (1 << 0)
#define BDEV_CH_QOS_ENABLED (1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Accel channel */
	struct spdk_io_channel *accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t io_locked;

	/* List of I/Os with accel sequence being currently executed */
	bdev_io_tailq_t io_accel_exec;

	/* List of I/Os doing memory domain pull/push */
	bdev_io_tailq_t io_memory_domain;

	uint32_t flags;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t start_tsc;
	uint64_t interval_tsc;
	__itt_string_handle *handle;
	struct spdk_bdev_io_stat *prev_stat;
#endif

	bdev_io_tailq_t queued_resets;

	lba_range_tailq_t locked_ranges;
};

struct media_event_entry {
	struct spdk_bdev_media_event event;
	TAILQ_ENTRY(media_event_entry) tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev *bdev;
	struct spdk_thread *thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	} callback;
	bool closed;
	bool write;
	bool memory_domains_supported;
	bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
	struct spdk_spinlock spinlock;
	uint32_t refs;
	TAILQ_HEAD(, media_event_entry) pending_media_events;
	TAILQ_HEAD(, media_event_entry) free_media_events;
	struct media_event_entry *media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc) link;

	uint64_t timeout_in_sec;
	spdk_bdev_io_timeout_cb cb_fn;
	void *cb_arg;
	struct spdk_poller *io_timeout_poller;
	struct spdk_bdev_module_claim *claim;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

struct spdk_bdev_channel_iter {
	spdk_bdev_for_each_channel_msg fn;
	spdk_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

struct spdk_bdev_io_error_stat {
	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
};

enum bdev_io_retry_state {
	BDEV_IO_RETRY_STATE_INVALID,
	BDEV_IO_RETRY_STATE_PULL,
	BDEV_IO_RETRY_STATE_PULL_MD,
	BDEV_IO_RETRY_STATE_SUBMIT,
	BDEV_IO_RETRY_STATE_PUSH,
	BDEV_IO_RETRY_STATE_PUSH_MD,
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
#define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
#define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))
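
/*
 * The (bdev + 1) offset used by __bdev_to_io_dev() above gives the bdev layer an
 * io_device key that is distinct from the bdev pointer itself, which a bdev
 * module may register as its own io_device. The offset pointer is used only as a
 * lookup key and is never dereferenced.
 */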

static inline void bdev_io_complete(void *ctx);
static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);

static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

#define bdev_get_ext_io_opt(opts, field, defval) \
	(((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \
	  sizeof((opts)->field) <= (opts)->size) ? (opts)->field : (defval))

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);

	/* Do not remove this statement. You should always update it whenever you add a new field,
	 * and do not forget to add the SET_FIELD statement for your added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}
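
/*
 * Typical usage of the two functions above (illustrative only; the values are
 * arbitrary): fetch the current options, override selected fields and apply
 * them before the bdev subsystem is initialized.
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_auto_examine = false;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		SPDK_ERRLOG("invalid bdev option combination\n");
 *	}
 */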

static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = bdev_get_by_name(bdev_name);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return bdev;
}

struct bdev_io_status_string {
	enum spdk_bdev_io_status status;
	const char *str;
};

static const struct bdev_io_status_string bdev_io_status_strings[] = {
	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
};

static const char *
bdev_io_status_get_string(enum spdk_bdev_io_status status)
{
	uint32_t i;

	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
		if (bdev_io_status_strings[i].status == status) {
			return bdev_io_status_strings[i].str;
		}
	}

	return "reserved";
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller *poller;
	spdk_bdev_wait_for_examine_cb cb_fn;
	void *cb_arg;
};

static bool bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}
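
/*
 * Illustrative caller of spdk_bdev_wait_for_examine() (the callback name is
 * hypothetical): defer work until all outstanding module examine actions have
 * completed.
 *
 *	static void
 *	examine_done_cb(void *ctx)
 *	{
 *		// all pending examine_config()/examine_disk() actions are done
 *	}
 *
 *	if (spdk_bdev_wait_for_examine(examine_done_cb, NULL) != 0) {
 *		// -ENOMEM: context allocation failed
 *	}
 */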

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	struct spdk_bdev_module_claim *claim, *tmpclaim;
	uint32_t action;

	if (!bdev_ok_to_examine(bdev)) {
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config) {
			spdk_spin_lock(&module->internal.spinlock);
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call "
					    "spdk_bdev_module_examine_done()\n", module->name);
			}
		}
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* Examine by all bdev modules */
		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (module->examine_disk) {
				spdk_spin_lock(&module->internal.spinlock);
				module->internal.action_in_progress++;
				spdk_spin_unlock(&module->internal.spinlock);
				spdk_spin_unlock(&bdev->internal.spinlock);
				module->examine_disk(bdev);
				spdk_spin_lock(&bdev->internal.spinlock);
			}
		}
		break;
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Examine by the one bdev module with a v1 claim */
		module = bdev->internal.claim.v1.module;
		if (module->examine_disk) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			return;
		}
		break;
	default:
		/* Examine by all bdev modules with a v2 claim */
		assert(claim_type_is_v2(bdev->internal.claim_type));
		/*
		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
		 * list, perhaps accessing freed memory. Without protection, this could happen
		 * while the lock is dropped during the examine callback.
		 */
		bdev->internal.examine_in_progress++;

		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
			module = claim->module;

			if (module == NULL) {
				/* This is a vestigial claim, held by examine_count */
				continue;
			}

			if (module->examine_disk == NULL) {
				continue;
			}

			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);

			/* Call examine_disk without holding internal.spinlock. */
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			spdk_spin_lock(&bdev->internal.spinlock);
		}

		assert(bdev->internal.examine_in_progress > 0);
		bdev->internal.examine_in_progress--;
		if (bdev->internal.examine_in_progress == 0) {
			/* Remove any claims that were released during examine_disk */
			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
				if (claim->desc != NULL) {
					continue;
				}

				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
				free(claim);
			}
			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
				claim_reset(bdev);
			}
		}
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;
	struct spdk_thread *thread = spdk_get_thread();

	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
			    thread ? spdk_thread_get_name(thread) : "null");
		return -EINVAL;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}
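
/*
 * Each allowlist entry above is replayed in the JSON config as a "bdev_examine"
 * RPC, e.g. (illustrative; the bdev name is hypothetical):
 *
 *	{
 *	  "method": "bdev_examine",
 *	  "params": { "name": "Nvme0n1" }
 *	}
 */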

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static inline bool
bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.memory_domain;
}

static inline bool
bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.has_accel_sequence;
}

static inline void
bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
	 * channels we will instead wait for half to complete.
	 */
	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
}

static inline void
bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
}
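
/*
 * Worked example for the nomem threshold above (illustrative): with 100 I/O
 * outstanding, retries start once the count drops to 92 (100 minus
 * NOMEM_THRESHOLD_COUNT); with only 10 outstanding, retries start at 5 (half),
 * so shallow queues do not have to wait for 8 completions.
 */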

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static inline bool
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io->internal.accel_sequence) {
		return false;
	}

	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
	 * bdev module didn't support accel sequences */
	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split;
}

static inline void
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
}

static inline void
bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static void
bdev_io_submit_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;

	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io->internal.accel_sequence = NULL;

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	bdev_io_submit(bdev_io);
}

static void
bdev_io_exec_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);

	/* Since the operations are appended during submission, they're in the opposite order than
	 * how we want to execute them for reads (i.e. we need to execute the most recently added
	 * operation first), so reverse the sequence before executing it.
	 */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
	}

	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_increment_outstanding(ch, ch->shared_resource);
	bdev_io->internal.data_transfer_cpl = cb_fn;

	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
				   bdev_io_exec_sequence_cb, bdev_io);
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf;
		bdev_io->internal.buf = NULL;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_pull_buffer_cpl(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	bdev_io_get_buf_complete(bdev_io, !rc);
}

static void
bdev_io_pull_md_buf_done(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  &bdev_io->internal.orig_md_iov, 1,
							  &bdev_io->internal.bounce_md_iov, 1,
							  bdev_io_pull_md_buf_done, bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain), rc);
			}
		} else {
			memcpy(bdev_io->internal.bounce_md_iov.iov_base,
			       bdev_io->internal.orig_md_iov.iov_base,
			       bdev_io->internal.orig_md_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}

static void
_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	/* save original md_buf */
	bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.orig_md_iov.iov_len = len;
	bdev_io->internal.bounce_md_iov.iov_base = md_buf;
	bdev_io->internal.bounce_md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	bdev_io_pull_md_buf(bdev_io);
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		assert(!bdev_io_use_accel_sequence(bdev_io));

		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static inline void
bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	if (rc) {
		SPDK_ERRLOG("Failed to get data buffer\n");
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	_bdev_io_set_md_buf(bdev_io);
}

static void
bdev_io_pull_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_pull_data_done(bdev_io, status);
}

static void
bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	/* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
	 * sequence, append a copy operation making accel change the src/dst buffers of the previous
	 * operation */
	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) ||
	    (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io->internal.memory_domain,
						    bdev_io->internal.memory_domain_ctx,
						    0, NULL, NULL);
		} else {
			/* We need to reverse the src/dst for reads */
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io->internal.memory_domain,
						    bdev_io->internal.memory_domain_ctx,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL, 0, NULL, NULL);
		}

		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		/* if this is write path, copy data from original buffer to bounce buffer */
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t) bdev_io->internal.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  bdev_io_pull_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			assert(bdev_io->u.bdev.iovcnt == 1);
			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
					      bdev_io->u.bdev.iovs[0].iov_len,
					      bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data_done(bdev_io, rc);
	}
}
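
/*
 * Direction summary for the bounce-buffer helpers (descriptive only): the
 * *_pull_* paths above run before submission and apply to writes (caller
 * buffers, possibly in another memory domain, are copied into the local bounce
 * buffer), while the *_push_* paths later in this file run at completion time
 * and apply to reads (the bounce buffer is copied back to the caller's
 * buffers).
 */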

static void
_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
			      bdev_copy_bounce_buffer_cpl cpl_cb)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;

	bdev_io->internal.data_transfer_cpl = cpl_cb;
	/* save original iovec */
	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data(bdev_io);
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf = buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
		/* Continue in completion callback */
		return;
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}

static inline uint64_t
bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	/* 1-byte alignment needs 0 bytes of extra space, 64-byte alignment needs 63 bytes of extra space, etc. */
	alignment = spdk_bdev_get_buf_align(bdev) - 1;

	return len + alignment + md_len;
}
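
/*
 * Worked example for bdev_io_get_max_buf_len() (illustrative): an 8-block I/O
 * with a 512-byte block size, 8 bytes of separate metadata per block and a
 * 512-byte buffer alignment requirement reserves 4096 + 511 + 64 bytes from
 * the iobuf pool.
 */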

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.buf != NULL);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
	bdev_io->internal.buf = NULL;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
		    struct spdk_bdev_io *bdev_io)
{
	/* After a request is submitted to a bdev module, the ownership of an accel sequence
	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
	 * sequence pointer to make sure we won't touch it anymore. */
	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
		bdev_io->internal.accel_sequence = NULL;
	}

	bdev->fn_table->submit_request(ioch, bdev_io);
}

static inline void
bdev_ch_resubmit_io(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;

	bdev_io_increment_outstanding(bdev_io->internal.ch, bdev_ch->shared_resource);
	bdev_io->internal.error.nvme.cdw0 = 0;
	bdev_io->num_retries++;
	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);

		switch (bdev_io->internal.retry_state) {
		case BDEV_IO_RETRY_STATE_SUBMIT:
			bdev_ch_resubmit_io(bdev_ch, bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL:
			bdev_io_pull_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL_MD:
			bdev_io_pull_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH:
			bdev_io_push_bounce_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH_MD:
			bdev_io_push_bounce_md_buf(bdev_io);
			break;
		default:
			assert(0 && "invalid retry state");
			break;
		}

		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore. Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);

		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted. */
		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
			assert(bdev_io->internal.accel_sequence == NULL);
			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
		}

		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	/* Continue with IO completion flow */
	bdev_io_complete(bdev_io);
}

static void
bdev_io_push_bounce_md_buf_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
}

static inline void
bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
			if (bdev_io_use_memory_domain(bdev_io)) {
				TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_increment_outstanding(ch, ch->shared_resource);
				/* If memory domain is used then we need to call async push function */
				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
								  bdev_io->internal.memory_domain_ctx,
								  &bdev_io->internal.orig_md_iov,
								  (uint32_t)bdev_io->internal.orig_iovcnt,
								  &bdev_io->internal.bounce_md_iov, 1,
								  bdev_io_push_bounce_md_buf_done,
								  bdev_io);
				if (rc == 0) {
					/* Continue IO completion in async callback */
					return;
				}
				TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_decrement_outstanding(ch, ch->shared_resource);
				if (rc != -ENOMEM) {
					SPDK_ERRLOG("Failed to push md to memory domain %s\n",
						    spdk_memory_domain_get_dma_device_id(
							    bdev_io->internal.memory_domain));
				}
			} else {
				memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
				       bdev_io->internal.orig_md_iov.iov_len);
			}
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}

static inline void
bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	assert(bdev_io->internal.data_transfer_cpl);
	if (rc) {
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
	/* disable bouncing buffer for this io */
	bdev_io->internal.orig_iovcnt = 0;
	bdev_io->internal.orig_iovs = NULL;

	bdev_io_push_bounce_md_buf(bdev_io);
}

static void
bdev_io_push_bounce_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_push_bounce_data_done(bdev_io, status);
}

static inline void
bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	assert(!bdev_io_use_accel_sequence(bdev_io));

	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			/* If memory domain is used then we need to call async push function */
			rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t)bdev_io->internal.orig_iovcnt,
							  &bdev_io->internal.bounce_iov, 1,
							  bdev_io_push_bounce_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue IO completion in async callback */
				return;
			}

			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to push data to memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt,
					      bdev_io->internal.bounce_iov.iov_base,
					      bdev_io->internal.bounce_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH);
	} else {
		bdev_io_push_bounce_data_done(bdev_io, rc);
	}
}

static inline void
_bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
{
	bdev_io->internal.data_transfer_cpl = cpl_cb;
	bdev_io_push_bounce_data(bdev_io);
}

static void
bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
{
	struct spdk_bdev_io *bdev_io;

	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len);
}

static void
bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	uint64_t max_len;
	void *buf;

	assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	max_len = bdev_io_get_max_buf_len(bdev_io, len);

	if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) {
		SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
		bdev_io_get_buf_complete(bdev_io, false);
		return;
	}

	bdev_io->internal.buf_len = len;
	buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
			     bdev_io_get_iobuf_cb);
	if (buf != NULL) {
		_bdev_io_set_buf(bdev_io, buf, len);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t alignment;

	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	alignment = spdk_bdev_get_buf_align(bdev);

	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
		/* Buffer already present and aligned */
		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
		return;
	}

	bdev_io_get_buf(bdev_io, len);
}
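
/*
 * Illustrative use of spdk_bdev_io_get_buf() from a bdev module's read path
 * (the callback name is hypothetical):
 *
 *	static void
 *	my_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
 *			   bool success)
 *	{
 *		if (!success) {
 *			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 *			return;
 *		}
 *		// bdev_io->u.bdev.iovs now points at a properly aligned buffer
 *	}
 *
 *	spdk_bdev_io_get_buf(bdev_io, my_read_get_buf_cb,
 *			     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 */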

static void
_bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
			      bool success)
{
	if (!success) {
		SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
			return;
		}
		/* For reads we'll execute the sequence after the data is read, so, for now, only
		 * clear out accel_sequence pointer and submit the IO */
		assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
		bdev_io->u.bdev.accel_sequence = NULL;
	}

	bdev_io_submit(bdev_io);
}

static void
_bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
			       uint64_t len)
{
	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	bdev_io_get_buf(bdev_io, len);
}

void
spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(cb != NULL);
	assert(bdev_io->internal.get_aux_buf_cb == NULL);
	bdev_io->internal.get_aux_buf_cb = cb;
	bdev_io_get_buf(bdev_io, len);
}

static int
bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

static void
bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	int i;
	struct spdk_bdev_qos *qos = bdev->internal.qos;
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	if (!qos) {
		return;
	}

	spdk_bdev_get_qos_rate_limits(bdev, limits);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] > 0) {
			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
		}
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
1956 */ 1957 if (rc == 0) { 1958 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1959 if (m->init_complete) { 1960 m->init_complete(); 1961 } 1962 } 1963 } 1964 1965 cb_fn(cb_arg, rc); 1966 } 1967 1968 static bool 1969 bdev_module_all_actions_completed(void) 1970 { 1971 struct spdk_bdev_module *m; 1972 1973 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1974 if (m->internal.action_in_progress > 0) { 1975 return false; 1976 } 1977 } 1978 return true; 1979 } 1980 1981 static void 1982 bdev_module_action_complete(void) 1983 { 1984 /* 1985 * Don't finish bdev subsystem initialization if 1986 * module pre-initialization is still in progress, or 1987 * the subsystem been already initialized. 1988 */ 1989 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 1990 return; 1991 } 1992 1993 /* 1994 * Check all bdev modules for inits/examinations in progress. If any 1995 * exist, return immediately since we cannot finish bdev subsystem 1996 * initialization until all are completed. 1997 */ 1998 if (!bdev_module_all_actions_completed()) { 1999 return; 2000 } 2001 2002 /* 2003 * Modules already finished initialization - now that all 2004 * the bdev modules have finished their asynchronous I/O 2005 * processing, the entire bdev layer can be marked as complete. 2006 */ 2007 bdev_init_complete(0); 2008 } 2009 2010 static void 2011 bdev_module_action_done(struct spdk_bdev_module *module) 2012 { 2013 spdk_spin_lock(&module->internal.spinlock); 2014 assert(module->internal.action_in_progress > 0); 2015 module->internal.action_in_progress--; 2016 spdk_spin_unlock(&module->internal.spinlock); 2017 bdev_module_action_complete(); 2018 } 2019 2020 void 2021 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2022 { 2023 assert(module->async_init); 2024 bdev_module_action_done(module); 2025 } 2026 2027 void 2028 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2029 { 2030 bdev_module_action_done(module); 2031 } 2032 2033 /** The last initialized bdev module */ 2034 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2035 2036 static void 2037 bdev_init_failed(void *cb_arg) 2038 { 2039 struct spdk_bdev_module *module = cb_arg; 2040 2041 spdk_spin_lock(&module->internal.spinlock); 2042 assert(module->internal.action_in_progress > 0); 2043 module->internal.action_in_progress--; 2044 spdk_spin_unlock(&module->internal.spinlock); 2045 bdev_init_complete(-1); 2046 } 2047 2048 static int 2049 bdev_modules_init(void) 2050 { 2051 struct spdk_bdev_module *module; 2052 int rc = 0; 2053 2054 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2055 g_resume_bdev_module = module; 2056 if (module->async_init) { 2057 spdk_spin_lock(&module->internal.spinlock); 2058 module->internal.action_in_progress = 1; 2059 spdk_spin_unlock(&module->internal.spinlock); 2060 } 2061 rc = module->module_init(); 2062 if (rc != 0) { 2063 /* Bump action_in_progress to prevent other modules from completion of modules_init 2064 * Send message to defer application shutdown until resources are cleaned up */ 2065 spdk_spin_lock(&module->internal.spinlock); 2066 module->internal.action_in_progress = 1; 2067 spdk_spin_unlock(&module->internal.spinlock); 2068 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2069 return rc; 2070 } 2071 } 2072 2073 g_resume_bdev_module = NULL; 2074 return 0; 2075 } 2076 2077 void 2078 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2079 { 2080 int rc = 0; 2081 char mempool_name[32]; 2082 2083 assert(cb_fn != 
NULL); 2084 2085 g_init_cb_fn = cb_fn; 2086 g_init_cb_arg = cb_arg; 2087 2088 spdk_notify_type_register("bdev_register"); 2089 spdk_notify_type_register("bdev_unregister"); 2090 2091 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2092 2093 rc = spdk_iobuf_register_module("bdev"); 2094 if (rc != 0) { 2095 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2096 bdev_init_complete(-1); 2097 return; 2098 } 2099 2100 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2101 g_bdev_opts.bdev_io_pool_size, 2102 sizeof(struct spdk_bdev_io) + 2103 bdev_module_get_max_ctx_size(), 2104 0, 2105 SPDK_ENV_SOCKET_ID_ANY); 2106 2107 if (g_bdev_mgr.bdev_io_pool == NULL) { 2108 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2109 bdev_init_complete(-1); 2110 return; 2111 } 2112 2113 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2114 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2115 if (!g_bdev_mgr.zero_buffer) { 2116 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2117 bdev_init_complete(-1); 2118 return; 2119 } 2120 2121 #ifdef SPDK_CONFIG_VTUNE 2122 SPDK_LOG_DEPRECATED(vtune_support); 2123 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2124 #endif 2125 2126 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2127 bdev_mgmt_channel_destroy, 2128 sizeof(struct spdk_bdev_mgmt_channel), 2129 "bdev_mgr"); 2130 2131 rc = bdev_modules_init(); 2132 g_bdev_mgr.module_init_complete = true; 2133 if (rc != 0) { 2134 SPDK_ERRLOG("bdev modules init failed\n"); 2135 return; 2136 } 2137 2138 bdev_module_action_complete(); 2139 } 2140 2141 static void 2142 bdev_mgr_unregister_cb(void *io_device) 2143 { 2144 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2145 2146 if (g_bdev_mgr.bdev_io_pool) { 2147 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2148 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2149 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2150 g_bdev_opts.bdev_io_pool_size); 2151 } 2152 2153 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2154 } 2155 2156 spdk_free(g_bdev_mgr.zero_buffer); 2157 2158 bdev_examine_allowlist_free(); 2159 2160 cb_fn(g_fini_cb_arg); 2161 g_fini_cb_fn = NULL; 2162 g_fini_cb_arg = NULL; 2163 g_bdev_mgr.init_complete = false; 2164 g_bdev_mgr.module_init_complete = false; 2165 } 2166 2167 static void 2168 bdev_module_fini_iter(void *arg) 2169 { 2170 struct spdk_bdev_module *bdev_module; 2171 2172 /* FIXME: Handling initialization failures is broken now, 2173 * so we won't even try cleaning up after successfully 2174 * initialized modules. if module_init_complete is false, 2175 * just call spdk_bdev_mgr_unregister_cb 2176 */ 2177 if (!g_bdev_mgr.module_init_complete) { 2178 bdev_mgr_unregister_cb(NULL); 2179 return; 2180 } 2181 2182 /* Start iterating from the last touched module */ 2183 if (!g_resume_bdev_module) { 2184 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2185 } else { 2186 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2187 internal.tailq); 2188 } 2189 2190 while (bdev_module) { 2191 if (bdev_module->async_fini) { 2192 /* Save our place so we can resume later. We must 2193 * save the variable here, before calling module_fini() 2194 * below, because in some cases the module may immediately 2195 * call spdk_bdev_module_fini_done() and re-enter 2196 * this function to continue iterating. 
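 * When such a module later calls spdk_bdev_module_fini_done(), the walk
 * resumes on the fini thread from the saved position.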
*/ 2197 g_resume_bdev_module = bdev_module; 2198 } 2199 2200 if (bdev_module->module_fini) { 2201 bdev_module->module_fini(); 2202 } 2203 2204 if (bdev_module->async_fini) { 2205 return; 2206 } 2207 2208 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2209 internal.tailq); 2210 } 2211 2212 g_resume_bdev_module = NULL; 2213 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2214 } 2215 2216 void 2217 spdk_bdev_module_fini_done(void) 2218 { 2219 if (spdk_get_thread() != g_fini_thread) { 2220 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2221 } else { 2222 bdev_module_fini_iter(NULL); 2223 } 2224 } 2225 2226 static void 2227 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2228 { 2229 struct spdk_bdev *bdev = cb_arg; 2230 2231 if (bdeverrno && bdev) { 2232 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2233 bdev->name); 2234 2235 /* 2236 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2237 * bdev; try to continue by manually removing this bdev from the list and continue 2238 * with the next bdev in the list. 2239 */ 2240 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2241 } 2242 2243 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2244 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2245 /* 2246 * Bdev module finish need to be deferred as we might be in the middle of some context 2247 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2248 * after returning. 2249 */ 2250 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2251 return; 2252 } 2253 2254 /* 2255 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2256 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2257 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2258 * base bdevs. 2259 * 2260 * Also, walk the list in the reverse order. 2261 */ 2262 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2263 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2264 spdk_spin_lock(&bdev->internal.spinlock); 2265 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2266 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2267 spdk_spin_unlock(&bdev->internal.spinlock); 2268 continue; 2269 } 2270 spdk_spin_unlock(&bdev->internal.spinlock); 2271 2272 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2273 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2274 return; 2275 } 2276 2277 /* 2278 * If any bdev fails to unclaim underlying bdev properly, we may face the 2279 * case of bdev list consisting of claimed bdevs only (if claims are managed 2280 * correctly, this would mean there's a loop in the claims graph which is 2281 * clearly impossible). Warn and unregister last bdev on the list then. 
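 * Unregistering it anyway keeps shutdown moving; otherwise the bdev list
 * could never drain and spdk_bdev_finish() would stall.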
2282 */ 2283 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2284 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2285 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2286 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2287 return; 2288 } 2289 } 2290 2291 static void 2292 bdev_module_fini_start_iter(void *arg) 2293 { 2294 struct spdk_bdev_module *bdev_module; 2295 2296 if (!g_resume_bdev_module) { 2297 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2298 } else { 2299 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2300 } 2301 2302 while (bdev_module) { 2303 if (bdev_module->async_fini_start) { 2304 /* Save our place so we can resume later. We must 2305 * save the variable here, before calling fini_start() 2306 * below, because in some cases the module may immediately 2307 * call spdk_bdev_module_fini_start_done() and re-enter 2308 * this function to continue iterating. */ 2309 g_resume_bdev_module = bdev_module; 2310 } 2311 2312 if (bdev_module->fini_start) { 2313 bdev_module->fini_start(); 2314 } 2315 2316 if (bdev_module->async_fini_start) { 2317 return; 2318 } 2319 2320 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2321 } 2322 2323 g_resume_bdev_module = NULL; 2324 2325 bdev_finish_unregister_bdevs_iter(NULL, 0); 2326 } 2327 2328 void 2329 spdk_bdev_module_fini_start_done(void) 2330 { 2331 if (spdk_get_thread() != g_fini_thread) { 2332 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2333 } else { 2334 bdev_module_fini_start_iter(NULL); 2335 } 2336 } 2337 2338 static void 2339 bdev_finish_wait_for_examine_done(void *cb_arg) 2340 { 2341 bdev_module_fini_start_iter(NULL); 2342 } 2343 2344 static void bdev_open_async_fini(void); 2345 2346 void 2347 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2348 { 2349 int rc; 2350 2351 assert(cb_fn != NULL); 2352 2353 g_fini_thread = spdk_get_thread(); 2354 2355 g_fini_cb_fn = cb_fn; 2356 g_fini_cb_arg = cb_arg; 2357 2358 bdev_open_async_fini(); 2359 2360 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2361 if (rc != 0) { 2362 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2363 bdev_finish_wait_for_examine_done(NULL); 2364 } 2365 } 2366 2367 struct spdk_bdev_io * 2368 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2369 { 2370 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2371 struct spdk_bdev_io *bdev_io; 2372 2373 if (ch->per_thread_cache_count > 0) { 2374 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2375 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2376 ch->per_thread_cache_count--; 2377 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2378 /* 2379 * Don't try to look for bdev_ios in the global pool if there are 2380 * waiters on bdev_ios - we don't want this caller to jump the line. 
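 * A NULL return here typically surfaces to the submitter as -ENOMEM; the
 * expected recovery path is spdk_bdev_queue_io_wait(), i.e. the same queue
 * the existing waiters are on.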
2381 */ 2382 bdev_io = NULL; 2383 } else { 2384 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2385 } 2386 2387 return bdev_io; 2388 } 2389 2390 void 2391 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2392 { 2393 struct spdk_bdev_mgmt_channel *ch; 2394 2395 assert(bdev_io != NULL); 2396 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2397 2398 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2399 2400 if (bdev_io->internal.buf != NULL) { 2401 bdev_io_put_buf(bdev_io); 2402 } 2403 2404 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2405 ch->per_thread_cache_count++; 2406 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2407 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2408 struct spdk_bdev_io_wait_entry *entry; 2409 2410 entry = TAILQ_FIRST(&ch->io_wait_queue); 2411 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2412 entry->cb_fn(entry->cb_arg); 2413 } 2414 } else { 2415 /* We should never have a full cache with entries on the io wait queue. */ 2416 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2417 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2418 } 2419 } 2420 2421 static bool 2422 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2423 { 2424 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2425 2426 switch (limit) { 2427 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2428 return true; 2429 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2430 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2431 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2432 return false; 2433 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2434 default: 2435 return false; 2436 } 2437 } 2438 2439 static bool 2440 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2441 { 2442 switch (bdev_io->type) { 2443 case SPDK_BDEV_IO_TYPE_NVME_IO: 2444 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2445 case SPDK_BDEV_IO_TYPE_READ: 2446 case SPDK_BDEV_IO_TYPE_WRITE: 2447 return true; 2448 case SPDK_BDEV_IO_TYPE_ZCOPY: 2449 if (bdev_io->u.bdev.zcopy.start) { 2450 return true; 2451 } else { 2452 return false; 2453 } 2454 default: 2455 return false; 2456 } 2457 } 2458 2459 static bool 2460 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2461 { 2462 switch (bdev_io->type) { 2463 case SPDK_BDEV_IO_TYPE_NVME_IO: 2464 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2465 /* Bit 1 (0x2) set for read operation */ 2466 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2467 return true; 2468 } else { 2469 return false; 2470 } 2471 case SPDK_BDEV_IO_TYPE_READ: 2472 return true; 2473 case SPDK_BDEV_IO_TYPE_ZCOPY: 2474 /* Populate to read from disk */ 2475 if (bdev_io->u.bdev.zcopy.populate) { 2476 return true; 2477 } else { 2478 return false; 2479 } 2480 default: 2481 return false; 2482 } 2483 } 2484 2485 static uint64_t 2486 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2487 { 2488 struct spdk_bdev *bdev = bdev_io->bdev; 2489 2490 switch (bdev_io->type) { 2491 case SPDK_BDEV_IO_TYPE_NVME_IO: 2492 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2493 return bdev_io->u.nvme_passthru.nbytes; 2494 case SPDK_BDEV_IO_TYPE_READ: 2495 case SPDK_BDEV_IO_TYPE_WRITE: 2496 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2497 case SPDK_BDEV_IO_TYPE_ZCOPY: 2498 /* Track the data in the start phase only */ 2499 if (bdev_io->u.bdev.zcopy.start) { 2500 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2501 } else { 2502 return 0; 2503 } 2504 default: 2505 return 0; 2506 } 2507 } 2508 2509 static bool 2510 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 
2511 { 2512 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2513 return true; 2514 } else { 2515 return false; 2516 } 2517 } 2518 2519 static bool 2520 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2521 { 2522 if (bdev_is_read_io(io) == false) { 2523 return false; 2524 } 2525 2526 return bdev_qos_rw_queue_io(limit, io); 2527 } 2528 2529 static bool 2530 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2531 { 2532 if (bdev_is_read_io(io) == true) { 2533 return false; 2534 } 2535 2536 return bdev_qos_rw_queue_io(limit, io); 2537 } 2538 2539 static void 2540 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2541 { 2542 limit->remaining_this_timeslice--; 2543 } 2544 2545 static void 2546 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2547 { 2548 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2549 } 2550 2551 static void 2552 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2553 { 2554 if (bdev_is_read_io(io) == false) { 2555 return; 2556 } 2557 2558 return bdev_qos_rw_bps_update_quota(limit, io); 2559 } 2560 2561 static void 2562 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2563 { 2564 if (bdev_is_read_io(io) == true) { 2565 return; 2566 } 2567 2568 return bdev_qos_rw_bps_update_quota(limit, io); 2569 } 2570 2571 static void 2572 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2573 { 2574 int i; 2575 2576 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2577 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2578 qos->rate_limits[i].queue_io = NULL; 2579 qos->rate_limits[i].update_quota = NULL; 2580 continue; 2581 } 2582 2583 switch (i) { 2584 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2585 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2586 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2587 break; 2588 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2589 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2590 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2591 break; 2592 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2593 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2594 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2595 break; 2596 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2597 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2598 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2599 break; 2600 default: 2601 break; 2602 } 2603 } 2604 } 2605 2606 static void 2607 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2608 struct spdk_bdev_io *bdev_io, 2609 enum spdk_bdev_io_status status) 2610 { 2611 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2612 2613 bdev_io->internal.in_submit_request = true; 2614 bdev_ch->io_outstanding++; 2615 shared_resource->io_outstanding++; 2616 spdk_bdev_io_complete(bdev_io, status); 2617 bdev_io->internal.in_submit_request = false; 2618 } 2619 2620 static inline void 2621 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2622 { 2623 struct spdk_bdev *bdev = bdev_io->bdev; 2624 struct spdk_io_channel *ch = bdev_ch->channel; 2625 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2626 2627 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2628 struct spdk_bdev_mgmt_channel *mgmt_channel = 
shared_resource->mgmt_ch; 2629 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2630 2631 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2632 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2633 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2634 SPDK_BDEV_IO_STATUS_SUCCESS); 2635 return; 2636 } 2637 } 2638 2639 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2640 bdev_io->bdev->split_on_write_unit && 2641 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2642 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2643 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2644 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2645 return; 2646 } 2647 2648 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2649 bdev_ch->io_outstanding++; 2650 shared_resource->io_outstanding++; 2651 bdev_io->internal.in_submit_request = true; 2652 bdev_submit_request(bdev, ch, bdev_io); 2653 bdev_io->internal.in_submit_request = false; 2654 } else { 2655 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2656 } 2657 } 2658 2659 static bool 2660 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2661 { 2662 int i; 2663 2664 if (bdev_qos_io_to_limit(bdev_io) == true) { 2665 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2666 if (!qos->rate_limits[i].queue_io) { 2667 continue; 2668 } 2669 2670 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2671 bdev_io) == true) { 2672 return true; 2673 } 2674 } 2675 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2676 if (!qos->rate_limits[i].update_quota) { 2677 continue; 2678 } 2679 2680 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2681 } 2682 } 2683 2684 return false; 2685 } 2686 2687 static inline void 2688 _bdev_io_do_submit(void *ctx) 2689 { 2690 struct spdk_bdev_io *bdev_io = ctx; 2691 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2692 2693 bdev_io_do_submit(ch, bdev_io); 2694 } 2695 2696 static int 2697 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2698 { 2699 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2700 int submitted_ios = 0; 2701 2702 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2703 if (!bdev_qos_queue_io(qos, bdev_io)) { 2704 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2705 2706 if (bdev_io->internal.io_submit_ch) { 2707 /* Send back the IO to the original thread for the actual processing. 
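 * io_submit_ch is only set when bdev_io_submit() redirected this I/O to the
 * QoS thread, so restoring it resubmits the I/O on the channel it originally
 * arrived on.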
*/ 2708 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2709 bdev_io->internal.io_submit_ch = NULL; 2710 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2711 _bdev_io_do_submit, bdev_io); 2712 } else { 2713 bdev_io_do_submit(ch, bdev_io); 2714 } 2715 2716 submitted_ios++; 2717 } 2718 } 2719 2720 return submitted_ios; 2721 } 2722 2723 static void 2724 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2725 { 2726 int rc; 2727 2728 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2729 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2730 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2731 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2732 &bdev_io->internal.waitq_entry); 2733 if (rc != 0) { 2734 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2735 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2736 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2737 } 2738 } 2739 2740 static bool 2741 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2742 { 2743 uint32_t io_boundary; 2744 struct spdk_bdev *bdev = bdev_io->bdev; 2745 uint32_t max_size = bdev->max_segment_size; 2746 int max_segs = bdev->max_num_segments; 2747 2748 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2749 io_boundary = bdev->write_unit_size; 2750 } else if (bdev->split_on_optimal_io_boundary) { 2751 io_boundary = bdev->optimal_io_boundary; 2752 } else { 2753 io_boundary = 0; 2754 } 2755 2756 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2757 return false; 2758 } 2759 2760 if (io_boundary) { 2761 uint64_t start_stripe, end_stripe; 2762 2763 start_stripe = bdev_io->u.bdev.offset_blocks; 2764 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2765 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
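 * For example, with a power-of-two boundary of 128 blocks, an I/O covering
 * blocks 120-135 maps to stripes 0 and 1 (120 >> 7 and 135 >> 7), so it
 * crosses the boundary and must be split.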
*/ 2766 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2767 start_stripe >>= spdk_u32log2(io_boundary); 2768 end_stripe >>= spdk_u32log2(io_boundary); 2769 } else { 2770 start_stripe /= io_boundary; 2771 end_stripe /= io_boundary; 2772 } 2773 2774 if (start_stripe != end_stripe) { 2775 return true; 2776 } 2777 } 2778 2779 if (max_segs) { 2780 if (bdev_io->u.bdev.iovcnt > max_segs) { 2781 return true; 2782 } 2783 } 2784 2785 if (max_size) { 2786 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2787 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2788 return true; 2789 } 2790 } 2791 } 2792 2793 return false; 2794 } 2795 2796 static bool 2797 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2798 { 2799 uint32_t num_unmap_segments; 2800 2801 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2802 return false; 2803 } 2804 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2805 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2806 return true; 2807 } 2808 2809 return false; 2810 } 2811 2812 static bool 2813 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2814 { 2815 if (!bdev_io->bdev->max_write_zeroes) { 2816 return false; 2817 } 2818 2819 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2820 return true; 2821 } 2822 2823 return false; 2824 } 2825 2826 static bool 2827 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2828 { 2829 if (bdev_io->bdev->max_copy != 0 && 2830 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2831 return true; 2832 } 2833 2834 return false; 2835 } 2836 2837 static bool 2838 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2839 { 2840 switch (bdev_io->type) { 2841 case SPDK_BDEV_IO_TYPE_READ: 2842 case SPDK_BDEV_IO_TYPE_WRITE: 2843 return bdev_rw_should_split(bdev_io); 2844 case SPDK_BDEV_IO_TYPE_UNMAP: 2845 return bdev_unmap_should_split(bdev_io); 2846 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2847 return bdev_write_zeroes_should_split(bdev_io); 2848 case SPDK_BDEV_IO_TYPE_COPY: 2849 return bdev_copy_should_split(bdev_io); 2850 default: 2851 return false; 2852 } 2853 } 2854 2855 static uint32_t 2856 _to_next_boundary(uint64_t offset, uint32_t boundary) 2857 { 2858 return (boundary - (offset % boundary)); 2859 } 2860 2861 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2862 2863 static void _bdev_rw_split(void *_bdev_io); 2864 2865 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2866 2867 static void 2868 _bdev_unmap_split(void *_bdev_io) 2869 { 2870 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2871 } 2872 2873 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2874 2875 static void 2876 _bdev_write_zeroes_split(void *_bdev_io) 2877 { 2878 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2879 } 2880 2881 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2882 2883 static void 2884 _bdev_copy_split(void *_bdev_io) 2885 { 2886 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2887 } 2888 2889 static int 2890 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2891 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2892 { 2893 int rc; 2894 uint64_t current_offset, current_remaining, current_src_offset; 2895 spdk_bdev_io_wait_cb io_wait_fn; 2896 2897 current_offset = *offset; 2898 current_remaining = *remaining; 2899 2900 bdev_io->u.bdev.split_outstanding++; 2901 2902 io_wait_fn = 
_bdev_rw_split; 2903 switch (bdev_io->type) { 2904 case SPDK_BDEV_IO_TYPE_READ: 2905 assert(bdev_io->u.bdev.accel_sequence == NULL); 2906 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2907 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2908 iov, iovcnt, md_buf, current_offset, 2909 num_blocks, bdev_io->internal.memory_domain, 2910 bdev_io->internal.memory_domain_ctx, NULL, 2911 bdev_io_split_done, bdev_io); 2912 break; 2913 case SPDK_BDEV_IO_TYPE_WRITE: 2914 assert(bdev_io->u.bdev.accel_sequence == NULL); 2915 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2916 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2917 iov, iovcnt, md_buf, current_offset, 2918 num_blocks, bdev_io->internal.memory_domain, 2919 bdev_io->internal.memory_domain_ctx, NULL, 2920 bdev_io_split_done, bdev_io); 2921 break; 2922 case SPDK_BDEV_IO_TYPE_UNMAP: 2923 io_wait_fn = _bdev_unmap_split; 2924 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2925 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2926 current_offset, num_blocks, 2927 bdev_io_split_done, bdev_io); 2928 break; 2929 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2930 io_wait_fn = _bdev_write_zeroes_split; 2931 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2932 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2933 current_offset, num_blocks, 2934 bdev_io_split_done, bdev_io); 2935 break; 2936 case SPDK_BDEV_IO_TYPE_COPY: 2937 io_wait_fn = _bdev_copy_split; 2938 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2939 (current_offset - bdev_io->u.bdev.offset_blocks); 2940 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2941 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2942 current_offset, current_src_offset, num_blocks, 2943 bdev_io_split_done, bdev_io); 2944 break; 2945 default: 2946 assert(false); 2947 rc = -EINVAL; 2948 break; 2949 } 2950 2951 if (rc == 0) { 2952 current_offset += num_blocks; 2953 current_remaining -= num_blocks; 2954 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2955 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2956 *offset = current_offset; 2957 *remaining = current_remaining; 2958 } else { 2959 bdev_io->u.bdev.split_outstanding--; 2960 if (rc == -ENOMEM) { 2961 if (bdev_io->u.bdev.split_outstanding == 0) { 2962 /* No I/O is outstanding. Hence we should wait here. */ 2963 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2964 } 2965 } else { 2966 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2967 if (bdev_io->u.bdev.split_outstanding == 0) { 2968 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2969 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2970 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2971 } 2972 } 2973 } 2974 2975 return rc; 2976 } 2977 2978 static void 2979 _bdev_rw_split(void *_bdev_io) 2980 { 2981 struct iovec *parent_iov, *iov; 2982 struct spdk_bdev_io *bdev_io = _bdev_io; 2983 struct spdk_bdev *bdev = bdev_io->bdev; 2984 uint64_t parent_offset, current_offset, remaining; 2985 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2986 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2987 uint32_t iovcnt, iov_len, child_iovsize; 2988 uint32_t blocklen = bdev->blocklen; 2989 uint32_t io_boundary; 2990 uint32_t max_segment_size = bdev->max_segment_size; 2991 uint32_t max_child_iovcnt = bdev->max_num_segments; 2992 void *md_buf = NULL; 2993 int rc; 2994 2995 max_segment_size = max_segment_size ? 
max_segment_size : UINT32_MAX; 2996 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2997 SPDK_BDEV_IO_NUM_CHILD_IOV; 2998 2999 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3000 io_boundary = bdev->write_unit_size; 3001 } else if (bdev->split_on_optimal_io_boundary) { 3002 io_boundary = bdev->optimal_io_boundary; 3003 } else { 3004 io_boundary = UINT32_MAX; 3005 } 3006 3007 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3008 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 3009 parent_offset = bdev_io->u.bdev.offset_blocks; 3010 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3011 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3012 3013 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3014 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3015 if (parent_iov_offset < parent_iov->iov_len) { 3016 break; 3017 } 3018 parent_iov_offset -= parent_iov->iov_len; 3019 } 3020 3021 child_iovcnt = 0; 3022 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3023 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3024 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3025 to_next_boundary = spdk_min(remaining, to_next_boundary); 3026 to_next_boundary_bytes = to_next_boundary * blocklen; 3027 3028 iov = &bdev_io->child_iov[child_iovcnt]; 3029 iovcnt = 0; 3030 3031 if (bdev_io->u.bdev.md_buf) { 3032 md_buf = (char *)bdev_io->u.bdev.md_buf + 3033 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3034 } 3035 3036 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3037 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3038 iovcnt < child_iovsize) { 3039 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3040 iov_len = parent_iov->iov_len - parent_iov_offset; 3041 3042 iov_len = spdk_min(iov_len, max_segment_size); 3043 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3044 to_next_boundary_bytes -= iov_len; 3045 3046 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3047 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3048 3049 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3050 parent_iov_offset += iov_len; 3051 } else { 3052 parent_iovpos++; 3053 parent_iov_offset = 0; 3054 } 3055 child_iovcnt++; 3056 iovcnt++; 3057 } 3058 3059 if (to_next_boundary_bytes > 0) { 3060 /* We had to stop this child I/O early because we ran out of 3061 * child_iov space or were limited by max_num_segments. 3062 * Ensure the iovs to be aligned with block size and 3063 * then adjust to_next_boundary before starting the 3064 * child I/O. 
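 * The loop below walks backwards from the last child iov, trimming
 * to_last_block_bytes so the child I/O ends on a block boundary; the parent
 * iov position is rewound so the trimmed bytes are picked up by the next child.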
3065 */ 3066 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 3067 iovcnt == child_iovsize); 3068 to_last_block_bytes = to_next_boundary_bytes % blocklen; 3069 if (to_last_block_bytes != 0) { 3070 uint32_t child_iovpos = child_iovcnt - 1; 3071 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV 3072 * so the loop will naturally end 3073 */ 3074
3075 to_last_block_bytes = blocklen - to_last_block_bytes; 3076 to_next_boundary_bytes += to_last_block_bytes; 3077 while (to_last_block_bytes > 0 && iovcnt > 0) { 3078 iov_len = spdk_min(to_last_block_bytes, 3079 bdev_io->child_iov[child_iovpos].iov_len); 3080 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 3081 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 3082 child_iovpos--; 3083 if (--iovcnt == 0) { 3084 /* The child I/O shrank to less than a block. If other children are 3085 * outstanding, just return and resume splitting when they complete. 3086 * If this was the first child of the round, fail the parent I/O. 3087 */ 3088 if (bdev_io->u.bdev.split_outstanding == 0) { 3089 SPDK_ERRLOG("The first child io was less than a block size\n"); 3090 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3091 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 3092 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 3093 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3094 } 3095 3096 return; 3097 } 3098 } 3099 3100 to_last_block_bytes -= iov_len; 3101 3102 if (parent_iov_offset == 0) { 3103 parent_iovpos--; 3104 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 3105 } 3106 parent_iov_offset -= iov_len; 3107 } 3108 3109 assert(to_last_block_bytes == 0); 3110 } 3111 to_next_boundary -= to_next_boundary_bytes / blocklen; 3112 } 3113 3114 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 3115 &current_offset, &remaining); 3116 if (spdk_unlikely(rc)) { 3117 return; 3118 } 3119 } 3120 } 3121
3122 static void 3123 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 3124 { 3125 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 3126 uint32_t num_children_reqs = 0; 3127 int rc; 3128 3129 offset = bdev_io->u.bdev.split_current_offset_blocks; 3130 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3131 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 3132 3133 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3134 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 3135 3136 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 3137 &offset, &remaining); 3138 if (spdk_likely(rc == 0)) { 3139 num_children_reqs++; 3140 } else { 3141 return; 3142 } 3143 } 3144 } 3145
3146 static void 3147 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3148 { 3149 uint64_t offset, write_zeroes_blocks, remaining; 3150 uint32_t num_children_reqs = 0; 3151 int rc; 3152 3153 offset = bdev_io->u.bdev.split_current_offset_blocks; 3154 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3155 3156 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3157 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3158 3159 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3160 &offset, &remaining); 3161 if (spdk_likely(rc == 0)) { 3162 num_children_reqs++; 3163 } else { 3164 return; 3165 } 3166 } 3167 } 3168
3169 static void 3170 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3171 { 3172 uint64_t offset,
copy_blocks, remaining; 3173 uint32_t num_children_reqs = 0; 3174 int rc; 3175 3176 offset = bdev_io->u.bdev.split_current_offset_blocks; 3177 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3178 3179 assert(bdev_io->bdev->max_copy != 0); 3180 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3181 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3182 3183 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3184 &offset, &remaining); 3185 if (spdk_likely(rc == 0)) { 3186 num_children_reqs++; 3187 } else { 3188 return; 3189 } 3190 } 3191 } 3192 3193 static void 3194 parent_bdev_io_complete(void *ctx, int rc) 3195 { 3196 struct spdk_bdev_io *parent_io = ctx; 3197 3198 if (rc) { 3199 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3200 } 3201 3202 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3203 parent_io->internal.caller_ctx); 3204 } 3205 3206 static void 3207 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3208 { 3209 struct spdk_bdev_io *bdev_io = ctx; 3210 3211 /* u.bdev.accel_sequence should have already been cleared at this point */ 3212 assert(bdev_io->u.bdev.accel_sequence == NULL); 3213 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3214 bdev_io->internal.accel_sequence = NULL; 3215 3216 if (spdk_unlikely(status != 0)) { 3217 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3218 } 3219 3220 parent_bdev_io_complete(bdev_io, status); 3221 } 3222 3223 static void 3224 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3225 { 3226 struct spdk_bdev_io *parent_io = cb_arg; 3227 3228 spdk_bdev_free_io(bdev_io); 3229 3230 if (!success) { 3231 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3232 /* If any child I/O failed, stop further splitting process. */ 3233 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3234 parent_io->u.bdev.split_remaining_num_blocks = 0; 3235 } 3236 parent_io->u.bdev.split_outstanding--; 3237 if (parent_io->u.bdev.split_outstanding != 0) { 3238 return; 3239 } 3240 3241 /* 3242 * Parent I/O finishes when all blocks are consumed. 3243 */ 3244 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3245 assert(parent_io->internal.cb != bdev_io_split_done); 3246 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3247 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3248 3249 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3250 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3251 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3252 return; 3253 } else if (parent_io->internal.orig_iovcnt != 0 && 3254 !bdev_io_use_accel_sequence(bdev_io)) { 3255 /* bdev IO will be completed in the callback */ 3256 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3257 return; 3258 } 3259 } 3260 3261 parent_bdev_io_complete(parent_io, 0); 3262 return; 3263 } 3264 3265 /* 3266 * Continue with the splitting process. This function will complete the parent I/O if the 3267 * splitting is done. 
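 * For unmap, write zeroes and copy, each round issues at most
 * SPDK_BDEV_MAX_CHILDREN_*_REQS children; read/write rounds are bounded by
 * the available child_iov entries instead.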
3268 */ 3269 switch (parent_io->type) { 3270 case SPDK_BDEV_IO_TYPE_READ: 3271 case SPDK_BDEV_IO_TYPE_WRITE: 3272 _bdev_rw_split(parent_io); 3273 break; 3274 case SPDK_BDEV_IO_TYPE_UNMAP: 3275 bdev_unmap_split(parent_io); 3276 break; 3277 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3278 bdev_write_zeroes_split(parent_io); 3279 break; 3280 case SPDK_BDEV_IO_TYPE_COPY: 3281 bdev_copy_split(parent_io); 3282 break; 3283 default: 3284 assert(false); 3285 break; 3286 } 3287 } 3288 3289 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3290 bool success); 3291 3292 static void 3293 bdev_io_split(struct spdk_bdev_io *bdev_io) 3294 { 3295 assert(bdev_io_should_split(bdev_io)); 3296 3297 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3298 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3299 bdev_io->u.bdev.split_outstanding = 0; 3300 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3301 3302 switch (bdev_io->type) { 3303 case SPDK_BDEV_IO_TYPE_READ: 3304 case SPDK_BDEV_IO_TYPE_WRITE: 3305 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3306 _bdev_rw_split(bdev_io); 3307 } else { 3308 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3309 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3310 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3311 } 3312 break; 3313 case SPDK_BDEV_IO_TYPE_UNMAP: 3314 bdev_unmap_split(bdev_io); 3315 break; 3316 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3317 bdev_write_zeroes_split(bdev_io); 3318 break; 3319 case SPDK_BDEV_IO_TYPE_COPY: 3320 bdev_copy_split(bdev_io); 3321 break; 3322 default: 3323 assert(false); 3324 break; 3325 } 3326 } 3327 3328 static void 3329 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3330 { 3331 if (!success) { 3332 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3333 return; 3334 } 3335 3336 _bdev_rw_split(bdev_io); 3337 } 3338 3339 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3340 * be inlined, at least on some compilers. 
3341 */ 3342 static inline void 3343 _bdev_io_submit(void *ctx) 3344 { 3345 struct spdk_bdev_io *bdev_io = ctx; 3346 struct spdk_bdev *bdev = bdev_io->bdev; 3347 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3348 3349 if (spdk_likely(bdev_ch->flags == 0)) { 3350 bdev_io_do_submit(bdev_ch, bdev_io); 3351 return; 3352 } 3353 3354 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3355 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3356 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3357 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3358 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 3359 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3360 } else { 3361 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 3362 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3363 } 3364 } else { 3365 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3366 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3367 } 3368 } 3369 3370 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3371 3372 bool 3373 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3374 { 3375 if (range1->length == 0 || range2->length == 0) { 3376 return false; 3377 } 3378 3379 if (range1->offset + range1->length <= range2->offset) { 3380 return false; 3381 } 3382 3383 if (range2->offset + range2->length <= range1->offset) { 3384 return false; 3385 } 3386 3387 return true; 3388 } 3389 3390 static bool 3391 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3392 { 3393 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3394 struct lba_range r; 3395 3396 switch (bdev_io->type) { 3397 case SPDK_BDEV_IO_TYPE_NVME_IO: 3398 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3399 /* Don't try to decode the NVMe command - just assume worst-case and that 3400 * it overlaps a locked range. 3401 */ 3402 return true; 3403 case SPDK_BDEV_IO_TYPE_WRITE: 3404 case SPDK_BDEV_IO_TYPE_UNMAP: 3405 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3406 case SPDK_BDEV_IO_TYPE_ZCOPY: 3407 case SPDK_BDEV_IO_TYPE_COPY: 3408 r.offset = bdev_io->u.bdev.offset_blocks; 3409 r.length = bdev_io->u.bdev.num_blocks; 3410 if (!bdev_lba_range_overlapped(range, &r)) { 3411 /* This I/O doesn't overlap the specified LBA range. */ 3412 return false; 3413 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3414 /* This I/O overlaps, but the I/O is on the same channel that locked this 3415 * range, and the caller_ctx is the same as the locked_ctx. This means 3416 * that this I/O is associated with the lock, and is allowed to execute. 
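 * Any other overlapping I/O is parked on the channel's io_locked list by
 * bdev_io_submit() and resubmitted once the range is unlocked.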
3417 */ 3418 return false; 3419 } else { 3420 return true; 3421 } 3422 default: 3423 return false; 3424 } 3425 } 3426 3427 void 3428 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3429 { 3430 struct spdk_bdev *bdev = bdev_io->bdev; 3431 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3432 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3433 3434 assert(thread != NULL); 3435 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3436 3437 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3438 struct lba_range *range; 3439 3440 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3441 if (bdev_io_range_is_locked(bdev_io, range)) { 3442 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3443 return; 3444 } 3445 } 3446 } 3447 3448 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3449 3450 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3451 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3452 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3453 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3454 spdk_bdev_get_name(bdev)); 3455 3456 if (bdev_io->internal.split) { 3457 bdev_io_split(bdev_io); 3458 return; 3459 } 3460 3461 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3462 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3463 _bdev_io_submit(bdev_io); 3464 } else { 3465 bdev_io->internal.io_submit_ch = ch; 3466 bdev_io->internal.ch = bdev->internal.qos->ch; 3467 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3468 } 3469 } else { 3470 _bdev_io_submit(bdev_io); 3471 } 3472 } 3473 3474 static inline void 3475 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3476 { 3477 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3478 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3479 * For write operation we need to pull buffers from memory domain before submitting IO. 3480 * Once read operation completes, we need to use memory_domain push functionality to 3481 * update data in original memory domain IO buffer 3482 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3483 bdev_io->u.bdev.memory_domain = NULL; 3484 bdev_io->u.bdev.memory_domain_ctx = NULL; 3485 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3486 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3487 } 3488 3489 static inline void 3490 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3491 { 3492 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3493 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3494 3495 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3496 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3497 bdev_io_complete_unsubmitted(bdev_io); 3498 return; 3499 } 3500 3501 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3502 * support them, but we need to execute an accel sequence and the data buffer is from accel 3503 * memory domain (to avoid doing a push/pull from that domain). 
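 * In that case the memory domain pointers are cleared and the request goes
 * through the regular bounce-buffer allocation path, see
 * _bdev_io_ext_use_bounce_buffer() above.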
3504 */ 3505 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3506 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3507 _bdev_io_ext_use_bounce_buffer(bdev_io); 3508 return; 3509 } 3510 3511 if (needs_exec) { 3512 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3513 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3514 return; 3515 } 3516 /* For reads we'll execute the sequence after the data is read, so, for now, only 3517 * clear out accel_sequence pointer and submit the IO */ 3518 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3519 bdev_io->u.bdev.accel_sequence = NULL; 3520 } 3521 3522 bdev_io_submit(bdev_io); 3523 } 3524 3525 static void 3526 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3527 { 3528 struct spdk_bdev *bdev = bdev_io->bdev; 3529 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3530 struct spdk_io_channel *ch = bdev_ch->channel; 3531 3532 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3533 3534 bdev_io->internal.in_submit_request = true; 3535 bdev_submit_request(bdev, ch, bdev_io); 3536 bdev_io->internal.in_submit_request = false; 3537 } 3538 3539 void 3540 bdev_io_init(struct spdk_bdev_io *bdev_io, 3541 struct spdk_bdev *bdev, void *cb_arg, 3542 spdk_bdev_io_completion_cb cb) 3543 { 3544 bdev_io->bdev = bdev; 3545 bdev_io->internal.caller_ctx = cb_arg; 3546 bdev_io->internal.cb = cb; 3547 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3548 bdev_io->internal.in_submit_request = false; 3549 bdev_io->internal.buf = NULL; 3550 bdev_io->internal.io_submit_ch = NULL; 3551 bdev_io->internal.orig_iovs = NULL; 3552 bdev_io->internal.orig_iovcnt = 0; 3553 bdev_io->internal.orig_md_iov.iov_base = NULL; 3554 bdev_io->internal.error.nvme.cdw0 = 0; 3555 bdev_io->num_retries = 0; 3556 bdev_io->internal.get_buf_cb = NULL; 3557 bdev_io->internal.get_aux_buf_cb = NULL; 3558 bdev_io->internal.memory_domain = NULL; 3559 bdev_io->internal.memory_domain_ctx = NULL; 3560 bdev_io->internal.data_transfer_cpl = NULL; 3561 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3562 bdev_io->internal.accel_sequence = NULL; 3563 bdev_io->internal.has_accel_sequence = false; 3564 } 3565 3566 static bool 3567 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3568 { 3569 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3570 } 3571 3572 bool 3573 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3574 { 3575 bool supported; 3576 3577 supported = bdev_io_type_supported(bdev, io_type); 3578 3579 if (!supported) { 3580 switch (io_type) { 3581 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3582 /* The bdev layer will emulate write zeroes as long as write is supported. 
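 * The emulation path issues plain writes of the subsystem's pre-allocated
 * zero buffer, so advertising support here is safe whenever writes are
 * supported.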
*/ 3583 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3584 break; 3585 default: 3586 break; 3587 } 3588 } 3589 3590 return supported; 3591 } 3592 3593 uint64_t 3594 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3595 { 3596 return bdev_io->internal.submit_tsc; 3597 } 3598 3599 int 3600 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3601 { 3602 if (bdev->fn_table->dump_info_json) { 3603 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3604 } 3605 3606 return 0; 3607 } 3608 3609 static void 3610 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3611 { 3612 uint32_t max_per_timeslice = 0; 3613 int i; 3614 3615 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3616 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3617 qos->rate_limits[i].max_per_timeslice = 0; 3618 continue; 3619 } 3620 3621 max_per_timeslice = qos->rate_limits[i].limit * 3622 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3623 3624 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3625 qos->rate_limits[i].min_per_timeslice); 3626 3627 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3628 } 3629 3630 bdev_qos_set_ops(qos); 3631 } 3632 3633 static int 3634 bdev_channel_poll_qos(void *arg) 3635 { 3636 struct spdk_bdev_qos *qos = arg; 3637 uint64_t now = spdk_get_ticks(); 3638 int i; 3639 3640 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3641 /* We received our callback earlier than expected - return 3642 * immediately and wait to do accounting until at least one 3643 * timeslice has actually expired. This should never happen 3644 * with a well-behaved timer implementation. 3645 */ 3646 return SPDK_POLLER_IDLE; 3647 } 3648 3649 /* Reset for next round of rate limiting */ 3650 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3651 /* We may have allowed the IOs or bytes to slightly overrun in the last 3652 * timeslice. remaining_this_timeslice is signed, so if it's negative 3653 * here, we'll account for the overrun so that the next timeslice will 3654 * be appropriately reduced. 
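 * For example, with a 1 MiB per-timeslice quota, admitting a 1.5 MiB I/O
 * leaves the counter at -0.5 MiB, so after the refill below the next
 * timeslice effectively starts with only 0.5 MiB of budget.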
3655 */ 3656 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3657 qos->rate_limits[i].remaining_this_timeslice = 0; 3658 } 3659 } 3660 3661 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3662 qos->last_timeslice += qos->timeslice_size; 3663 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3664 qos->rate_limits[i].remaining_this_timeslice += 3665 qos->rate_limits[i].max_per_timeslice; 3666 } 3667 } 3668 3669 return bdev_qos_io_submit(qos->ch, qos); 3670 } 3671 3672 static void 3673 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3674 { 3675 struct spdk_bdev_shared_resource *shared_resource; 3676 struct lba_range *range; 3677 3678 bdev_free_io_stat(ch->stat); 3679 #ifdef SPDK_CONFIG_VTUNE 3680 bdev_free_io_stat(ch->prev_stat); 3681 #endif 3682 3683 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3684 range = TAILQ_FIRST(&ch->locked_ranges); 3685 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3686 free(range); 3687 } 3688 3689 spdk_put_io_channel(ch->channel); 3690 spdk_put_io_channel(ch->accel_channel); 3691 3692 shared_resource = ch->shared_resource; 3693 3694 assert(TAILQ_EMPTY(&ch->io_locked)); 3695 assert(TAILQ_EMPTY(&ch->io_submitted)); 3696 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3697 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3698 assert(ch->io_outstanding == 0); 3699 assert(shared_resource->ref > 0); 3700 shared_resource->ref--; 3701 if (shared_resource->ref == 0) { 3702 assert(shared_resource->io_outstanding == 0); 3703 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3704 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3705 free(shared_resource); 3706 } 3707 } 3708 3709 static void 3710 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3711 { 3712 struct spdk_bdev_qos *qos = bdev->internal.qos; 3713 int i; 3714 3715 assert(spdk_spin_held(&bdev->internal.spinlock)); 3716 3717 /* Rate limiting on this bdev enabled */ 3718 if (qos) { 3719 if (qos->ch == NULL) { 3720 struct spdk_io_channel *io_ch; 3721 3722 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3723 bdev->name, spdk_get_thread()); 3724 3725 /* No qos channel has been selected, so set one up */ 3726 3727 /* Take another reference to ch */ 3728 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3729 assert(io_ch != NULL); 3730 qos->ch = ch; 3731 3732 qos->thread = spdk_io_channel_get_thread(io_ch); 3733 3734 TAILQ_INIT(&qos->queued); 3735 3736 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3737 if (bdev_qos_is_iops_rate_limit(i) == true) { 3738 qos->rate_limits[i].min_per_timeslice = 3739 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3740 } else { 3741 qos->rate_limits[i].min_per_timeslice = 3742 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3743 } 3744 3745 if (qos->rate_limits[i].limit == 0) { 3746 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3747 } 3748 } 3749 bdev_qos_update_max_quota_per_timeslice(qos); 3750 qos->timeslice_size = 3751 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3752 qos->last_timeslice = spdk_get_ticks(); 3753 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3754 qos, 3755 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3756 } 3757 3758 ch->flags |= BDEV_CH_QOS_ENABLED; 3759 } 3760 } 3761 3762 struct poll_timeout_ctx { 3763 struct spdk_bdev_desc *desc; 3764 uint64_t timeout_in_sec; 3765 spdk_bdev_io_timeout_cb cb_fn; 3766 void *cb_arg; 3767 }; 3768 3769 static void 3770 bdev_desc_free(struct spdk_bdev_desc 
*desc) 3771 { 3772 spdk_spin_destroy(&desc->spinlock); 3773 free(desc->media_events_buffer); 3774 free(desc); 3775 } 3776 3777 static void 3778 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3779 { 3780 struct poll_timeout_ctx *ctx = _ctx; 3781 struct spdk_bdev_desc *desc = ctx->desc; 3782 3783 free(ctx); 3784 3785 spdk_spin_lock(&desc->spinlock); 3786 desc->refs--; 3787 if (desc->closed == true && desc->refs == 0) { 3788 spdk_spin_unlock(&desc->spinlock); 3789 bdev_desc_free(desc); 3790 return; 3791 } 3792 spdk_spin_unlock(&desc->spinlock); 3793 } 3794 3795 static void 3796 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3797 struct spdk_io_channel *io_ch, void *_ctx) 3798 { 3799 struct poll_timeout_ctx *ctx = _ctx; 3800 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3801 struct spdk_bdev_desc *desc = ctx->desc; 3802 struct spdk_bdev_io *bdev_io; 3803 uint64_t now; 3804 3805 spdk_spin_lock(&desc->spinlock); 3806 if (desc->closed == true) { 3807 spdk_spin_unlock(&desc->spinlock); 3808 spdk_bdev_for_each_channel_continue(i, -1); 3809 return; 3810 } 3811 spdk_spin_unlock(&desc->spinlock); 3812 3813 now = spdk_get_ticks(); 3814 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3815 /* Exclude any I/O that are generated via splitting. */ 3816 if (bdev_io->internal.cb == bdev_io_split_done) { 3817 continue; 3818 } 3819 3820 /* Once we find an I/O that has not timed out, we can immediately 3821 * exit the loop. 3822 */ 3823 if (now < (bdev_io->internal.submit_tsc + 3824 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3825 goto end; 3826 } 3827 3828 if (bdev_io->internal.desc == desc) { 3829 ctx->cb_fn(ctx->cb_arg, bdev_io); 3830 } 3831 } 3832 3833 end: 3834 spdk_bdev_for_each_channel_continue(i, 0); 3835 } 3836 3837 static int 3838 bdev_poll_timeout_io(void *arg) 3839 { 3840 struct spdk_bdev_desc *desc = arg; 3841 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3842 struct poll_timeout_ctx *ctx; 3843 3844 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3845 if (!ctx) { 3846 SPDK_ERRLOG("failed to allocate memory\n"); 3847 return SPDK_POLLER_BUSY; 3848 } 3849 ctx->desc = desc; 3850 ctx->cb_arg = desc->cb_arg; 3851 ctx->cb_fn = desc->cb_fn; 3852 ctx->timeout_in_sec = desc->timeout_in_sec; 3853 3854 /* Take a ref on the descriptor in case it gets closed while we are checking 3855 * all of the channels. 
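	 * The matching decrement happens in bdev_channel_poll_timeout_io_done() once
	 * the walk over all channels has completed.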
3856 */ 3857 spdk_spin_lock(&desc->spinlock); 3858 desc->refs++; 3859 spdk_spin_unlock(&desc->spinlock); 3860 3861 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3862 bdev_channel_poll_timeout_io_done); 3863 3864 return SPDK_POLLER_BUSY; 3865 } 3866 3867 int 3868 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3869 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3870 { 3871 assert(desc->thread == spdk_get_thread()); 3872 3873 spdk_poller_unregister(&desc->io_timeout_poller); 3874 3875 if (timeout_in_sec) { 3876 assert(cb_fn != NULL); 3877 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3878 desc, 3879 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3880 1000); 3881 if (desc->io_timeout_poller == NULL) { 3882 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3883 return -1; 3884 } 3885 } 3886 3887 desc->cb_fn = cb_fn; 3888 desc->cb_arg = cb_arg; 3889 desc->timeout_in_sec = timeout_in_sec; 3890 3891 return 0; 3892 } 3893 3894 static int 3895 bdev_channel_create(void *io_device, void *ctx_buf) 3896 { 3897 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3898 struct spdk_bdev_channel *ch = ctx_buf; 3899 struct spdk_io_channel *mgmt_io_ch; 3900 struct spdk_bdev_mgmt_channel *mgmt_ch; 3901 struct spdk_bdev_shared_resource *shared_resource; 3902 struct lba_range *range; 3903 3904 ch->bdev = bdev; 3905 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3906 if (!ch->channel) { 3907 return -1; 3908 } 3909 3910 ch->accel_channel = spdk_accel_get_io_channel(); 3911 if (!ch->accel_channel) { 3912 spdk_put_io_channel(ch->channel); 3913 return -1; 3914 } 3915 3916 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3917 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3918 3919 assert(ch->histogram == NULL); 3920 if (bdev->internal.histogram_enabled) { 3921 ch->histogram = spdk_histogram_data_alloc(); 3922 if (ch->histogram == NULL) { 3923 SPDK_ERRLOG("Could not allocate histogram\n"); 3924 } 3925 } 3926 3927 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3928 if (!mgmt_io_ch) { 3929 spdk_put_io_channel(ch->channel); 3930 spdk_put_io_channel(ch->accel_channel); 3931 return -1; 3932 } 3933 3934 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3935 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3936 if (shared_resource->shared_ch == ch->channel) { 3937 spdk_put_io_channel(mgmt_io_ch); 3938 shared_resource->ref++; 3939 break; 3940 } 3941 } 3942 3943 if (shared_resource == NULL) { 3944 shared_resource = calloc(1, sizeof(*shared_resource)); 3945 if (shared_resource == NULL) { 3946 spdk_put_io_channel(ch->channel); 3947 spdk_put_io_channel(ch->accel_channel); 3948 spdk_put_io_channel(mgmt_io_ch); 3949 return -1; 3950 } 3951 3952 shared_resource->mgmt_ch = mgmt_ch; 3953 shared_resource->io_outstanding = 0; 3954 TAILQ_INIT(&shared_resource->nomem_io); 3955 shared_resource->nomem_threshold = 0; 3956 shared_resource->shared_ch = ch->channel; 3957 shared_resource->ref = 1; 3958 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3959 } 3960 3961 ch->io_outstanding = 0; 3962 TAILQ_INIT(&ch->queued_resets); 3963 TAILQ_INIT(&ch->locked_ranges); 3964 ch->flags = 0; 3965 ch->shared_resource = shared_resource; 3966 3967 TAILQ_INIT(&ch->io_submitted); 3968 TAILQ_INIT(&ch->io_locked); 3969 TAILQ_INIT(&ch->io_accel_exec); 3970 TAILQ_INIT(&ch->io_memory_domain); 3971 3972 ch->stat = bdev_alloc_io_stat(false); 3973 if (ch->stat == NULL) { 3974 
bdev_channel_destroy_resource(ch); 3975 return -1; 3976 } 3977 3978 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3979 3980 #ifdef SPDK_CONFIG_VTUNE 3981 { 3982 char *name; 3983 __itt_init_ittlib(NULL, 0); 3984 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3985 if (!name) { 3986 bdev_channel_destroy_resource(ch); 3987 return -1; 3988 } 3989 ch->handle = __itt_string_handle_create(name); 3990 free(name); 3991 ch->start_tsc = spdk_get_ticks(); 3992 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3993 ch->prev_stat = bdev_alloc_io_stat(false); 3994 if (ch->prev_stat == NULL) { 3995 bdev_channel_destroy_resource(ch); 3996 return -1; 3997 } 3998 } 3999 #endif 4000 4001 spdk_spin_lock(&bdev->internal.spinlock); 4002 bdev_enable_qos(bdev, ch); 4003 4004 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4005 struct lba_range *new_range; 4006 4007 new_range = calloc(1, sizeof(*new_range)); 4008 if (new_range == NULL) { 4009 spdk_spin_unlock(&bdev->internal.spinlock); 4010 bdev_channel_destroy_resource(ch); 4011 return -1; 4012 } 4013 new_range->length = range->length; 4014 new_range->offset = range->offset; 4015 new_range->locked_ctx = range->locked_ctx; 4016 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4017 } 4018 4019 spdk_spin_unlock(&bdev->internal.spinlock); 4020 4021 return 0; 4022 } 4023 4024 static int 4025 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4026 void *cb_ctx) 4027 { 4028 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4029 struct spdk_bdev_io *bdev_io; 4030 uint64_t buf_len; 4031 4032 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4033 if (bdev_io->internal.ch == bdev_ch) { 4034 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4035 spdk_iobuf_entry_abort(ch, entry, buf_len); 4036 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4037 } 4038 4039 return 0; 4040 } 4041 4042 /* 4043 * Abort I/O that are waiting on a data buffer. 4044 */ 4045 static void 4046 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4047 { 4048 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4049 bdev_abort_all_buf_io_cb, ch); 4050 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4051 bdev_abort_all_buf_io_cb, ch); 4052 } 4053 4054 /* 4055 * Abort I/O that are queued waiting for submission. These types of I/O are 4056 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4057 */ 4058 static void 4059 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4060 { 4061 struct spdk_bdev_io *bdev_io, *tmp; 4062 4063 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4064 if (bdev_io->internal.ch == ch) { 4065 TAILQ_REMOVE(queue, bdev_io, internal.link); 4066 /* 4067 * spdk_bdev_io_complete() assumes that the completed I/O had 4068 * been submitted to the bdev module. Since in this case it 4069 * hadn't, bump io_outstanding to account for the decrement 4070 * that spdk_bdev_io_complete() will do. 
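				 * Resets are excluded from the increment below because they are
				 * not counted in io_outstanding, so completing them does not
				 * perform that decrement.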
4071 */ 4072 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4073 ch->io_outstanding++; 4074 ch->shared_resource->io_outstanding++; 4075 } 4076 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4077 } 4078 } 4079 } 4080 4081 static bool 4082 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4083 { 4084 struct spdk_bdev_io *bdev_io; 4085 4086 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4087 if (bdev_io == bio_to_abort) { 4088 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4089 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4090 return true; 4091 } 4092 } 4093 4094 return false; 4095 } 4096 4097 static int 4098 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4099 { 4100 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4101 uint64_t buf_len; 4102 4103 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4104 if (bdev_io == bio_to_abort) { 4105 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4106 spdk_iobuf_entry_abort(ch, entry, buf_len); 4107 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4108 return 1; 4109 } 4110 4111 return 0; 4112 } 4113 4114 static bool 4115 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4116 { 4117 int rc; 4118 4119 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4120 bdev_abort_buf_io_cb, bio_to_abort); 4121 if (rc == 1) { 4122 return true; 4123 } 4124 4125 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4126 bdev_abort_buf_io_cb, bio_to_abort); 4127 return rc == 1; 4128 } 4129 4130 static void 4131 bdev_qos_channel_destroy(void *cb_arg) 4132 { 4133 struct spdk_bdev_qos *qos = cb_arg; 4134 4135 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4136 spdk_poller_unregister(&qos->poller); 4137 4138 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4139 4140 free(qos); 4141 } 4142 4143 static int 4144 bdev_qos_destroy(struct spdk_bdev *bdev) 4145 { 4146 int i; 4147 4148 /* 4149 * Cleanly shutting down the QoS poller is tricky, because 4150 * during the asynchronous operation the user could open 4151 * a new descriptor and create a new channel, spawning 4152 * a new QoS poller. 4153 * 4154 * The strategy is to create a new QoS structure here and swap it 4155 * in. The shutdown path then continues to refer to the old one 4156 * until it completes and then releases it. 4157 */ 4158 struct spdk_bdev_qos *new_qos, *old_qos; 4159 4160 old_qos = bdev->internal.qos; 4161 4162 new_qos = calloc(1, sizeof(*new_qos)); 4163 if (!new_qos) { 4164 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4165 return -ENOMEM; 4166 } 4167 4168 /* Copy the old QoS data into the newly allocated structure */ 4169 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4170 4171 /* Zero out the key parts of the QoS structure */ 4172 new_qos->ch = NULL; 4173 new_qos->thread = NULL; 4174 new_qos->poller = NULL; 4175 TAILQ_INIT(&new_qos->queued); 4176 /* 4177 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4178 * It will be used later for the new QoS structure. 
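	 * The per-timeslice values derived from those limits are recalculated by
	 * bdev_qos_update_max_quota_per_timeslice() if a channel enables QoS again.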
4179 */ 4180 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4181 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4182 new_qos->rate_limits[i].min_per_timeslice = 0; 4183 new_qos->rate_limits[i].max_per_timeslice = 0; 4184 } 4185 4186 bdev->internal.qos = new_qos; 4187 4188 if (old_qos->thread == NULL) { 4189 free(old_qos); 4190 } else { 4191 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4192 } 4193 4194 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4195 * been destroyed yet. The destruction path will end up waiting for the final 4196 * channel to be put before it releases resources. */ 4197 4198 return 0; 4199 } 4200 4201 void 4202 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4203 { 4204 total->bytes_read += add->bytes_read; 4205 total->num_read_ops += add->num_read_ops; 4206 total->bytes_written += add->bytes_written; 4207 total->num_write_ops += add->num_write_ops; 4208 total->bytes_unmapped += add->bytes_unmapped; 4209 total->num_unmap_ops += add->num_unmap_ops; 4210 total->bytes_copied += add->bytes_copied; 4211 total->num_copy_ops += add->num_copy_ops; 4212 total->read_latency_ticks += add->read_latency_ticks; 4213 total->write_latency_ticks += add->write_latency_ticks; 4214 total->unmap_latency_ticks += add->unmap_latency_ticks; 4215 total->copy_latency_ticks += add->copy_latency_ticks; 4216 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4217 total->max_read_latency_ticks = add->max_read_latency_ticks; 4218 } 4219 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4220 total->min_read_latency_ticks = add->min_read_latency_ticks; 4221 } 4222 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4223 total->max_write_latency_ticks = add->max_write_latency_ticks; 4224 } 4225 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4226 total->min_write_latency_ticks = add->min_write_latency_ticks; 4227 } 4228 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4229 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4230 } 4231 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4232 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4233 } 4234 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4235 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4236 } 4237 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4238 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4239 } 4240 } 4241 4242 static void 4243 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4244 { 4245 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4246 4247 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4248 memcpy(to_stat->io_error, from_stat->io_error, 4249 sizeof(struct spdk_bdev_io_error_stat)); 4250 } 4251 } 4252 4253 void 4254 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4255 { 4256 stat->max_read_latency_ticks = 0; 4257 stat->min_read_latency_ticks = UINT64_MAX; 4258 stat->max_write_latency_ticks = 0; 4259 stat->min_write_latency_ticks = UINT64_MAX; 4260 stat->max_unmap_latency_ticks = 0; 4261 stat->min_unmap_latency_ticks = UINT64_MAX; 4262 stat->max_copy_latency_ticks = 0; 4263 stat->min_copy_latency_ticks = UINT64_MAX; 4264 4265 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4266 return; 4267 } 
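	/* For SPDK_BDEV_RESET_STAT_ALL, additionally clear the cumulative byte/op
	 * counters, the latency totals and, if allocated, the per-status error counts.
	 */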
4268 4269 stat->bytes_read = 0; 4270 stat->num_read_ops = 0; 4271 stat->bytes_written = 0; 4272 stat->num_write_ops = 0; 4273 stat->bytes_unmapped = 0; 4274 stat->num_unmap_ops = 0; 4275 stat->bytes_copied = 0; 4276 stat->num_copy_ops = 0; 4277 stat->read_latency_ticks = 0; 4278 stat->write_latency_ticks = 0; 4279 stat->unmap_latency_ticks = 0; 4280 stat->copy_latency_ticks = 0; 4281 4282 if (stat->io_error != NULL) { 4283 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4284 } 4285 } 4286 4287 struct spdk_bdev_io_stat * 4288 bdev_alloc_io_stat(bool io_error_stat) 4289 { 4290 struct spdk_bdev_io_stat *stat; 4291 4292 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4293 if (stat == NULL) { 4294 return NULL; 4295 } 4296 4297 if (io_error_stat) { 4298 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4299 if (stat->io_error == NULL) { 4300 free(stat); 4301 return NULL; 4302 } 4303 } else { 4304 stat->io_error = NULL; 4305 } 4306 4307 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4308 4309 return stat; 4310 } 4311 4312 void 4313 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4314 { 4315 if (stat != NULL) { 4316 free(stat->io_error); 4317 free(stat); 4318 } 4319 } 4320 4321 void 4322 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4323 { 4324 int i; 4325 4326 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4327 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4328 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4329 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4330 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4331 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4332 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4333 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4334 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4335 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4336 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4337 stat->min_read_latency_ticks != UINT64_MAX ? 4338 stat->min_read_latency_ticks : 0); 4339 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4340 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4341 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4342 stat->min_write_latency_ticks != UINT64_MAX ? 4343 stat->min_write_latency_ticks : 0); 4344 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4345 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4346 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4347 stat->min_unmap_latency_ticks != UINT64_MAX ? 4348 stat->min_unmap_latency_ticks : 0); 4349 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4350 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4351 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4352 stat->min_copy_latency_ticks != UINT64_MAX ? 
4353 stat->min_copy_latency_ticks : 0); 4354 4355 if (stat->io_error != NULL) { 4356 spdk_json_write_named_object_begin(w, "io_error"); 4357 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4358 if (stat->io_error->error_status[i] != 0) { 4359 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4360 stat->io_error->error_status[i]); 4361 } 4362 } 4363 spdk_json_write_object_end(w); 4364 } 4365 } 4366 4367 static void 4368 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4369 { 4370 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4371 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4372 4373 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4374 bdev_abort_all_buf_io(mgmt_ch, ch); 4375 } 4376 4377 static void 4378 bdev_channel_destroy(void *io_device, void *ctx_buf) 4379 { 4380 struct spdk_bdev_channel *ch = ctx_buf; 4381 4382 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4383 spdk_get_thread()); 4384 4385 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 4386 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4387 4388 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4389 spdk_spin_lock(&ch->bdev->internal.spinlock); 4390 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4391 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4392 4393 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4394 4395 bdev_channel_abort_queued_ios(ch); 4396 4397 if (ch->histogram) { 4398 spdk_histogram_data_free(ch->histogram); 4399 } 4400 4401 bdev_channel_destroy_resource(ch); 4402 } 4403 4404 /* 4405 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4406 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
4407 */ 4408 static int 4409 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4410 { 4411 struct spdk_bdev_name *tmp; 4412 4413 bdev_name->name = strdup(name); 4414 if (bdev_name->name == NULL) { 4415 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4416 return -ENOMEM; 4417 } 4418 4419 bdev_name->bdev = bdev; 4420 4421 spdk_spin_lock(&g_bdev_mgr.spinlock); 4422 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4423 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4424 4425 if (tmp != NULL) { 4426 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4427 free(bdev_name->name); 4428 return -EEXIST; 4429 } 4430 4431 return 0; 4432 } 4433 4434 static void 4435 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4436 { 4437 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4438 free(bdev_name->name); 4439 } 4440 4441 static void 4442 bdev_name_del(struct spdk_bdev_name *bdev_name) 4443 { 4444 spdk_spin_lock(&g_bdev_mgr.spinlock); 4445 bdev_name_del_unsafe(bdev_name); 4446 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4447 } 4448 4449 int 4450 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4451 { 4452 struct spdk_bdev_alias *tmp; 4453 int ret; 4454 4455 if (alias == NULL) { 4456 SPDK_ERRLOG("Empty alias passed\n"); 4457 return -EINVAL; 4458 } 4459 4460 tmp = calloc(1, sizeof(*tmp)); 4461 if (tmp == NULL) { 4462 SPDK_ERRLOG("Unable to allocate alias\n"); 4463 return -ENOMEM; 4464 } 4465 4466 ret = bdev_name_add(&tmp->alias, bdev, alias); 4467 if (ret != 0) { 4468 free(tmp); 4469 return ret; 4470 } 4471 4472 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4473 4474 return 0; 4475 } 4476 4477 static int 4478 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4479 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4480 { 4481 struct spdk_bdev_alias *tmp; 4482 4483 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4484 if (strcmp(alias, tmp->alias.name) == 0) { 4485 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4486 alias_del_fn(&tmp->alias); 4487 free(tmp); 4488 return 0; 4489 } 4490 } 4491 4492 return -ENOENT; 4493 } 4494 4495 int 4496 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4497 { 4498 int rc; 4499 4500 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4501 if (rc == -ENOENT) { 4502 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4503 } 4504 4505 return rc; 4506 } 4507 4508 void 4509 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4510 { 4511 struct spdk_bdev_alias *p, *tmp; 4512 4513 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4514 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4515 bdev_name_del(&p->alias); 4516 free(p); 4517 } 4518 } 4519 4520 struct spdk_io_channel * 4521 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4522 { 4523 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4524 } 4525 4526 void * 4527 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4528 { 4529 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4530 void *ctx = NULL; 4531 4532 if (bdev->fn_table->get_module_ctx) { 4533 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4534 } 4535 4536 return ctx; 4537 } 4538 4539 const char * 4540 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4541 { 4542 return bdev->module->name; 4543 } 4544 4545 const char * 4546 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4547 { 4548 return bdev->name; 4549 } 4550 4551 const char * 4552 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4553 { 4554 return bdev->product_name; 4555 } 4556 4557 
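/*
 * Illustrative sketch, not part of the original file: how a bdev consumer might
 * combine the read-only property getters defined around here. The helper name
 * below is hypothetical; the APIs it calls are all declared in spdk/bdev.h.
 */
static inline void
example_log_bdev_geometry(struct spdk_bdev_desc *desc)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);

	/* Report block geometry and the buffer alignment the bdev requires. */
	SPDK_NOTICELOG("bdev %s: %" PRIu64 " blocks of %" PRIu32 " bytes, "
		       "write_unit_size %" PRIu32 ", buf_align %zu\n",
		       spdk_bdev_get_name(bdev),
		       spdk_bdev_get_num_blocks(bdev),
		       spdk_bdev_get_block_size(bdev),
		       spdk_bdev_get_write_unit_size(bdev),
		       spdk_bdev_get_buf_align(bdev));
}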
const struct spdk_bdev_aliases_list * 4558 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4559 { 4560 return &bdev->aliases; 4561 } 4562 4563 uint32_t 4564 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4565 { 4566 return bdev->blocklen; 4567 } 4568 4569 uint32_t 4570 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4571 { 4572 return bdev->write_unit_size; 4573 } 4574 4575 uint64_t 4576 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4577 { 4578 return bdev->blockcnt; 4579 } 4580 4581 const char * 4582 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4583 { 4584 return qos_rpc_type[type]; 4585 } 4586 4587 void 4588 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4589 { 4590 int i; 4591 4592 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4593 4594 spdk_spin_lock(&bdev->internal.spinlock); 4595 if (bdev->internal.qos) { 4596 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4597 if (bdev->internal.qos->rate_limits[i].limit != 4598 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4599 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4600 if (bdev_qos_is_iops_rate_limit(i) == false) { 4601 /* Change from Byte to Megabyte which is user visible. */ 4602 limits[i] = limits[i] / 1024 / 1024; 4603 } 4604 } 4605 } 4606 } 4607 spdk_spin_unlock(&bdev->internal.spinlock); 4608 } 4609 4610 size_t 4611 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4612 { 4613 return 1 << bdev->required_alignment; 4614 } 4615 4616 uint32_t 4617 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4618 { 4619 return bdev->optimal_io_boundary; 4620 } 4621 4622 bool 4623 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4624 { 4625 return bdev->write_cache; 4626 } 4627 4628 const struct spdk_uuid * 4629 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4630 { 4631 return &bdev->uuid; 4632 } 4633 4634 uint16_t 4635 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4636 { 4637 return bdev->acwu; 4638 } 4639 4640 uint32_t 4641 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4642 { 4643 return bdev->md_len; 4644 } 4645 4646 bool 4647 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4648 { 4649 return (bdev->md_len != 0) && bdev->md_interleave; 4650 } 4651 4652 bool 4653 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4654 { 4655 return (bdev->md_len != 0) && !bdev->md_interleave; 4656 } 4657 4658 bool 4659 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4660 { 4661 return bdev->zoned; 4662 } 4663 4664 uint32_t 4665 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4666 { 4667 if (spdk_bdev_is_md_interleaved(bdev)) { 4668 return bdev->blocklen - bdev->md_len; 4669 } else { 4670 return bdev->blocklen; 4671 } 4672 } 4673 4674 uint32_t 4675 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4676 { 4677 return bdev->phys_blocklen; 4678 } 4679 4680 static uint32_t 4681 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4682 { 4683 if (!spdk_bdev_is_md_interleaved(bdev)) { 4684 return bdev->blocklen + bdev->md_len; 4685 } else { 4686 return bdev->blocklen; 4687 } 4688 } 4689 4690 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4691 typedef enum spdk_dif_type spdk_dif_type_t; 4692 4693 spdk_dif_type_t 4694 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4695 { 4696 if (bdev->md_len != 0) { 4697 return bdev->dif_type; 4698 } else { 4699 return SPDK_DIF_DISABLE; 4700 } 4701 } 4702 4703 bool 4704 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4705 { 4706 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4707 return bdev->dif_is_head_of_md; 4708 } else { 4709 return false; 4710 } 4711 } 4712 4713 bool 4714 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4715 enum spdk_dif_check_type check_type) 4716 { 4717 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4718 return false; 4719 } 4720 4721 switch (check_type) { 4722 case SPDK_DIF_CHECK_TYPE_REFTAG: 4723 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4724 case SPDK_DIF_CHECK_TYPE_APPTAG: 4725 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4726 case SPDK_DIF_CHECK_TYPE_GUARD: 4727 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4728 default: 4729 return false; 4730 } 4731 } 4732 4733 static uint32_t 4734 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4735 { 4736 uint64_t aligned_length, max_write_blocks; 4737 4738 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4739 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4740 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4741 4742 return max_write_blocks; 4743 } 4744 4745 uint32_t 4746 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4747 { 4748 return bdev->max_copy; 4749 } 4750 4751 uint64_t 4752 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4753 { 4754 return bdev->internal.measured_queue_depth; 4755 } 4756 4757 uint64_t 4758 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4759 { 4760 return bdev->internal.period; 4761 } 4762 4763 uint64_t 4764 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4765 { 4766 return bdev->internal.weighted_io_time; 4767 } 4768 4769 uint64_t 4770 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4771 { 4772 return bdev->internal.io_time; 4773 } 4774 4775 static void bdev_update_qd_sampling_period(void *ctx); 4776 4777 static void 4778 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4779 { 4780 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4781 4782 if (bdev->internal.measured_queue_depth) { 4783 bdev->internal.io_time += bdev->internal.period; 4784 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4785 } 4786 4787 bdev->internal.qd_poll_in_progress = false; 4788 4789 bdev_update_qd_sampling_period(bdev); 4790 } 4791 4792 static void 4793 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4794 struct spdk_io_channel *io_ch, void *_ctx) 4795 { 4796 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4797 4798 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4799 spdk_bdev_for_each_channel_continue(i, 0); 4800 } 4801 4802 static int 4803 bdev_calculate_measured_queue_depth(void *ctx) 4804 { 4805 struct spdk_bdev *bdev = ctx; 4806 4807 bdev->internal.qd_poll_in_progress = true; 4808 bdev->internal.temporary_queue_depth = 0; 4809 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4810 return SPDK_POLLER_BUSY; 4811 } 4812 4813 static void 4814 bdev_update_qd_sampling_period(void *ctx) 4815 { 4816 
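	/* Applies a pending change to the queue depth sampling period. If a QD
	 * measurement is still in flight we return here and let
	 * _calculate_measured_qd_cpl() call back in once the per-channel walk
	 * finishes. A new period of 0 stops sampling and closes the internal
	 * qd_desc.
	 */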
struct spdk_bdev *bdev = ctx; 4817 4818 if (bdev->internal.period == bdev->internal.new_period) { 4819 return; 4820 } 4821 4822 if (bdev->internal.qd_poll_in_progress) { 4823 return; 4824 } 4825 4826 bdev->internal.period = bdev->internal.new_period; 4827 4828 spdk_poller_unregister(&bdev->internal.qd_poller); 4829 if (bdev->internal.period != 0) { 4830 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4831 bdev, bdev->internal.period); 4832 } else { 4833 spdk_bdev_close(bdev->internal.qd_desc); 4834 bdev->internal.qd_desc = NULL; 4835 } 4836 } 4837 4838 static void 4839 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4840 { 4841 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4842 } 4843 4844 void 4845 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4846 { 4847 int rc; 4848 4849 if (bdev->internal.new_period == period) { 4850 return; 4851 } 4852 4853 bdev->internal.new_period = period; 4854 4855 if (bdev->internal.qd_desc != NULL) { 4856 assert(bdev->internal.period != 0); 4857 4858 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4859 bdev_update_qd_sampling_period, bdev); 4860 return; 4861 } 4862 4863 assert(bdev->internal.period == 0); 4864 4865 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4866 NULL, &bdev->internal.qd_desc); 4867 if (rc != 0) { 4868 return; 4869 } 4870 4871 bdev->internal.period = period; 4872 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4873 bdev, period); 4874 } 4875 4876 struct bdev_get_current_qd_ctx { 4877 uint64_t current_qd; 4878 spdk_bdev_get_current_qd_cb cb_fn; 4879 void *cb_arg; 4880 }; 4881 4882 static void 4883 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4884 { 4885 struct bdev_get_current_qd_ctx *ctx = _ctx; 4886 4887 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4888 4889 free(ctx); 4890 } 4891 4892 static void 4893 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4894 struct spdk_io_channel *io_ch, void *_ctx) 4895 { 4896 struct bdev_get_current_qd_ctx *ctx = _ctx; 4897 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4898 4899 ctx->current_qd += bdev_ch->io_outstanding; 4900 4901 spdk_bdev_for_each_channel_continue(i, 0); 4902 } 4903 4904 void 4905 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4906 void *cb_arg) 4907 { 4908 struct bdev_get_current_qd_ctx *ctx; 4909 4910 assert(cb_fn != NULL); 4911 4912 ctx = calloc(1, sizeof(*ctx)); 4913 if (ctx == NULL) { 4914 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4915 return; 4916 } 4917 4918 ctx->cb_fn = cb_fn; 4919 ctx->cb_arg = cb_arg; 4920 4921 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4922 } 4923 4924 static void 4925 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4926 { 4927 assert(desc->thread == spdk_get_thread()); 4928 4929 spdk_spin_lock(&desc->spinlock); 4930 desc->refs--; 4931 if (!desc->closed) { 4932 spdk_spin_unlock(&desc->spinlock); 4933 desc->callback.event_fn(type, 4934 desc->bdev, 4935 desc->callback.ctx); 4936 return; 4937 } else if (desc->refs == 0) { 4938 /* This descriptor was closed after this event_notify message was sent. 4939 * spdk_bdev_close() could not free the descriptor since this message was 4940 * in flight, so we free it now using bdev_desc_free(). 
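		 * event_notify() took a reference before sending this message; the
		 * desc->refs-- above releases it, which is what allows the count to
		 * reach zero here.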
4941 */ 4942 spdk_spin_unlock(&desc->spinlock); 4943 bdev_desc_free(desc); 4944 return; 4945 } 4946 spdk_spin_unlock(&desc->spinlock); 4947 } 4948 4949 static void 4950 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 4951 { 4952 spdk_spin_lock(&desc->spinlock); 4953 desc->refs++; 4954 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 4955 spdk_spin_unlock(&desc->spinlock); 4956 } 4957 4958 static void 4959 _resize_notify(void *ctx) 4960 { 4961 struct spdk_bdev_desc *desc = ctx; 4962 4963 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 4964 } 4965 4966 int 4967 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4968 { 4969 struct spdk_bdev_desc *desc; 4970 int ret; 4971 4972 if (size == bdev->blockcnt) { 4973 return 0; 4974 } 4975 4976 spdk_spin_lock(&bdev->internal.spinlock); 4977 4978 /* bdev has open descriptors */ 4979 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4980 bdev->blockcnt > size) { 4981 ret = -EBUSY; 4982 } else { 4983 bdev->blockcnt = size; 4984 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4985 event_notify(desc, _resize_notify); 4986 } 4987 ret = 0; 4988 } 4989 4990 spdk_spin_unlock(&bdev->internal.spinlock); 4991 4992 return ret; 4993 } 4994 4995 /* 4996 * Convert I/O offset and length from bytes to blocks. 4997 * 4998 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4999 */ 5000 static uint64_t 5001 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 5002 uint64_t num_bytes, uint64_t *num_blocks) 5003 { 5004 uint32_t block_size = bdev->blocklen; 5005 uint8_t shift_cnt; 5006 5007 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 5008 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5009 shift_cnt = spdk_u32log2(block_size); 5010 *offset_blocks = offset_bytes >> shift_cnt; 5011 *num_blocks = num_bytes >> shift_cnt; 5012 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5013 (num_bytes - (*num_blocks << shift_cnt)); 5014 } else { 5015 *offset_blocks = offset_bytes / block_size; 5016 *num_blocks = num_bytes / block_size; 5017 return (offset_bytes % block_size) | (num_bytes % block_size); 5018 } 5019 } 5020 5021 static bool 5022 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5023 { 5024 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5025 * has been an overflow and hence the offset has been wrapped around */ 5026 if (offset_blocks + num_blocks < offset_blocks) { 5027 return false; 5028 } 5029 5030 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5031 if (offset_blocks + num_blocks > bdev->blockcnt) { 5032 return false; 5033 } 5034 5035 return true; 5036 } 5037 5038 static void 5039 bdev_seek_complete_cb(void *ctx) 5040 { 5041 struct spdk_bdev_io *bdev_io = ctx; 5042 5043 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5044 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5045 } 5046 5047 static int 5048 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5049 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5050 spdk_bdev_io_completion_cb cb, void *cb_arg) 5051 { 5052 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5053 struct spdk_bdev_io *bdev_io; 5054 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5055 5056 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == 
SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5057 5058 /* Check if offset_blocks is valid looking at the validity of one block */ 5059 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5060 return -EINVAL; 5061 } 5062 5063 bdev_io = bdev_channel_get_io(channel); 5064 if (!bdev_io) { 5065 return -ENOMEM; 5066 } 5067 5068 bdev_io->internal.ch = channel; 5069 bdev_io->internal.desc = desc; 5070 bdev_io->type = io_type; 5071 bdev_io->u.bdev.offset_blocks = offset_blocks; 5072 bdev_io->u.bdev.memory_domain = NULL; 5073 bdev_io->u.bdev.memory_domain_ctx = NULL; 5074 bdev_io->u.bdev.accel_sequence = NULL; 5075 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5076 5077 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5078 /* In case bdev doesn't support seek to next data/hole offset, 5079 * it is assumed that only data and no holes are present */ 5080 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5081 bdev_io->u.bdev.seek.offset = offset_blocks; 5082 } else { 5083 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5084 } 5085 5086 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5087 return 0; 5088 } 5089 5090 bdev_io_submit(bdev_io); 5091 return 0; 5092 } 5093 5094 int 5095 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5096 uint64_t offset_blocks, 5097 spdk_bdev_io_completion_cb cb, void *cb_arg) 5098 { 5099 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5100 } 5101 5102 int 5103 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5104 uint64_t offset_blocks, 5105 spdk_bdev_io_completion_cb cb, void *cb_arg) 5106 { 5107 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5108 } 5109 5110 uint64_t 5111 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 5112 { 5113 return bdev_io->u.bdev.seek.offset; 5114 } 5115 5116 static int 5117 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5118 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5119 spdk_bdev_io_completion_cb cb, void *cb_arg) 5120 { 5121 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5122 struct spdk_bdev_io *bdev_io; 5123 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5124 5125 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5126 return -EINVAL; 5127 } 5128 5129 bdev_io = bdev_channel_get_io(channel); 5130 if (!bdev_io) { 5131 return -ENOMEM; 5132 } 5133 5134 bdev_io->internal.ch = channel; 5135 bdev_io->internal.desc = desc; 5136 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5137 bdev_io->u.bdev.iovs = &bdev_io->iov; 5138 bdev_io->u.bdev.iovs[0].iov_base = buf; 5139 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5140 bdev_io->u.bdev.iovcnt = 1; 5141 bdev_io->u.bdev.md_buf = md_buf; 5142 bdev_io->u.bdev.num_blocks = num_blocks; 5143 bdev_io->u.bdev.offset_blocks = offset_blocks; 5144 bdev_io->u.bdev.memory_domain = NULL; 5145 bdev_io->u.bdev.memory_domain_ctx = NULL; 5146 bdev_io->u.bdev.accel_sequence = NULL; 5147 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5148 5149 bdev_io_submit(bdev_io); 5150 return 0; 5151 } 5152 5153 int 5154 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5155 void *buf, uint64_t offset, uint64_t nbytes, 5156 spdk_bdev_io_completion_cb cb, void *cb_arg) 5157 { 5158 uint64_t offset_blocks, num_blocks; 5159 5160 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5161 nbytes, &num_blocks) != 0) { 5162 return -EINVAL; 5163 } 5164 5165 
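	/* offset and nbytes are now known to be multiples of the block size;
	 * delegate to the block-based variant.
	 */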
return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5166 } 5167 5168 int 5169 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5170 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5171 spdk_bdev_io_completion_cb cb, void *cb_arg) 5172 { 5173 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5174 } 5175 5176 int 5177 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5178 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5179 spdk_bdev_io_completion_cb cb, void *cb_arg) 5180 { 5181 struct iovec iov = { 5182 .iov_base = buf, 5183 }; 5184 5185 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5186 return -EINVAL; 5187 } 5188 5189 if (md_buf && !_is_buf_allocated(&iov)) { 5190 return -EINVAL; 5191 } 5192 5193 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5194 cb, cb_arg); 5195 } 5196 5197 int 5198 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5199 struct iovec *iov, int iovcnt, 5200 uint64_t offset, uint64_t nbytes, 5201 spdk_bdev_io_completion_cb cb, void *cb_arg) 5202 { 5203 uint64_t offset_blocks, num_blocks; 5204 5205 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5206 nbytes, &num_blocks) != 0) { 5207 return -EINVAL; 5208 } 5209 5210 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5211 } 5212 5213 static int 5214 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5215 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5216 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5217 struct spdk_accel_sequence *seq, 5218 spdk_bdev_io_completion_cb cb, void *cb_arg) 5219 { 5220 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5221 struct spdk_bdev_io *bdev_io; 5222 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5223 5224 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5225 return -EINVAL; 5226 } 5227 5228 bdev_io = bdev_channel_get_io(channel); 5229 if (!bdev_io) { 5230 return -ENOMEM; 5231 } 5232 5233 bdev_io->internal.ch = channel; 5234 bdev_io->internal.desc = desc; 5235 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5236 bdev_io->u.bdev.iovs = iov; 5237 bdev_io->u.bdev.iovcnt = iovcnt; 5238 bdev_io->u.bdev.md_buf = md_buf; 5239 bdev_io->u.bdev.num_blocks = num_blocks; 5240 bdev_io->u.bdev.offset_blocks = offset_blocks; 5241 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5242 bdev_io->internal.memory_domain = domain; 5243 bdev_io->internal.memory_domain_ctx = domain_ctx; 5244 bdev_io->internal.accel_sequence = seq; 5245 bdev_io->internal.has_accel_sequence = seq != NULL; 5246 bdev_io->u.bdev.memory_domain = domain; 5247 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5248 bdev_io->u.bdev.accel_sequence = seq; 5249 5250 _bdev_io_submit_ext(desc, bdev_io); 5251 5252 return 0; 5253 } 5254 5255 int 5256 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5257 struct iovec *iov, int iovcnt, 5258 uint64_t offset_blocks, uint64_t num_blocks, 5259 spdk_bdev_io_completion_cb cb, void *cb_arg) 5260 { 5261 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5262 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5263 } 5264 5265 int 5266 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5267 struct 
iovec *iov, int iovcnt, void *md_buf, 5268 uint64_t offset_blocks, uint64_t num_blocks, 5269 spdk_bdev_io_completion_cb cb, void *cb_arg) 5270 { 5271 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5272 return -EINVAL; 5273 } 5274 5275 if (md_buf && !_is_buf_allocated(iov)) { 5276 return -EINVAL; 5277 } 5278 5279 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5280 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5281 } 5282 5283 static inline bool 5284 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5285 { 5286 /* 5287 * We check if opts size is at least of size when we first introduced 5288 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5289 * are not checked internal. 5290 */ 5291 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5292 sizeof(opts->metadata) && 5293 opts->size <= sizeof(*opts) && 5294 /* When memory domain is used, the user must provide data buffers */ 5295 (!opts->memory_domain || (iov && iov[0].iov_base)); 5296 } 5297 5298 int 5299 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5300 struct iovec *iov, int iovcnt, 5301 uint64_t offset_blocks, uint64_t num_blocks, 5302 spdk_bdev_io_completion_cb cb, void *cb_arg, 5303 struct spdk_bdev_ext_io_opts *opts) 5304 { 5305 void *md = NULL; 5306 5307 if (opts) { 5308 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5309 return -EINVAL; 5310 } 5311 md = opts->metadata; 5312 } 5313 5314 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5315 return -EINVAL; 5316 } 5317 5318 if (md && !_is_buf_allocated(iov)) { 5319 return -EINVAL; 5320 } 5321 5322 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5323 num_blocks, 5324 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5325 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5326 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5327 cb, cb_arg); 5328 } 5329 5330 static int 5331 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5332 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5333 spdk_bdev_io_completion_cb cb, void *cb_arg) 5334 { 5335 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5336 struct spdk_bdev_io *bdev_io; 5337 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5338 5339 if (!desc->write) { 5340 return -EBADF; 5341 } 5342 5343 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5344 return -EINVAL; 5345 } 5346 5347 bdev_io = bdev_channel_get_io(channel); 5348 if (!bdev_io) { 5349 return -ENOMEM; 5350 } 5351 5352 bdev_io->internal.ch = channel; 5353 bdev_io->internal.desc = desc; 5354 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5355 bdev_io->u.bdev.iovs = &bdev_io->iov; 5356 bdev_io->u.bdev.iovs[0].iov_base = buf; 5357 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5358 bdev_io->u.bdev.iovcnt = 1; 5359 bdev_io->u.bdev.md_buf = md_buf; 5360 bdev_io->u.bdev.num_blocks = num_blocks; 5361 bdev_io->u.bdev.offset_blocks = offset_blocks; 5362 bdev_io->u.bdev.memory_domain = NULL; 5363 bdev_io->u.bdev.memory_domain_ctx = NULL; 5364 bdev_io->u.bdev.accel_sequence = NULL; 5365 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5366 5367 bdev_io_submit(bdev_io); 5368 return 0; 5369 } 5370 5371 int 5372 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5373 void *buf, uint64_t offset, uint64_t nbytes, 5374 spdk_bdev_io_completion_cb cb, void *cb_arg) 5375 { 5376 uint64_t 
offset_blocks, num_blocks; 5377 5378 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5379 nbytes, &num_blocks) != 0) { 5380 return -EINVAL; 5381 } 5382 5383 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5384 } 5385 5386 int 5387 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5388 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5389 spdk_bdev_io_completion_cb cb, void *cb_arg) 5390 { 5391 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5392 cb, cb_arg); 5393 } 5394 5395 int 5396 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5397 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5398 spdk_bdev_io_completion_cb cb, void *cb_arg) 5399 { 5400 struct iovec iov = { 5401 .iov_base = buf, 5402 }; 5403 5404 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5405 return -EINVAL; 5406 } 5407 5408 if (md_buf && !_is_buf_allocated(&iov)) { 5409 return -EINVAL; 5410 } 5411 5412 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5413 cb, cb_arg); 5414 } 5415 5416 static int 5417 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5418 struct iovec *iov, int iovcnt, void *md_buf, 5419 uint64_t offset_blocks, uint64_t num_blocks, 5420 struct spdk_memory_domain *domain, void *domain_ctx, 5421 struct spdk_accel_sequence *seq, 5422 spdk_bdev_io_completion_cb cb, void *cb_arg) 5423 { 5424 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5425 struct spdk_bdev_io *bdev_io; 5426 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5427 5428 if (!desc->write) { 5429 return -EBADF; 5430 } 5431 5432 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5433 return -EINVAL; 5434 } 5435 5436 bdev_io = bdev_channel_get_io(channel); 5437 if (!bdev_io) { 5438 return -ENOMEM; 5439 } 5440 5441 bdev_io->internal.ch = channel; 5442 bdev_io->internal.desc = desc; 5443 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5444 bdev_io->u.bdev.iovs = iov; 5445 bdev_io->u.bdev.iovcnt = iovcnt; 5446 bdev_io->u.bdev.md_buf = md_buf; 5447 bdev_io->u.bdev.num_blocks = num_blocks; 5448 bdev_io->u.bdev.offset_blocks = offset_blocks; 5449 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5450 bdev_io->internal.memory_domain = domain; 5451 bdev_io->internal.memory_domain_ctx = domain_ctx; 5452 bdev_io->internal.accel_sequence = seq; 5453 bdev_io->internal.has_accel_sequence = seq != NULL; 5454 bdev_io->u.bdev.memory_domain = domain; 5455 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5456 bdev_io->u.bdev.accel_sequence = seq; 5457 5458 _bdev_io_submit_ext(desc, bdev_io); 5459 5460 return 0; 5461 } 5462 5463 int 5464 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5465 struct iovec *iov, int iovcnt, 5466 uint64_t offset, uint64_t len, 5467 spdk_bdev_io_completion_cb cb, void *cb_arg) 5468 { 5469 uint64_t offset_blocks, num_blocks; 5470 5471 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5472 len, &num_blocks) != 0) { 5473 return -EINVAL; 5474 } 5475 5476 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5477 } 5478 5479 int 5480 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5481 struct iovec *iov, int iovcnt, 5482 uint64_t offset_blocks, uint64_t num_blocks, 5483 spdk_bdev_io_completion_cb cb, void *cb_arg) 5484 { 
5485 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5486 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5487 } 5488 5489 int 5490 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5491 struct iovec *iov, int iovcnt, void *md_buf, 5492 uint64_t offset_blocks, uint64_t num_blocks, 5493 spdk_bdev_io_completion_cb cb, void *cb_arg) 5494 { 5495 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5496 return -EINVAL; 5497 } 5498 5499 if (md_buf && !_is_buf_allocated(iov)) { 5500 return -EINVAL; 5501 } 5502 5503 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5504 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5505 } 5506 5507 int 5508 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5509 struct iovec *iov, int iovcnt, 5510 uint64_t offset_blocks, uint64_t num_blocks, 5511 spdk_bdev_io_completion_cb cb, void *cb_arg, 5512 struct spdk_bdev_ext_io_opts *opts) 5513 { 5514 void *md = NULL; 5515 5516 if (opts) { 5517 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5518 return -EINVAL; 5519 } 5520 md = opts->metadata; 5521 } 5522 5523 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5524 return -EINVAL; 5525 } 5526 5527 if (md && !_is_buf_allocated(iov)) { 5528 return -EINVAL; 5529 } 5530 5531 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5532 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5533 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5534 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5535 cb, cb_arg); 5536 } 5537 5538 static void 5539 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5540 { 5541 struct spdk_bdev_io *parent_io = cb_arg; 5542 struct spdk_bdev *bdev = parent_io->bdev; 5543 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5544 int i, rc = 0; 5545 5546 if (!success) { 5547 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5548 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5549 spdk_bdev_free_io(bdev_io); 5550 return; 5551 } 5552 5553 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5554 rc = memcmp(read_buf, 5555 parent_io->u.bdev.iovs[i].iov_base, 5556 parent_io->u.bdev.iovs[i].iov_len); 5557 if (rc) { 5558 break; 5559 } 5560 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5561 } 5562 5563 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5564 rc = memcmp(bdev_io->u.bdev.md_buf, 5565 parent_io->u.bdev.md_buf, 5566 spdk_bdev_get_md_size(bdev)); 5567 } 5568 5569 spdk_bdev_free_io(bdev_io); 5570 5571 if (rc == 0) { 5572 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5573 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5574 } else { 5575 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5576 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5577 } 5578 } 5579 5580 static void 5581 bdev_compare_do_read(void *_bdev_io) 5582 { 5583 struct spdk_bdev_io *bdev_io = _bdev_io; 5584 int rc; 5585 5586 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5587 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5588 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5589 bdev_compare_do_read_done, bdev_io); 5590 5591 if (rc == -ENOMEM) { 5592 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5593 } else if (rc != 0) { 5594 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5595 
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5596 } 5597 } 5598 5599 static int 5600 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5601 struct iovec *iov, int iovcnt, void *md_buf, 5602 uint64_t offset_blocks, uint64_t num_blocks, 5603 spdk_bdev_io_completion_cb cb, void *cb_arg) 5604 { 5605 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5606 struct spdk_bdev_io *bdev_io; 5607 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5608 5609 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5610 return -EINVAL; 5611 } 5612 5613 bdev_io = bdev_channel_get_io(channel); 5614 if (!bdev_io) { 5615 return -ENOMEM; 5616 } 5617 5618 bdev_io->internal.ch = channel; 5619 bdev_io->internal.desc = desc; 5620 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5621 bdev_io->u.bdev.iovs = iov; 5622 bdev_io->u.bdev.iovcnt = iovcnt; 5623 bdev_io->u.bdev.md_buf = md_buf; 5624 bdev_io->u.bdev.num_blocks = num_blocks; 5625 bdev_io->u.bdev.offset_blocks = offset_blocks; 5626 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5627 bdev_io->u.bdev.memory_domain = NULL; 5628 bdev_io->u.bdev.memory_domain_ctx = NULL; 5629 bdev_io->u.bdev.accel_sequence = NULL; 5630 5631 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5632 bdev_io_submit(bdev_io); 5633 return 0; 5634 } 5635 5636 bdev_compare_do_read(bdev_io); 5637 5638 return 0; 5639 } 5640 5641 int 5642 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5643 struct iovec *iov, int iovcnt, 5644 uint64_t offset_blocks, uint64_t num_blocks, 5645 spdk_bdev_io_completion_cb cb, void *cb_arg) 5646 { 5647 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5648 num_blocks, cb, cb_arg); 5649 } 5650 5651 int 5652 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5653 struct iovec *iov, int iovcnt, void *md_buf, 5654 uint64_t offset_blocks, uint64_t num_blocks, 5655 spdk_bdev_io_completion_cb cb, void *cb_arg) 5656 { 5657 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5658 return -EINVAL; 5659 } 5660 5661 if (md_buf && !_is_buf_allocated(iov)) { 5662 return -EINVAL; 5663 } 5664 5665 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5666 num_blocks, cb, cb_arg); 5667 } 5668 5669 static int 5670 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5671 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5672 spdk_bdev_io_completion_cb cb, void *cb_arg) 5673 { 5674 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5675 struct spdk_bdev_io *bdev_io; 5676 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5677 5678 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5679 return -EINVAL; 5680 } 5681 5682 bdev_io = bdev_channel_get_io(channel); 5683 if (!bdev_io) { 5684 return -ENOMEM; 5685 } 5686 5687 bdev_io->internal.ch = channel; 5688 bdev_io->internal.desc = desc; 5689 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5690 bdev_io->u.bdev.iovs = &bdev_io->iov; 5691 bdev_io->u.bdev.iovs[0].iov_base = buf; 5692 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5693 bdev_io->u.bdev.iovcnt = 1; 5694 bdev_io->u.bdev.md_buf = md_buf; 5695 bdev_io->u.bdev.num_blocks = num_blocks; 5696 bdev_io->u.bdev.offset_blocks = offset_blocks; 5697 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5698 bdev_io->u.bdev.memory_domain = NULL; 5699 bdev_io->u.bdev.memory_domain_ctx = 
NULL; 5700 bdev_io->u.bdev.accel_sequence = NULL; 5701 5702 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5703 bdev_io_submit(bdev_io); 5704 return 0; 5705 } 5706 5707 bdev_compare_do_read(bdev_io); 5708 5709 return 0; 5710 } 5711 5712 int 5713 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5714 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5715 spdk_bdev_io_completion_cb cb, void *cb_arg) 5716 { 5717 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5718 cb, cb_arg); 5719 } 5720 5721 int 5722 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5723 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5724 spdk_bdev_io_completion_cb cb, void *cb_arg) 5725 { 5726 struct iovec iov = { 5727 .iov_base = buf, 5728 }; 5729 5730 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5731 return -EINVAL; 5732 } 5733 5734 if (md_buf && !_is_buf_allocated(&iov)) { 5735 return -EINVAL; 5736 } 5737 5738 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5739 cb, cb_arg); 5740 } 5741 5742 static void 5743 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 5744 { 5745 struct spdk_bdev_io *bdev_io = ctx; 5746 5747 if (unlock_status) { 5748 SPDK_ERRLOG("LBA range unlock failed\n"); 5749 } 5750 5751 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5752 false, bdev_io->internal.caller_ctx); 5753 } 5754 5755 static void 5756 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5757 { 5758 bdev_io->internal.status = status; 5759 5760 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5761 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5762 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5763 } 5764 5765 static void 5766 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5767 { 5768 struct spdk_bdev_io *parent_io = cb_arg; 5769 5770 if (!success) { 5771 SPDK_ERRLOG("Compare and write operation failed\n"); 5772 } 5773 5774 spdk_bdev_free_io(bdev_io); 5775 5776 bdev_comparev_and_writev_blocks_unlock(parent_io, 5777 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5778 } 5779 5780 static void 5781 bdev_compare_and_write_do_write(void *_bdev_io) 5782 { 5783 struct spdk_bdev_io *bdev_io = _bdev_io; 5784 int rc; 5785 5786 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5787 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5788 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5789 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5790 bdev_compare_and_write_do_write_done, bdev_io); 5791 5792 5793 if (rc == -ENOMEM) { 5794 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5795 } else if (rc != 0) { 5796 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5797 } 5798 } 5799 5800 static void 5801 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5802 { 5803 struct spdk_bdev_io *parent_io = cb_arg; 5804 5805 spdk_bdev_free_io(bdev_io); 5806 5807 if (!success) { 5808 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5809 return; 5810 } 5811 5812 bdev_compare_and_write_do_write(parent_io); 5813 } 5814 5815 static void 5816 bdev_compare_and_write_do_compare(void *_bdev_io) 5817 { 5818 struct spdk_bdev_io *bdev_io = _bdev_io; 5819 int rc; 5820 5821 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5822 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5823 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5824 bdev_compare_and_write_do_compare_done, bdev_io); 5825 5826 if (rc == -ENOMEM) { 5827 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5828 } else if (rc != 0) { 5829 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5830 } 5831 } 5832 5833 static void 5834 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 5835 { 5836 struct spdk_bdev_io *bdev_io = ctx; 5837 5838 if (status) { 5839 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5840 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5841 return; 5842 } 5843 5844 bdev_compare_and_write_do_compare(bdev_io); 5845 } 5846 5847 int 5848 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5849 struct iovec *compare_iov, int compare_iovcnt, 5850 struct iovec *write_iov, int write_iovcnt, 5851 uint64_t offset_blocks, uint64_t num_blocks, 5852 spdk_bdev_io_completion_cb cb, void *cb_arg) 5853 { 5854 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5855 struct spdk_bdev_io *bdev_io; 5856 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5857 5858 if (!desc->write) { 5859 return -EBADF; 5860 } 5861 5862 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5863 return -EINVAL; 5864 } 5865 5866 if (num_blocks > bdev->acwu) { 5867 return -EINVAL; 5868 } 5869 5870 bdev_io = bdev_channel_get_io(channel); 5871 if (!bdev_io) { 5872 return -ENOMEM; 5873 } 5874 5875 bdev_io->internal.ch = channel; 5876 bdev_io->internal.desc = desc; 5877 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5878 bdev_io->u.bdev.iovs = compare_iov; 5879 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5880 bdev_io->u.bdev.fused_iovs = write_iov; 5881 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5882 bdev_io->u.bdev.md_buf = NULL; 5883 bdev_io->u.bdev.num_blocks = num_blocks; 5884 bdev_io->u.bdev.offset_blocks = offset_blocks; 5885 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5886 
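	/* Compare-and-write does not take memory domains or accel sequences through this
	 * API, so the ext-I/O fields are cleared below.  If the module supports the fused
	 * COMPARE_AND_WRITE natively, the request is submitted as-is; otherwise the LBA
	 * range is locked and the operation is emulated as a compare followed by a write
	 * (see bdev_comparev_and_writev_blocks_locked()). */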
bdev_io->u.bdev.memory_domain = NULL; 5887 bdev_io->u.bdev.memory_domain_ctx = NULL; 5888 bdev_io->u.bdev.accel_sequence = NULL; 5889 5890 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5891 bdev_io_submit(bdev_io); 5892 return 0; 5893 } 5894 5895 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5896 bdev_comparev_and_writev_blocks_locked, bdev_io); 5897 } 5898 5899 int 5900 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5901 struct iovec *iov, int iovcnt, 5902 uint64_t offset_blocks, uint64_t num_blocks, 5903 bool populate, 5904 spdk_bdev_io_completion_cb cb, void *cb_arg) 5905 { 5906 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5907 struct spdk_bdev_io *bdev_io; 5908 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5909 5910 if (!desc->write) { 5911 return -EBADF; 5912 } 5913 5914 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5915 return -EINVAL; 5916 } 5917 5918 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5919 return -ENOTSUP; 5920 } 5921 5922 bdev_io = bdev_channel_get_io(channel); 5923 if (!bdev_io) { 5924 return -ENOMEM; 5925 } 5926 5927 bdev_io->internal.ch = channel; 5928 bdev_io->internal.desc = desc; 5929 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5930 bdev_io->u.bdev.num_blocks = num_blocks; 5931 bdev_io->u.bdev.offset_blocks = offset_blocks; 5932 bdev_io->u.bdev.iovs = iov; 5933 bdev_io->u.bdev.iovcnt = iovcnt; 5934 bdev_io->u.bdev.md_buf = NULL; 5935 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5936 bdev_io->u.bdev.zcopy.commit = 0; 5937 bdev_io->u.bdev.zcopy.start = 1; 5938 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5939 bdev_io->u.bdev.memory_domain = NULL; 5940 bdev_io->u.bdev.memory_domain_ctx = NULL; 5941 bdev_io->u.bdev.accel_sequence = NULL; 5942 5943 bdev_io_submit(bdev_io); 5944 5945 return 0; 5946 } 5947 5948 int 5949 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5950 spdk_bdev_io_completion_cb cb, void *cb_arg) 5951 { 5952 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5953 return -EINVAL; 5954 } 5955 5956 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0;
	bdev_io->u.bdev.zcopy.start = 0;
	bdev_io->internal.caller_ctx = cb_arg;
	bdev_io->internal.cb = cb;
	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;

	bdev_io_submit(bdev_io);

	return 0;
}

int
spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset, uint64_t len,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
				 len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      uint64_t offset_blocks, uint64_t num_blocks,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
	    !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
		return -ENOTSUP;
	}

	bdev_io = bdev_channel_get_io(channel);

	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);
	bdev_io->u.bdev.memory_domain = NULL;
	bdev_io->u.bdev.memory_domain_ctx = NULL;
	bdev_io->u.bdev.accel_sequence = NULL;

	/* If the write_zeroes size is large and should be split, use the generic split
	 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not.
	 *
	 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
	 * or emulate it using a regular write request otherwise.
6025 */ 6026 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6027 bdev_io->internal.split) { 6028 bdev_io_submit(bdev_io); 6029 return 0; 6030 } 6031 6032 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6033 6034 return bdev_write_zero_buffer(bdev_io); 6035 } 6036 6037 int 6038 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6039 uint64_t offset, uint64_t nbytes, 6040 spdk_bdev_io_completion_cb cb, void *cb_arg) 6041 { 6042 uint64_t offset_blocks, num_blocks; 6043 6044 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6045 nbytes, &num_blocks) != 0) { 6046 return -EINVAL; 6047 } 6048 6049 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6050 } 6051 6052 int 6053 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6054 uint64_t offset_blocks, uint64_t num_blocks, 6055 spdk_bdev_io_completion_cb cb, void *cb_arg) 6056 { 6057 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6058 struct spdk_bdev_io *bdev_io; 6059 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6060 6061 if (!desc->write) { 6062 return -EBADF; 6063 } 6064 6065 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6066 return -EINVAL; 6067 } 6068 6069 if (num_blocks == 0) { 6070 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 6071 return -EINVAL; 6072 } 6073 6074 bdev_io = bdev_channel_get_io(channel); 6075 if (!bdev_io) { 6076 return -ENOMEM; 6077 } 6078 6079 bdev_io->internal.ch = channel; 6080 bdev_io->internal.desc = desc; 6081 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6082 6083 bdev_io->u.bdev.iovs = &bdev_io->iov; 6084 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6085 bdev_io->u.bdev.iovs[0].iov_len = 0; 6086 bdev_io->u.bdev.iovcnt = 1; 6087 6088 bdev_io->u.bdev.offset_blocks = offset_blocks; 6089 bdev_io->u.bdev.num_blocks = num_blocks; 6090 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6091 bdev_io->u.bdev.memory_domain = NULL; 6092 bdev_io->u.bdev.memory_domain_ctx = NULL; 6093 bdev_io->u.bdev.accel_sequence = NULL; 6094 6095 bdev_io_submit(bdev_io); 6096 return 0; 6097 } 6098 6099 int 6100 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6101 uint64_t offset, uint64_t length, 6102 spdk_bdev_io_completion_cb cb, void *cb_arg) 6103 { 6104 uint64_t offset_blocks, num_blocks; 6105 6106 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6107 length, &num_blocks) != 0) { 6108 return -EINVAL; 6109 } 6110 6111 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6112 } 6113 6114 int 6115 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6116 uint64_t offset_blocks, uint64_t num_blocks, 6117 spdk_bdev_io_completion_cb cb, void *cb_arg) 6118 { 6119 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6120 struct spdk_bdev_io *bdev_io; 6121 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6122 6123 if (!desc->write) { 6124 return -EBADF; 6125 } 6126 6127 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6128 return -EINVAL; 6129 } 6130 6131 bdev_io = bdev_channel_get_io(channel); 6132 if (!bdev_io) { 6133 return -ENOMEM; 6134 } 6135 6136 bdev_io->internal.ch = channel; 6137 bdev_io->internal.desc = desc; 6138 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6139 bdev_io->u.bdev.iovs = NULL; 6140 bdev_io->u.bdev.iovcnt = 0; 6141 bdev_io->u.bdev.offset_blocks = offset_blocks; 6142 bdev_io->u.bdev.num_blocks = num_blocks; 6143 
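	/* A flush carries no data buffers (the iovs were cleared above); it only identifies
	 * the block range that must be made durable.  Illustrative call flushing a whole
	 * bdev, with hypothetical names (my_desc, my_ch, flush_done_cb, my_ctx):
	 *
	 *	rc = spdk_bdev_flush_blocks(my_desc, my_ch, 0,
	 *				    spdk_bdev_get_num_blocks(spdk_bdev_desc_get_bdev(my_desc)),
	 *				    flush_done_cb, my_ctx);
	 */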
bdev_io->u.bdev.memory_domain = NULL;
	bdev_io->u.bdev.memory_domain_ctx = NULL;
	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	bdev_io_submit(bdev_io);
	return 0;
}

static int bdev_reset_poll_for_outstanding_io(void *ctx);

static void
bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct spdk_bdev_channel *ch = _ctx;
	struct spdk_bdev_io *bdev_io;

	bdev_io = TAILQ_FIRST(&ch->queued_resets);

	if (status == -EBUSY) {
		if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
			bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
					ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
		} else {
			TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);

			if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) {
				/* If outstanding IOs are still present and reset_io_drain_timeout
				 * seconds passed, start the reset. */
				bdev_io_submit_reset(bdev_io);
			} else {
				/* We still have in-progress memory domain pull/push or we're
				 * executing an accel sequence. Since we cannot abort either of
				 * those operations, fail the reset request. */
				spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
			}
		}
	} else {
		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
		SPDK_DEBUGLOG(bdev,
			      "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
			      ch->bdev->name);
		/* Mark the completion status as a SUCCESS and complete the reset. */
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	}
}

static void
bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *io_ch, void *_ctx)
{
	struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch);
	int status = 0;

	if (cur_ch->io_outstanding > 0 ||
	    !TAILQ_EMPTY(&cur_ch->io_memory_domain) ||
	    !TAILQ_EMPTY(&cur_ch->io_accel_exec)) {
		/* If a channel has outstanding IO, set status to -EBUSY. This will stop
		 * further iteration over the rest of the channels and pass a non-zero status
		 * to the callback function.
*/ 6203 status = -EBUSY; 6204 } 6205 spdk_bdev_for_each_channel_continue(i, status); 6206 } 6207 6208 static int 6209 bdev_reset_poll_for_outstanding_io(void *ctx) 6210 { 6211 struct spdk_bdev_channel *ch = ctx; 6212 struct spdk_bdev_io *bdev_io; 6213 6214 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6215 6216 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6217 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6218 bdev_reset_check_outstanding_io_done); 6219 6220 return SPDK_POLLER_BUSY; 6221 } 6222 6223 static void 6224 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6225 { 6226 struct spdk_bdev_channel *ch = _ctx; 6227 struct spdk_bdev_io *bdev_io; 6228 6229 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6230 6231 if (bdev->reset_io_drain_timeout == 0) { 6232 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6233 6234 bdev_io_submit_reset(bdev_io); 6235 return; 6236 } 6237 6238 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6239 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6240 6241 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6242 * submit the reset to the underlying module only if outstanding I/O 6243 * remain after reset_io_drain_timeout seconds have passed. */ 6244 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6245 bdev_reset_check_outstanding_io_done); 6246 } 6247 6248 static void 6249 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6250 struct spdk_io_channel *ch, void *_ctx) 6251 { 6252 struct spdk_bdev_channel *channel; 6253 struct spdk_bdev_mgmt_channel *mgmt_channel; 6254 struct spdk_bdev_shared_resource *shared_resource; 6255 bdev_io_tailq_t tmp_queued; 6256 6257 TAILQ_INIT(&tmp_queued); 6258 6259 channel = __io_ch_to_bdev_ch(ch); 6260 shared_resource = channel->shared_resource; 6261 mgmt_channel = shared_resource->mgmt_ch; 6262 6263 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6264 6265 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6266 /* The QoS object is always valid and readable while 6267 * the channel flag is set, so the lock here should not 6268 * be necessary. We're not in the fast path though, so 6269 * just take it anyway. */ 6270 spdk_spin_lock(&channel->bdev->internal.spinlock); 6271 if (channel->bdev->internal.qos->ch == channel) { 6272 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 6273 } 6274 spdk_spin_unlock(&channel->bdev->internal.spinlock); 6275 } 6276 6277 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6278 bdev_abort_all_buf_io(mgmt_channel, channel); 6279 bdev_abort_all_queued_io(&tmp_queued, channel); 6280 6281 spdk_bdev_for_each_channel_continue(i, 0); 6282 } 6283 6284 static void 6285 bdev_start_reset(void *ctx) 6286 { 6287 struct spdk_bdev_channel *ch = ctx; 6288 6289 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6290 bdev_reset_freeze_channel_done); 6291 } 6292 6293 static void 6294 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6295 { 6296 struct spdk_bdev *bdev = ch->bdev; 6297 6298 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6299 6300 spdk_spin_lock(&bdev->internal.spinlock); 6301 if (bdev->internal.reset_in_progress == NULL) { 6302 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6303 /* 6304 * Take a channel reference for the target bdev for the life of this 6305 * reset. 
This guards against the channel getting destroyed while 6306 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6307 * progress. We will release the reference when this reset is 6308 * completed. 6309 */ 6310 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6311 bdev_start_reset(ch); 6312 } 6313 spdk_spin_unlock(&bdev->internal.spinlock); 6314 } 6315 6316 int 6317 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6318 spdk_bdev_io_completion_cb cb, void *cb_arg) 6319 { 6320 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6321 struct spdk_bdev_io *bdev_io; 6322 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6323 6324 bdev_io = bdev_channel_get_io(channel); 6325 if (!bdev_io) { 6326 return -ENOMEM; 6327 } 6328 6329 bdev_io->internal.ch = channel; 6330 bdev_io->internal.desc = desc; 6331 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6332 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6333 bdev_io->u.reset.ch_ref = NULL; 6334 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6335 6336 spdk_spin_lock(&bdev->internal.spinlock); 6337 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6338 spdk_spin_unlock(&bdev->internal.spinlock); 6339 6340 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6341 internal.ch_link); 6342 6343 bdev_channel_start_reset(channel); 6344 6345 return 0; 6346 } 6347 6348 void 6349 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6350 struct spdk_bdev_io_stat *stat) 6351 { 6352 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6353 6354 bdev_get_io_stat(stat, channel->stat); 6355 } 6356 6357 static void 6358 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6359 { 6360 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6361 6362 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6363 bdev_iostat_ctx->cb_arg, 0); 6364 free(bdev_iostat_ctx); 6365 } 6366 6367 static void 6368 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6369 struct spdk_io_channel *ch, void *_ctx) 6370 { 6371 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6372 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6373 6374 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6375 spdk_bdev_for_each_channel_continue(i, 0); 6376 } 6377 6378 void 6379 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6380 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6381 { 6382 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6383 6384 assert(bdev != NULL); 6385 assert(stat != NULL); 6386 assert(cb != NULL); 6387 6388 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6389 if (bdev_iostat_ctx == NULL) { 6390 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6391 cb(bdev, stat, cb_arg, -ENOMEM); 6392 return; 6393 } 6394 6395 bdev_iostat_ctx->stat = stat; 6396 bdev_iostat_ctx->cb = cb; 6397 bdev_iostat_ctx->cb_arg = cb_arg; 6398 6399 /* Start with the statistics from previously deleted channels. */ 6400 spdk_spin_lock(&bdev->internal.spinlock); 6401 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6402 spdk_spin_unlock(&bdev->internal.spinlock); 6403 6404 /* Then iterate and add the statistics from each existing channel. 
*/ 6405 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6406 bdev_get_device_stat_done); 6407 } 6408 6409 struct bdev_iostat_reset_ctx { 6410 enum spdk_bdev_reset_stat_mode mode; 6411 bdev_reset_device_stat_cb cb; 6412 void *cb_arg; 6413 }; 6414 6415 static void 6416 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6417 { 6418 struct bdev_iostat_reset_ctx *ctx = _ctx; 6419 6420 ctx->cb(bdev, ctx->cb_arg, 0); 6421 6422 free(ctx); 6423 } 6424 6425 static void 6426 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6427 struct spdk_io_channel *ch, void *_ctx) 6428 { 6429 struct bdev_iostat_reset_ctx *ctx = _ctx; 6430 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6431 6432 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6433 6434 spdk_bdev_for_each_channel_continue(i, 0); 6435 } 6436 6437 void 6438 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6439 bdev_reset_device_stat_cb cb, void *cb_arg) 6440 { 6441 struct bdev_iostat_reset_ctx *ctx; 6442 6443 assert(bdev != NULL); 6444 assert(cb != NULL); 6445 6446 ctx = calloc(1, sizeof(*ctx)); 6447 if (ctx == NULL) { 6448 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6449 cb(bdev, cb_arg, -ENOMEM); 6450 return; 6451 } 6452 6453 ctx->mode = mode; 6454 ctx->cb = cb; 6455 ctx->cb_arg = cb_arg; 6456 6457 spdk_spin_lock(&bdev->internal.spinlock); 6458 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6459 spdk_spin_unlock(&bdev->internal.spinlock); 6460 6461 spdk_bdev_for_each_channel(bdev, 6462 bdev_reset_each_channel_stat, 6463 ctx, 6464 bdev_reset_device_stat_done); 6465 } 6466 6467 int 6468 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6469 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6470 spdk_bdev_io_completion_cb cb, void *cb_arg) 6471 { 6472 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6473 struct spdk_bdev_io *bdev_io; 6474 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6475 6476 if (!desc->write) { 6477 return -EBADF; 6478 } 6479 6480 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6481 return -ENOTSUP; 6482 } 6483 6484 bdev_io = bdev_channel_get_io(channel); 6485 if (!bdev_io) { 6486 return -ENOMEM; 6487 } 6488 6489 bdev_io->internal.ch = channel; 6490 bdev_io->internal.desc = desc; 6491 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6492 bdev_io->u.nvme_passthru.cmd = *cmd; 6493 bdev_io->u.nvme_passthru.buf = buf; 6494 bdev_io->u.nvme_passthru.nbytes = nbytes; 6495 bdev_io->u.nvme_passthru.md_buf = NULL; 6496 bdev_io->u.nvme_passthru.md_len = 0; 6497 6498 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6499 6500 bdev_io_submit(bdev_io); 6501 return 0; 6502 } 6503 6504 int 6505 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6506 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6507 spdk_bdev_io_completion_cb cb, void *cb_arg) 6508 { 6509 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6510 struct spdk_bdev_io *bdev_io; 6511 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6512 6513 if (!desc->write) { 6514 /* 6515 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6516 * to easily determine if the command is a read or write, but for now just 6517 * do not allow io_passthru with a read-only descriptor. 
6518 */ 6519 return -EBADF; 6520 } 6521 6522 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6523 return -ENOTSUP; 6524 } 6525 6526 bdev_io = bdev_channel_get_io(channel); 6527 if (!bdev_io) { 6528 return -ENOMEM; 6529 } 6530 6531 bdev_io->internal.ch = channel; 6532 bdev_io->internal.desc = desc; 6533 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6534 bdev_io->u.nvme_passthru.cmd = *cmd; 6535 bdev_io->u.nvme_passthru.buf = buf; 6536 bdev_io->u.nvme_passthru.nbytes = nbytes; 6537 bdev_io->u.nvme_passthru.md_buf = NULL; 6538 bdev_io->u.nvme_passthru.md_len = 0; 6539 6540 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6541 6542 bdev_io_submit(bdev_io); 6543 return 0; 6544 } 6545 6546 int 6547 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6548 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6549 spdk_bdev_io_completion_cb cb, void *cb_arg) 6550 { 6551 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6552 struct spdk_bdev_io *bdev_io; 6553 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6554 6555 if (!desc->write) { 6556 /* 6557 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6558 * to easily determine if the command is a read or write, but for now just 6559 * do not allow io_passthru with a read-only descriptor. 6560 */ 6561 return -EBADF; 6562 } 6563 6564 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6565 return -ENOTSUP; 6566 } 6567 6568 bdev_io = bdev_channel_get_io(channel); 6569 if (!bdev_io) { 6570 return -ENOMEM; 6571 } 6572 6573 bdev_io->internal.ch = channel; 6574 bdev_io->internal.desc = desc; 6575 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6576 bdev_io->u.nvme_passthru.cmd = *cmd; 6577 bdev_io->u.nvme_passthru.buf = buf; 6578 bdev_io->u.nvme_passthru.nbytes = nbytes; 6579 bdev_io->u.nvme_passthru.md_buf = md_buf; 6580 bdev_io->u.nvme_passthru.md_len = md_len; 6581 6582 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6583 6584 bdev_io_submit(bdev_io); 6585 return 0; 6586 } 6587 6588 static void bdev_abort_retry(void *ctx); 6589 static void bdev_abort(struct spdk_bdev_io *parent_io); 6590 6591 static void 6592 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6593 { 6594 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6595 struct spdk_bdev_io *parent_io = cb_arg; 6596 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6597 6598 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6599 6600 spdk_bdev_free_io(bdev_io); 6601 6602 if (!success) { 6603 /* Check if the target I/O completed in the meantime. */ 6604 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6605 if (tmp_io == bio_to_abort) { 6606 break; 6607 } 6608 } 6609 6610 /* If the target I/O still exists, set the parent to failed. 
*/ 6611 if (tmp_io != NULL) { 6612 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6613 } 6614 } 6615 6616 parent_io->u.bdev.split_outstanding--; 6617 if (parent_io->u.bdev.split_outstanding == 0) { 6618 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6619 bdev_abort_retry(parent_io); 6620 } else { 6621 bdev_io_complete(parent_io); 6622 } 6623 } 6624 } 6625 6626 static int 6627 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6628 struct spdk_bdev_io *bio_to_abort, 6629 spdk_bdev_io_completion_cb cb, void *cb_arg) 6630 { 6631 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6632 struct spdk_bdev_io *bdev_io; 6633 6634 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6635 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6636 /* TODO: Abort reset or abort request. */ 6637 return -ENOTSUP; 6638 } 6639 6640 bdev_io = bdev_channel_get_io(channel); 6641 if (bdev_io == NULL) { 6642 return -ENOMEM; 6643 } 6644 6645 bdev_io->internal.ch = channel; 6646 bdev_io->internal.desc = desc; 6647 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6648 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6649 6650 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6651 assert(bdev_io_should_split(bio_to_abort)); 6652 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6653 6654 /* Parent abort request is not submitted directly, but to manage its 6655 * execution add it to the submitted list here. 6656 */ 6657 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6658 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6659 6660 bdev_abort(bdev_io); 6661 6662 return 0; 6663 } 6664 6665 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6666 6667 /* Submit the abort request to the underlying bdev module. */ 6668 bdev_io_submit(bdev_io); 6669 6670 return 0; 6671 } 6672 6673 static bool 6674 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6675 { 6676 struct spdk_bdev_io *iter; 6677 6678 TAILQ_FOREACH(iter, tailq, internal.link) { 6679 if (iter == bdev_io) { 6680 return true; 6681 } 6682 } 6683 6684 return false; 6685 } 6686 6687 static uint32_t 6688 _bdev_abort(struct spdk_bdev_io *parent_io) 6689 { 6690 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6691 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6692 void *bio_cb_arg; 6693 struct spdk_bdev_io *bio_to_abort; 6694 uint32_t matched_ios; 6695 int rc; 6696 6697 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6698 6699 /* matched_ios is returned and will be kept by the caller. 6700 * 6701 * This function will be used for two cases, 1) the same cb_arg is used for 6702 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6703 * Incrementing split_outstanding directly here may confuse readers especially 6704 * for the 1st case. 6705 * 6706 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6707 * works as expected. 6708 */ 6709 matched_ios = 0; 6710 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6711 6712 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6713 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6714 continue; 6715 } 6716 6717 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6718 /* Any I/O which was submitted after this abort command should be excluded. 
 */
			continue;
		}

		/* We can't abort a request that's being pushed/pulled or executed by accel */
		if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) ||
		    bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) {
			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
			break;
		}

		rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
		if (rc != 0) {
			if (rc == -ENOMEM) {
				parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
			} else {
				parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
			}
			break;
		}
		matched_ios++;
	}

	return matched_ios;
}

static void
bdev_abort_retry(void *ctx)
{
	struct spdk_bdev_io *parent_io = ctx;
	uint32_t matched_ios;

	matched_ios = _bdev_abort(parent_io);

	if (matched_ios == 0) {
		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
		} else {
			/* For retry, the case where no target I/O was found is a success
			 * because it means the target I/Os completed in the meantime.
			 */
			bdev_io_complete(parent_io);
		}
		return;
	}

	/* Use split_outstanding to manage the progress of aborting I/Os. */
	parent_io->u.bdev.split_outstanding = matched_ios;
}

static void
bdev_abort(struct spdk_bdev_io *parent_io)
{
	uint32_t matched_ios;

	matched_ios = _bdev_abort(parent_io);

	if (matched_ios == 0) {
		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
		} else {
			/* The case where no target I/O was found is a failure. */
			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
			bdev_io_complete(parent_io);
		}
		return;
	}

	/* Use split_outstanding to manage the progress of aborting I/Os. */
	parent_io->u.bdev.split_outstanding = matched_ios;
}

int
spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		void *bio_cb_arg,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
	struct spdk_bdev_io *bdev_io;

	if (bio_cb_arg == NULL) {
		return -EINVAL;
	}

	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
		return -ENOTSUP;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (bdev_io == NULL) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->internal.submit_tsc = spdk_get_ticks();
	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;

	/* Parent abort request is not submitted directly, but to manage its execution,
	 * add it to the submitted list here.
6822 */ 6823 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6824 6825 bdev_abort(bdev_io); 6826 6827 return 0; 6828 } 6829 6830 int 6831 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6832 struct spdk_bdev_io_wait_entry *entry) 6833 { 6834 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6835 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6836 6837 if (bdev != entry->bdev) { 6838 SPDK_ERRLOG("bdevs do not match\n"); 6839 return -EINVAL; 6840 } 6841 6842 if (mgmt_ch->per_thread_cache_count > 0) { 6843 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6844 return -EINVAL; 6845 } 6846 6847 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6848 return 0; 6849 } 6850 6851 static inline void 6852 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6853 { 6854 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6855 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6856 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6857 uint32_t blocklen = bdev_io->bdev->blocklen; 6858 6859 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6860 switch (bdev_io->type) { 6861 case SPDK_BDEV_IO_TYPE_READ: 6862 io_stat->bytes_read += num_blocks * blocklen; 6863 io_stat->num_read_ops++; 6864 io_stat->read_latency_ticks += tsc_diff; 6865 if (io_stat->max_read_latency_ticks < tsc_diff) { 6866 io_stat->max_read_latency_ticks = tsc_diff; 6867 } 6868 if (io_stat->min_read_latency_ticks > tsc_diff) { 6869 io_stat->min_read_latency_ticks = tsc_diff; 6870 } 6871 break; 6872 case SPDK_BDEV_IO_TYPE_WRITE: 6873 io_stat->bytes_written += num_blocks * blocklen; 6874 io_stat->num_write_ops++; 6875 io_stat->write_latency_ticks += tsc_diff; 6876 if (io_stat->max_write_latency_ticks < tsc_diff) { 6877 io_stat->max_write_latency_ticks = tsc_diff; 6878 } 6879 if (io_stat->min_write_latency_ticks > tsc_diff) { 6880 io_stat->min_write_latency_ticks = tsc_diff; 6881 } 6882 break; 6883 case SPDK_BDEV_IO_TYPE_UNMAP: 6884 io_stat->bytes_unmapped += num_blocks * blocklen; 6885 io_stat->num_unmap_ops++; 6886 io_stat->unmap_latency_ticks += tsc_diff; 6887 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6888 io_stat->max_unmap_latency_ticks = tsc_diff; 6889 } 6890 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6891 io_stat->min_unmap_latency_ticks = tsc_diff; 6892 } 6893 break; 6894 case SPDK_BDEV_IO_TYPE_ZCOPY: 6895 /* Track the data in the start phase only */ 6896 if (bdev_io->u.bdev.zcopy.start) { 6897 if (bdev_io->u.bdev.zcopy.populate) { 6898 io_stat->bytes_read += num_blocks * blocklen; 6899 io_stat->num_read_ops++; 6900 io_stat->read_latency_ticks += tsc_diff; 6901 if (io_stat->max_read_latency_ticks < tsc_diff) { 6902 io_stat->max_read_latency_ticks = tsc_diff; 6903 } 6904 if (io_stat->min_read_latency_ticks > tsc_diff) { 6905 io_stat->min_read_latency_ticks = tsc_diff; 6906 } 6907 } else { 6908 io_stat->bytes_written += num_blocks * blocklen; 6909 io_stat->num_write_ops++; 6910 io_stat->write_latency_ticks += tsc_diff; 6911 if (io_stat->max_write_latency_ticks < tsc_diff) { 6912 io_stat->max_write_latency_ticks = tsc_diff; 6913 } 6914 if (io_stat->min_write_latency_ticks > tsc_diff) { 6915 io_stat->min_write_latency_ticks = tsc_diff; 6916 } 6917 } 6918 } 6919 break; 6920 case SPDK_BDEV_IO_TYPE_COPY: 6921 io_stat->bytes_copied += num_blocks * blocklen; 6922 io_stat->num_copy_ops++; 6923 bdev_io->internal.ch->stat->copy_latency_ticks += 
tsc_diff; 6924 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6925 io_stat->max_copy_latency_ticks = tsc_diff; 6926 } 6927 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6928 io_stat->min_copy_latency_ticks = tsc_diff; 6929 } 6930 break; 6931 default: 6932 break; 6933 } 6934 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6935 io_stat = bdev_io->bdev->internal.stat; 6936 assert(io_stat->io_error != NULL); 6937 6938 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6939 io_stat->io_error->error_status[-io_status - 1]++; 6940 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6941 } 6942 6943 #ifdef SPDK_CONFIG_VTUNE 6944 uint64_t now_tsc = spdk_get_ticks(); 6945 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6946 uint64_t data[5]; 6947 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6948 6949 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6950 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6951 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6952 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6953 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6954 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6955 6956 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6957 __itt_metadata_u64, 5, data); 6958 6959 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6960 bdev_io->internal.ch->start_tsc = now_tsc; 6961 } 6962 #endif 6963 } 6964 6965 static inline void 6966 _bdev_io_complete(void *ctx) 6967 { 6968 struct spdk_bdev_io *bdev_io = ctx; 6969 6970 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 6971 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 6972 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 6973 } 6974 6975 assert(bdev_io->internal.cb != NULL); 6976 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6977 6978 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6979 bdev_io->internal.caller_ctx); 6980 } 6981 6982 static inline void 6983 bdev_io_complete(void *ctx) 6984 { 6985 struct spdk_bdev_io *bdev_io = ctx; 6986 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6987 uint64_t tsc, tsc_diff; 6988 6989 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6990 /* 6991 * Defer completion to avoid potential infinite recursion if the 6992 * user's completion callback issues a new I/O. 6993 */ 6994 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6995 bdev_io_complete, bdev_io); 6996 return; 6997 } 6998 6999 tsc = spdk_get_ticks(); 7000 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7001 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 7002 bdev_io->internal.caller_ctx); 7003 7004 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 7005 7006 if (bdev_io->internal.ch->histogram) { 7007 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 7008 } 7009 7010 bdev_io_update_io_stat(bdev_io, tsc_diff); 7011 _bdev_io_complete(bdev_io); 7012 } 7013 7014 /* The difference between this function and bdev_io_complete() is that this should be called to 7015 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7016 * io_submitted list and don't have submit_tsc updated. 
7017 */ 7018 static inline void 7019 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7020 { 7021 /* Since the IO hasn't been submitted it's bound to be failed */ 7022 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7023 7024 /* At this point we don't know if the IO is completed from submission context or not, but, 7025 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7026 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7027 _bdev_io_complete, bdev_io); 7028 } 7029 7030 static void bdev_destroy_cb(void *io_device); 7031 7032 static void 7033 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7034 { 7035 struct spdk_bdev_io *bdev_io = _ctx; 7036 7037 if (bdev_io->u.reset.ch_ref != NULL) { 7038 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7039 bdev_io->u.reset.ch_ref = NULL; 7040 } 7041 7042 bdev_io_complete(bdev_io); 7043 7044 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7045 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7046 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7047 } 7048 } 7049 7050 static void 7051 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7052 struct spdk_io_channel *_ch, void *_ctx) 7053 { 7054 struct spdk_bdev_io *bdev_io = _ctx; 7055 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7056 struct spdk_bdev_io *queued_reset; 7057 7058 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7059 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7060 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7061 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7062 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7063 } 7064 7065 spdk_bdev_for_each_channel_continue(i, 0); 7066 } 7067 7068 static void 7069 bdev_io_complete_sequence_cb(void *ctx, int status) 7070 { 7071 struct spdk_bdev_io *bdev_io = ctx; 7072 7073 /* u.bdev.accel_sequence should have already been cleared at this point */ 7074 assert(bdev_io->u.bdev.accel_sequence == NULL); 7075 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7076 bdev_io->internal.accel_sequence = NULL; 7077 7078 if (spdk_unlikely(status != 0)) { 7079 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7080 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7081 } 7082 7083 bdev_io_complete(bdev_io); 7084 } 7085 7086 void 7087 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7088 { 7089 struct spdk_bdev *bdev = bdev_io->bdev; 7090 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7091 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7092 7093 if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) { 7094 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7095 spdk_bdev_get_module_name(bdev), 7096 bdev_io_status_get_string(bdev_io->internal.status)); 7097 assert(false); 7098 } 7099 bdev_io->internal.status = status; 7100 7101 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7102 bool unlock_channels = false; 7103 7104 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7105 SPDK_ERRLOG("NOMEM returned for reset\n"); 7106 } 7107 spdk_spin_lock(&bdev->internal.spinlock); 7108 if (bdev_io == bdev->internal.reset_in_progress) { 7109 bdev->internal.reset_in_progress = NULL; 7110 unlock_channels = true; 7111 } 7112 spdk_spin_unlock(&bdev->internal.spinlock); 7113 7114 if (unlock_channels) { 7115 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 
7116 bdev_reset_complete); 7117 return; 7118 } 7119 } else { 7120 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7121 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7122 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7123 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7124 return; 7125 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0 && 7126 !bdev_io_use_accel_sequence(bdev_io))) { 7127 _bdev_io_push_bounce_data_buffer(bdev_io, 7128 _bdev_io_complete_push_bounce_done); 7129 /* bdev IO will be completed in the callback */ 7130 return; 7131 } 7132 } 7133 7134 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7135 return; 7136 } 7137 } 7138 7139 bdev_io_complete(bdev_io); 7140 } 7141 7142 void 7143 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7144 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7145 { 7146 enum spdk_bdev_io_status status; 7147 7148 if (sc == SPDK_SCSI_STATUS_GOOD) { 7149 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7150 } else { 7151 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7152 bdev_io->internal.error.scsi.sc = sc; 7153 bdev_io->internal.error.scsi.sk = sk; 7154 bdev_io->internal.error.scsi.asc = asc; 7155 bdev_io->internal.error.scsi.ascq = ascq; 7156 } 7157 7158 spdk_bdev_io_complete(bdev_io, status); 7159 } 7160 7161 void 7162 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7163 int *sc, int *sk, int *asc, int *ascq) 7164 { 7165 assert(sc != NULL); 7166 assert(sk != NULL); 7167 assert(asc != NULL); 7168 assert(ascq != NULL); 7169 7170 switch (bdev_io->internal.status) { 7171 case SPDK_BDEV_IO_STATUS_SUCCESS: 7172 *sc = SPDK_SCSI_STATUS_GOOD; 7173 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7174 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7175 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7176 break; 7177 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7178 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7179 break; 7180 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7181 *sc = bdev_io->internal.error.scsi.sc; 7182 *sk = bdev_io->internal.error.scsi.sk; 7183 *asc = bdev_io->internal.error.scsi.asc; 7184 *ascq = bdev_io->internal.error.scsi.ascq; 7185 break; 7186 default: 7187 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7188 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7189 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7190 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7191 break; 7192 } 7193 } 7194 7195 void 7196 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7197 { 7198 enum spdk_bdev_io_status status; 7199 7200 if (aio_result == 0) { 7201 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7202 } else { 7203 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7204 } 7205 7206 bdev_io->internal.error.aio_result = aio_result; 7207 7208 spdk_bdev_io_complete(bdev_io, status); 7209 } 7210 7211 void 7212 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7213 { 7214 assert(aio_result != NULL); 7215 7216 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7217 *aio_result = bdev_io->internal.error.aio_result; 7218 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7219 *aio_result = 0; 7220 } else { 7221 *aio_result = -EIO; 7222 } 7223 } 7224 7225 void 7226 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7227 { 7228 enum spdk_bdev_io_status status; 7229 7230 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 7231 status = 
SPDK_BDEV_IO_STATUS_SUCCESS; 7232 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7233 status = SPDK_BDEV_IO_STATUS_ABORTED; 7234 } else { 7235 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7236 } 7237 7238 bdev_io->internal.error.nvme.cdw0 = cdw0; 7239 bdev_io->internal.error.nvme.sct = sct; 7240 bdev_io->internal.error.nvme.sc = sc; 7241 7242 spdk_bdev_io_complete(bdev_io, status); 7243 } 7244 7245 void 7246 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7247 { 7248 assert(sct != NULL); 7249 assert(sc != NULL); 7250 assert(cdw0 != NULL); 7251 7252 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7253 *sct = SPDK_NVME_SCT_GENERIC; 7254 *sc = SPDK_NVME_SC_SUCCESS; 7255 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7256 *cdw0 = 0; 7257 } else { 7258 *cdw0 = 1U; 7259 } 7260 return; 7261 } 7262 7263 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7264 *sct = bdev_io->internal.error.nvme.sct; 7265 *sc = bdev_io->internal.error.nvme.sc; 7266 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7267 *sct = SPDK_NVME_SCT_GENERIC; 7268 *sc = SPDK_NVME_SC_SUCCESS; 7269 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7270 *sct = SPDK_NVME_SCT_GENERIC; 7271 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7272 } else { 7273 *sct = SPDK_NVME_SCT_GENERIC; 7274 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7275 } 7276 7277 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7278 } 7279 7280 void 7281 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7282 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7283 { 7284 assert(first_sct != NULL); 7285 assert(first_sc != NULL); 7286 assert(second_sct != NULL); 7287 assert(second_sc != NULL); 7288 assert(cdw0 != NULL); 7289 7290 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7291 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7292 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7293 *first_sct = bdev_io->internal.error.nvme.sct; 7294 *first_sc = bdev_io->internal.error.nvme.sc; 7295 *second_sct = SPDK_NVME_SCT_GENERIC; 7296 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7297 } else { 7298 *first_sct = SPDK_NVME_SCT_GENERIC; 7299 *first_sc = SPDK_NVME_SC_SUCCESS; 7300 *second_sct = bdev_io->internal.error.nvme.sct; 7301 *second_sc = bdev_io->internal.error.nvme.sc; 7302 } 7303 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7304 *first_sct = SPDK_NVME_SCT_GENERIC; 7305 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7306 *second_sct = SPDK_NVME_SCT_GENERIC; 7307 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7308 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7309 *first_sct = SPDK_NVME_SCT_GENERIC; 7310 *first_sc = SPDK_NVME_SC_SUCCESS; 7311 *second_sct = SPDK_NVME_SCT_GENERIC; 7312 *second_sc = SPDK_NVME_SC_SUCCESS; 7313 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7314 *first_sct = SPDK_NVME_SCT_GENERIC; 7315 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7316 *second_sct = SPDK_NVME_SCT_GENERIC; 7317 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7318 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7319 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7320 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7321 *second_sct = SPDK_NVME_SCT_GENERIC; 7322 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7323 } else { 
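		/* Any other status is reported as a generic internal device error for both
		 * commands in the fused pair. */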
7324 *first_sct = SPDK_NVME_SCT_GENERIC; 7325 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7326 *second_sct = SPDK_NVME_SCT_GENERIC; 7327 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7328 } 7329 7330 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7331 } 7332 7333 struct spdk_thread * 7334 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7335 { 7336 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7337 } 7338 7339 struct spdk_io_channel * 7340 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7341 { 7342 return bdev_io->internal.ch->channel; 7343 } 7344 7345 static int 7346 bdev_register(struct spdk_bdev *bdev) 7347 { 7348 char *bdev_name; 7349 char uuid[SPDK_UUID_STRING_LEN]; 7350 struct spdk_iobuf_opts iobuf_opts; 7351 int ret, i; 7352 7353 assert(bdev->module != NULL); 7354 7355 if (!bdev->name) { 7356 SPDK_ERRLOG("Bdev name is NULL\n"); 7357 return -EINVAL; 7358 } 7359 7360 if (!strlen(bdev->name)) { 7361 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7362 return -EINVAL; 7363 } 7364 7365 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7366 if (bdev->fn_table->accel_sequence_supported == NULL) { 7367 continue; 7368 } 7369 if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7370 (enum spdk_bdev_io_type)i)) { 7371 continue; 7372 } 7373 7374 if (spdk_bdev_is_md_separate(bdev)) { 7375 SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with " 7376 "accel sequence support\n"); 7377 return -EINVAL; 7378 } 7379 } 7380 7381 /* Users often register their own I/O devices using the bdev name. In 7382 * order to avoid conflicts, prepend bdev_. */ 7383 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7384 if (!bdev_name) { 7385 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7386 return -ENOMEM; 7387 } 7388 7389 bdev->internal.stat = bdev_alloc_io_stat(true); 7390 if (!bdev->internal.stat) { 7391 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7392 free(bdev_name); 7393 return -ENOMEM; 7394 } 7395 7396 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7397 bdev->internal.measured_queue_depth = UINT64_MAX; 7398 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7399 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7400 bdev->internal.qd_poller = NULL; 7401 bdev->internal.qos = NULL; 7402 7403 TAILQ_INIT(&bdev->internal.open_descs); 7404 TAILQ_INIT(&bdev->internal.locked_ranges); 7405 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7406 TAILQ_INIT(&bdev->aliases); 7407 7408 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7409 if (ret != 0) { 7410 bdev_free_io_stat(bdev->internal.stat); 7411 free(bdev_name); 7412 return ret; 7413 } 7414 7415 /* UUID may be specified by the user or defined by bdev itself. 7416 * Otherwise it will be generated here, so this field will never be empty. 
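 *
 * Illustrative sketch (not from the original source): a module that wants a stable,
 * caller-chosen identity can fill bdev->uuid before calling spdk_bdev_register(),
 * e.g. with spdk_uuid_copy(&bdev->uuid, &persisted_uuid) or
 * spdk_uuid_parse(&bdev->uuid, uuid_str), where persisted_uuid/uuid_str are
 * hypothetical names; only a still-null UUID is generated below.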
*/ 7417 if (spdk_uuid_is_null(&bdev->uuid)) { 7418 spdk_uuid_generate(&bdev->uuid); 7419 } 7420 7421 /* Add the UUID alias only if it's different than the name */ 7422 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7423 if (strcmp(bdev->name, uuid) != 0) { 7424 ret = spdk_bdev_alias_add(bdev, uuid); 7425 if (ret != 0) { 7426 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7427 bdev_name_del(&bdev->internal.bdev_name); 7428 bdev_free_io_stat(bdev->internal.stat); 7429 free(bdev_name); 7430 return ret; 7431 } 7432 } 7433 7434 if (spdk_bdev_get_buf_align(bdev) > 1) { 7435 if (bdev->split_on_optimal_io_boundary) { 7436 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 7437 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 7438 } else { 7439 bdev->split_on_optimal_io_boundary = true; 7440 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 7441 } 7442 } 7443 7444 /* If the user didn't specify a write unit size, set it to one. */ 7445 if (bdev->write_unit_size == 0) { 7446 bdev->write_unit_size = 1; 7447 } 7448 7449 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7450 if (bdev->acwu == 0) { 7451 bdev->acwu = bdev->write_unit_size; 7452 } 7453 7454 if (bdev->phys_blocklen == 0) { 7455 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7456 } 7457 7458 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7459 spdk_iobuf_get_opts(&iobuf_opts); 7460 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7461 } 7462 7463 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7464 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7465 } 7466 7467 bdev->internal.reset_in_progress = NULL; 7468 bdev->internal.qd_poll_in_progress = false; 7469 bdev->internal.period = 0; 7470 bdev->internal.new_period = 0; 7471 7472 spdk_io_device_register(__bdev_to_io_dev(bdev), 7473 bdev_channel_create, bdev_channel_destroy, 7474 sizeof(struct spdk_bdev_channel), 7475 bdev_name); 7476 7477 free(bdev_name); 7478 7479 spdk_spin_init(&bdev->internal.spinlock); 7480 7481 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7482 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7483 7484 return 0; 7485 } 7486 7487 static void 7488 bdev_destroy_cb(void *io_device) 7489 { 7490 int rc; 7491 struct spdk_bdev *bdev; 7492 spdk_bdev_unregister_cb cb_fn; 7493 void *cb_arg; 7494 7495 bdev = __bdev_from_io_dev(io_device); 7496 7497 if (bdev->internal.unregister_td != spdk_get_thread()) { 7498 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7499 return; 7500 } 7501 7502 cb_fn = bdev->internal.unregister_cb; 7503 cb_arg = bdev->internal.unregister_ctx; 7504 7505 spdk_spin_destroy(&bdev->internal.spinlock); 7506 free(bdev->internal.qos); 7507 bdev_free_io_stat(bdev->internal.stat); 7508 7509 rc = bdev->fn_table->destruct(bdev->ctxt); 7510 if (rc < 0) { 7511 SPDK_ERRLOG("destruct failed\n"); 7512 } 7513 if (rc <= 0 && cb_fn != NULL) { 7514 cb_fn(cb_arg, rc); 7515 } 7516 } 7517 7518 void 7519 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7520 { 7521 if (bdev->internal.unregister_cb != NULL) { 7522 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7523 } 7524 } 7525 7526 static void 7527 _remove_notify(void *arg) 7528 { 7529 struct spdk_bdev_desc *desc = arg; 7530 7531 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7532 } 7533 7534 /* returns: 0 - bdev removed and 
ready to be destructed. 7535 * -EBUSY - bdev can't be destructed yet. */ 7536 static int 7537 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7538 { 7539 struct spdk_bdev_desc *desc, *tmp; 7540 int rc = 0; 7541 char uuid[SPDK_UUID_STRING_LEN]; 7542 7543 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7544 assert(spdk_spin_held(&bdev->internal.spinlock)); 7545 7546 /* Notify each descriptor about hotremoval */ 7547 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7548 rc = -EBUSY; 7549 /* 7550 * Defer invocation of the event_cb to a separate message that will 7551 * run later on its thread. This ensures this context unwinds and 7552 * we don't recursively unregister this bdev again if the event_cb 7553 * immediately closes its descriptor. 7554 */ 7555 event_notify(desc, _remove_notify); 7556 } 7557 7558 /* If there are no descriptors, proceed removing the bdev */ 7559 if (rc == 0) { 7560 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7561 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7562 7563 /* Delete the name and the UUID alias */ 7564 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7565 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7566 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7567 7568 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7569 7570 if (bdev->internal.reset_in_progress != NULL) { 7571 /* If reset is in progress, let the completion callback for reset 7572 * unregister the bdev. 7573 */ 7574 rc = -EBUSY; 7575 } 7576 } 7577 7578 return rc; 7579 } 7580 7581 static void 7582 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7583 struct spdk_io_channel *io_ch, void *_ctx) 7584 { 7585 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7586 7587 bdev_channel_abort_queued_ios(bdev_ch); 7588 spdk_bdev_for_each_channel_continue(i, 0); 7589 } 7590 7591 static void 7592 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7593 { 7594 int rc; 7595 7596 spdk_spin_lock(&g_bdev_mgr.spinlock); 7597 spdk_spin_lock(&bdev->internal.spinlock); 7598 /* 7599 * Set the status to REMOVING after completing to abort channels. Otherwise, 7600 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7601 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7602 * may fail. 7603 */ 7604 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7605 rc = bdev_unregister_unsafe(bdev); 7606 spdk_spin_unlock(&bdev->internal.spinlock); 7607 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7608 7609 if (rc == 0) { 7610 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7611 } 7612 } 7613 7614 void 7615 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7616 { 7617 struct spdk_thread *thread; 7618 7619 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7620 7621 thread = spdk_get_thread(); 7622 if (!thread) { 7623 /* The user called this from a non-SPDK thread. 
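 * Illustrative only (hypothetical names): such a caller would typically hop onto an
 * SPDK thread first, e.g. spdk_thread_send_msg(app_thread, unregister_msg, bdev),
 * and call spdk_bdev_unregister() from that message handler instead.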
*/ 7624 if (cb_fn != NULL) { 7625 cb_fn(cb_arg, -ENOTSUP); 7626 } 7627 return; 7628 } 7629 7630 spdk_spin_lock(&g_bdev_mgr.spinlock); 7631 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7632 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7633 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7634 if (cb_fn) { 7635 cb_fn(cb_arg, -EBUSY); 7636 } 7637 return; 7638 } 7639 7640 spdk_spin_lock(&bdev->internal.spinlock); 7641 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7642 bdev->internal.unregister_cb = cb_fn; 7643 bdev->internal.unregister_ctx = cb_arg; 7644 bdev->internal.unregister_td = thread; 7645 spdk_spin_unlock(&bdev->internal.spinlock); 7646 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7647 7648 spdk_bdev_set_qd_sampling_period(bdev, 0); 7649 7650 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7651 bdev_unregister); 7652 } 7653 7654 int 7655 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7656 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7657 { 7658 struct spdk_bdev_desc *desc; 7659 struct spdk_bdev *bdev; 7660 int rc; 7661 7662 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7663 if (rc != 0) { 7664 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7665 return rc; 7666 } 7667 7668 bdev = spdk_bdev_desc_get_bdev(desc); 7669 7670 if (bdev->module != module) { 7671 spdk_bdev_close(desc); 7672 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7673 bdev_name); 7674 return -ENODEV; 7675 } 7676 7677 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7678 7679 spdk_bdev_close(desc); 7680 7681 return 0; 7682 } 7683 7684 static int 7685 bdev_start_qos(struct spdk_bdev *bdev) 7686 { 7687 struct set_qos_limit_ctx *ctx; 7688 7689 /* Enable QoS */ 7690 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7691 ctx = calloc(1, sizeof(*ctx)); 7692 if (ctx == NULL) { 7693 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7694 return -ENOMEM; 7695 } 7696 ctx->bdev = bdev; 7697 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7698 } 7699 7700 return 0; 7701 } 7702 7703 static void 7704 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7705 struct spdk_bdev *bdev) 7706 { 7707 enum spdk_bdev_claim_type type; 7708 const char *typename, *modname; 7709 extern struct spdk_log_flag SPDK_LOG_bdev; 7710 7711 assert(spdk_spin_held(&bdev->internal.spinlock)); 7712 7713 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7714 return; 7715 } 7716 7717 type = bdev->internal.claim_type; 7718 typename = spdk_bdev_claim_get_name(type); 7719 7720 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7721 modname = bdev->internal.claim.v1.module->name; 7722 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7723 bdev->name, detail, typename, modname); 7724 return; 7725 } 7726 7727 if (claim_type_is_v2(type)) { 7728 struct spdk_bdev_module_claim *claim; 7729 7730 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7731 modname = claim->module->name; 7732 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7733 bdev->name, detail, typename, modname); 7734 } 7735 return; 7736 } 7737 7738 assert(false); 7739 } 7740 7741 static int 7742 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7743 { 7744 struct spdk_thread *thread; 7745 int rc = 0; 7746 7747 thread = spdk_get_thread(); 7748 if (!thread) { 7749 
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7750 return -ENOTSUP; 7751 } 7752 7753 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7754 spdk_get_thread()); 7755 7756 desc->bdev = bdev; 7757 desc->thread = thread; 7758 desc->write = write; 7759 7760 spdk_spin_lock(&bdev->internal.spinlock); 7761 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7762 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7763 spdk_spin_unlock(&bdev->internal.spinlock); 7764 return -ENODEV; 7765 } 7766 7767 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7768 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7769 spdk_spin_unlock(&bdev->internal.spinlock); 7770 return -EPERM; 7771 } 7772 7773 rc = bdev_start_qos(bdev); 7774 if (rc != 0) { 7775 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7776 spdk_spin_unlock(&bdev->internal.spinlock); 7777 return rc; 7778 } 7779 7780 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7781 7782 spdk_spin_unlock(&bdev->internal.spinlock); 7783 7784 return 0; 7785 } 7786 7787 static int 7788 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7789 struct spdk_bdev_desc **_desc) 7790 { 7791 struct spdk_bdev_desc *desc; 7792 unsigned int i; 7793 7794 desc = calloc(1, sizeof(*desc)); 7795 if (desc == NULL) { 7796 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7797 return -ENOMEM; 7798 } 7799 7800 TAILQ_INIT(&desc->pending_media_events); 7801 TAILQ_INIT(&desc->free_media_events); 7802 7803 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7804 desc->callback.event_fn = event_cb; 7805 desc->callback.ctx = event_ctx; 7806 spdk_spin_init(&desc->spinlock); 7807 7808 if (bdev->media_events) { 7809 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7810 sizeof(*desc->media_events_buffer)); 7811 if (desc->media_events_buffer == NULL) { 7812 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7813 bdev_desc_free(desc); 7814 return -ENOMEM; 7815 } 7816 7817 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 7818 TAILQ_INSERT_TAIL(&desc->free_media_events, 7819 &desc->media_events_buffer[i], tailq); 7820 } 7821 } 7822 7823 if (bdev->fn_table->accel_sequence_supported != NULL) { 7824 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7825 desc->accel_sequence_supported[i] = 7826 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7827 (enum spdk_bdev_io_type)i); 7828 } 7829 } 7830 7831 *_desc = desc; 7832 7833 return 0; 7834 } 7835 7836 static int 7837 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7838 void *event_ctx, struct spdk_bdev_desc **_desc) 7839 { 7840 struct spdk_bdev_desc *desc; 7841 struct spdk_bdev *bdev; 7842 int rc; 7843 7844 bdev = bdev_get_by_name(bdev_name); 7845 7846 if (bdev == NULL) { 7847 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7848 return -ENODEV; 7849 } 7850 7851 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7852 if (rc != 0) { 7853 return rc; 7854 } 7855 7856 rc = bdev_open(bdev, write, desc); 7857 if (rc != 0) { 7858 bdev_desc_free(desc); 7859 desc = NULL; 7860 } 7861 7862 *_desc = desc; 7863 7864 return rc; 7865 } 7866 7867 int 7868 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7869 void *event_ctx, struct spdk_bdev_desc **_desc) 7870 { 7871 int rc; 7872 7873 if (event_cb == NULL) { 7874 SPDK_ERRLOG("Missing event callback function\n"); 7875 return 
-EINVAL; 7876 } 7877 7878 spdk_spin_lock(&g_bdev_mgr.spinlock); 7879 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 7880 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7881 7882 return rc; 7883 } 7884 7885 struct spdk_bdev_open_async_ctx { 7886 char *bdev_name; 7887 spdk_bdev_event_cb_t event_cb; 7888 void *event_ctx; 7889 bool write; 7890 int rc; 7891 spdk_bdev_open_async_cb_t cb_fn; 7892 void *cb_arg; 7893 struct spdk_bdev_desc *desc; 7894 struct spdk_bdev_open_async_opts opts; 7895 uint64_t start_ticks; 7896 struct spdk_thread *orig_thread; 7897 struct spdk_poller *poller; 7898 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 7899 }; 7900 7901 static void 7902 bdev_open_async_done(void *arg) 7903 { 7904 struct spdk_bdev_open_async_ctx *ctx = arg; 7905 7906 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 7907 7908 free(ctx->bdev_name); 7909 free(ctx); 7910 } 7911 7912 static void 7913 bdev_open_async_cancel(void *arg) 7914 { 7915 struct spdk_bdev_open_async_ctx *ctx = arg; 7916 7917 assert(ctx->rc == -ESHUTDOWN); 7918 7919 spdk_poller_unregister(&ctx->poller); 7920 7921 bdev_open_async_done(ctx); 7922 } 7923 7924 /* This is called when the bdev library finishes at shutdown. */ 7925 static void 7926 bdev_open_async_fini(void) 7927 { 7928 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 7929 7930 spdk_spin_lock(&g_bdev_mgr.spinlock); 7931 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 7932 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 7933 /* 7934 * We have to move to ctx->orig_thread to unregister ctx->poller. 7935 * However, there is a chance that ctx->poller is executed before 7936 * message is executed, which could result in bdev_open_async_done() 7937 * being called twice. To avoid such race condition, set ctx->rc to 7938 * -ESHUTDOWN. 7939 */ 7940 ctx->rc = -ESHUTDOWN; 7941 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 7942 } 7943 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7944 } 7945 7946 static int bdev_open_async(void *arg); 7947 7948 static void 7949 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 7950 { 7951 uint64_t timeout_ticks; 7952 7953 if (ctx->rc == -ESHUTDOWN) { 7954 /* This context is being canceled. Do nothing. */ 7955 return; 7956 } 7957 7958 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 7959 &ctx->desc); 7960 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 7961 goto exit; 7962 } 7963 7964 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 7965 if (spdk_get_ticks() >= timeout_ticks) { 7966 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 7967 ctx->rc = -ETIMEDOUT; 7968 goto exit; 7969 } 7970 7971 return; 7972 7973 exit: 7974 spdk_poller_unregister(&ctx->poller); 7975 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 7976 7977 /* Completion callback is processed after stack unwinding. 
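 * Illustrative only: the open callback registered through spdk_bdev_open_async(),
 * e.g. a hypothetical
 *   static void open_done(struct spdk_bdev_desc *desc, int rc, void *cb_arg);
 * therefore always runs from a message on the original thread, never in the stack
 * frame that submitted or polled the open.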
*/ 7978 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 7979 } 7980 7981 static int 7982 bdev_open_async(void *arg) 7983 { 7984 struct spdk_bdev_open_async_ctx *ctx = arg; 7985 7986 spdk_spin_lock(&g_bdev_mgr.spinlock); 7987 7988 _bdev_open_async(ctx); 7989 7990 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7991 7992 return SPDK_POLLER_BUSY; 7993 } 7994 7995 static void 7996 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 7997 struct spdk_bdev_open_async_opts *opts_src, 7998 size_t size) 7999 { 8000 assert(opts); 8001 assert(opts_src); 8002 8003 opts->size = size; 8004 8005 #define SET_FIELD(field) \ 8006 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8007 opts->field = opts_src->field; \ 8008 } \ 8009 8010 SET_FIELD(timeout_ms); 8011 8012 /* Do not remove this statement, you should always update this statement when you adding a new field, 8013 * and do not forget to add the SET_FIELD statement for your added field. */ 8014 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8015 8016 #undef SET_FIELD 8017 } 8018 8019 static void 8020 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8021 { 8022 assert(opts); 8023 8024 opts->size = size; 8025 8026 #define SET_FIELD(field, value) \ 8027 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8028 opts->field = value; \ 8029 } \ 8030 8031 SET_FIELD(timeout_ms, 0); 8032 8033 #undef SET_FIELD 8034 } 8035 8036 int 8037 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8038 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8039 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8040 { 8041 struct spdk_bdev_open_async_ctx *ctx; 8042 8043 if (event_cb == NULL) { 8044 SPDK_ERRLOG("Missing event callback function\n"); 8045 return -EINVAL; 8046 } 8047 8048 if (open_cb == NULL) { 8049 SPDK_ERRLOG("Missing open callback function\n"); 8050 return -EINVAL; 8051 } 8052 8053 if (opts != NULL && opts->size == 0) { 8054 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8055 return -EINVAL; 8056 } 8057 8058 ctx = calloc(1, sizeof(*ctx)); 8059 if (ctx == NULL) { 8060 SPDK_ERRLOG("Failed to allocate open context\n"); 8061 return -ENOMEM; 8062 } 8063 8064 ctx->bdev_name = strdup(bdev_name); 8065 if (ctx->bdev_name == NULL) { 8066 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8067 free(ctx); 8068 return -ENOMEM; 8069 } 8070 8071 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8072 if (ctx->poller == NULL) { 8073 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8074 free(ctx->bdev_name); 8075 free(ctx); 8076 return -ENOMEM; 8077 } 8078 8079 ctx->cb_fn = open_cb; 8080 ctx->cb_arg = open_cb_arg; 8081 ctx->write = write; 8082 ctx->event_cb = event_cb; 8083 ctx->event_ctx = event_ctx; 8084 ctx->orig_thread = spdk_get_thread(); 8085 ctx->start_ticks = spdk_get_ticks(); 8086 8087 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8088 if (opts != NULL) { 8089 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8090 } 8091 8092 spdk_spin_lock(&g_bdev_mgr.spinlock); 8093 8094 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8095 _bdev_open_async(ctx); 8096 8097 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8098 8099 return 0; 8100 } 8101 8102 static void 8103 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8104 { 8105 int rc; 8106 8107 
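/* Called with g_bdev_mgr.spinlock held (see spdk_bdev_close(), bdev_register_finished()
 * and spdk_for_each_bdev()); bdev->internal.spinlock and desc->spinlock are taken below. */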
spdk_spin_lock(&bdev->internal.spinlock); 8108 spdk_spin_lock(&desc->spinlock); 8109 8110 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8111 8112 desc->closed = true; 8113 8114 if (desc->claim != NULL) { 8115 bdev_desc_release_claims(desc); 8116 } 8117 8118 if (0 == desc->refs) { 8119 spdk_spin_unlock(&desc->spinlock); 8120 bdev_desc_free(desc); 8121 } else { 8122 spdk_spin_unlock(&desc->spinlock); 8123 } 8124 8125 /* If no more descriptors, kill QoS channel */ 8126 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8127 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8128 bdev->name, spdk_get_thread()); 8129 8130 if (bdev_qos_destroy(bdev)) { 8131 /* There isn't anything we can do to recover here. Just let the 8132 * old QoS poller keep running. The QoS handling won't change 8133 * cores when the user allocates a new channel, but it won't break. */ 8134 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8135 } 8136 } 8137 8138 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8139 rc = bdev_unregister_unsafe(bdev); 8140 spdk_spin_unlock(&bdev->internal.spinlock); 8141 8142 if (rc == 0) { 8143 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8144 } 8145 } else { 8146 spdk_spin_unlock(&bdev->internal.spinlock); 8147 } 8148 } 8149 8150 void 8151 spdk_bdev_close(struct spdk_bdev_desc *desc) 8152 { 8153 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8154 8155 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8156 spdk_get_thread()); 8157 8158 assert(desc->thread == spdk_get_thread()); 8159 8160 spdk_poller_unregister(&desc->io_timeout_poller); 8161 8162 spdk_spin_lock(&g_bdev_mgr.spinlock); 8163 8164 bdev_close(bdev, desc); 8165 8166 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8167 } 8168 8169 static void 8170 bdev_register_finished(void *arg) 8171 { 8172 struct spdk_bdev_desc *desc = arg; 8173 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8174 8175 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8176 8177 spdk_spin_lock(&g_bdev_mgr.spinlock); 8178 8179 bdev_close(bdev, desc); 8180 8181 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8182 } 8183 8184 int 8185 spdk_bdev_register(struct spdk_bdev *bdev) 8186 { 8187 struct spdk_bdev_desc *desc; 8188 struct spdk_thread *thread = spdk_get_thread(); 8189 int rc; 8190 8191 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8192 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 8193 thread ? 
spdk_thread_get_name(thread) : "null"); 8194 return -EINVAL; 8195 } 8196 8197 rc = bdev_register(bdev); 8198 if (rc != 0) { 8199 return rc; 8200 } 8201 8202 /* A descriptor is opened to prevent bdev deletion during examination */ 8203 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8204 if (rc != 0) { 8205 spdk_bdev_unregister(bdev, NULL, NULL); 8206 return rc; 8207 } 8208 8209 rc = bdev_open(bdev, false, desc); 8210 if (rc != 0) { 8211 bdev_desc_free(desc); 8212 spdk_bdev_unregister(bdev, NULL, NULL); 8213 return rc; 8214 } 8215 8216 /* Examine configuration before initializing I/O */ 8217 bdev_examine(bdev); 8218 8219 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8220 if (rc != 0) { 8221 bdev_close(bdev, desc); 8222 spdk_bdev_unregister(bdev, NULL, NULL); 8223 } 8224 8225 return rc; 8226 } 8227 8228 int 8229 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8230 struct spdk_bdev_module *module) 8231 { 8232 spdk_spin_lock(&bdev->internal.spinlock); 8233 8234 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8235 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8236 spdk_spin_unlock(&bdev->internal.spinlock); 8237 return -EPERM; 8238 } 8239 8240 if (desc && !desc->write) { 8241 desc->write = true; 8242 } 8243 8244 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8245 bdev->internal.claim.v1.module = module; 8246 8247 spdk_spin_unlock(&bdev->internal.spinlock); 8248 return 0; 8249 } 8250 8251 void 8252 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8253 { 8254 spdk_spin_lock(&bdev->internal.spinlock); 8255 8256 assert(bdev->internal.claim.v1.module != NULL); 8257 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8258 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8259 bdev->internal.claim.v1.module = NULL; 8260 8261 spdk_spin_unlock(&bdev->internal.spinlock); 8262 } 8263 8264 /* 8265 * Start claims v2 8266 */ 8267 8268 const char * 8269 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8270 { 8271 switch (type) { 8272 case SPDK_BDEV_CLAIM_NONE: 8273 return "not_claimed"; 8274 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8275 return "exclusive_write"; 8276 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8277 return "read_many_write_one"; 8278 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8279 return "read_many_write_none"; 8280 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8281 return "read_many_write_many"; 8282 default: 8283 break; 8284 } 8285 return "invalid_claim"; 8286 } 8287 8288 static bool 8289 claim_type_is_v2(enum spdk_bdev_claim_type type) 8290 { 8291 switch (type) { 8292 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8293 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8294 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8295 return true; 8296 default: 8297 break; 8298 } 8299 return false; 8300 } 8301 8302 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
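 * (Per the switch below, only READ_MANY_WRITE_ONE and READ_MANY_WRITE_SHARED promote a
 * read-only descriptor to writable; READ_MANY_WRITE_NONE and all other types do not.)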
*/ 8303 static bool 8304 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8305 { 8306 switch (type) { 8307 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8308 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8309 return true; 8310 default: 8311 break; 8312 } 8313 return false; 8314 } 8315 8316 void 8317 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8318 { 8319 if (opts == NULL) { 8320 SPDK_ERRLOG("opts should not be NULL\n"); 8321 assert(opts != NULL); 8322 return; 8323 } 8324 if (size == 0) { 8325 SPDK_ERRLOG("size should not be zero\n"); 8326 assert(size != 0); 8327 return; 8328 } 8329 8330 memset(opts, 0, size); 8331 opts->opts_size = size; 8332 8333 #define FIELD_OK(field) \ 8334 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8335 8336 #define SET_FIELD(field, value) \ 8337 if (FIELD_OK(field)) { \ 8338 opts->field = value; \ 8339 } \ 8340 8341 SET_FIELD(shared_claim_key, 0); 8342 8343 #undef FIELD_OK 8344 #undef SET_FIELD 8345 } 8346 8347 static int 8348 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8349 { 8350 if (src->opts_size == 0) { 8351 SPDK_ERRLOG("size should not be zero\n"); 8352 return -1; 8353 } 8354 8355 memset(dst, 0, sizeof(*dst)); 8356 dst->opts_size = src->opts_size; 8357 8358 #define FIELD_OK(field) \ 8359 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8360 8361 #define SET_FIELD(field) \ 8362 if (FIELD_OK(field)) { \ 8363 dst->field = src->field; \ 8364 } \ 8365 8366 if (FIELD_OK(name)) { 8367 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8368 } 8369 8370 SET_FIELD(shared_claim_key); 8371 8372 /* You should not remove this statement, but need to update the assert statement 8373 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8374 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8375 8376 #undef FIELD_OK 8377 #undef SET_FIELD 8378 return 0; 8379 } 8380 8381 /* Returns 0 if a read-write-once claim can be taken. */ 8382 static int 8383 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8384 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8385 { 8386 struct spdk_bdev *bdev = desc->bdev; 8387 struct spdk_bdev_desc *open_desc; 8388 8389 assert(spdk_spin_held(&bdev->internal.spinlock)); 8390 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8391 8392 if (opts->shared_claim_key != 0) { 8393 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8394 bdev->name); 8395 return -EINVAL; 8396 } 8397 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8398 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8399 return -EPERM; 8400 } 8401 if (desc->claim != NULL) { 8402 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8403 bdev->name, desc->claim->module->name); 8404 return -EPERM; 8405 } 8406 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8407 if (desc != open_desc && open_desc->write) { 8408 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8409 "another descriptor is open for writing\n", 8410 bdev->name); 8411 return -EPERM; 8412 } 8413 } 8414 8415 return 0; 8416 } 8417 8418 /* Returns 0 if a read-only-many claim can be taken. 
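 *
 * Illustrative usage of the v2 claim API (module and descriptor names are hypothetical):
 *
 *   struct spdk_bdev_claim_opts opts;
 *
 *   spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *   snprintf(opts.name, sizeof(opts.name), "my_consumer");
 *   rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE,
 *                                         &opts, &my_bdev_module);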
*/ 8419 static int 8420 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8421 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8422 { 8423 struct spdk_bdev *bdev = desc->bdev; 8424 struct spdk_bdev_desc *open_desc; 8425 8426 assert(spdk_spin_held(&bdev->internal.spinlock)); 8427 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 8428 assert(desc->claim == NULL); 8429 8430 if (desc->write) { 8431 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 8432 bdev->name); 8433 return -EINVAL; 8434 } 8435 if (opts->shared_claim_key != 0) { 8436 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 8437 return -EINVAL; 8438 } 8439 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8440 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8441 if (open_desc->write) { 8442 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 8443 "another descriptor is open for writing\n", 8444 bdev->name); 8445 return -EPERM; 8446 } 8447 } 8448 } 8449 8450 return 0; 8451 } 8452 8453 /* Returns 0 if a read-write-many claim can be taken. */ 8454 static int 8455 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8456 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8457 { 8458 struct spdk_bdev *bdev = desc->bdev; 8459 struct spdk_bdev_desc *open_desc; 8460 8461 assert(spdk_spin_held(&bdev->internal.spinlock)); 8462 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 8463 assert(desc->claim == NULL); 8464 8465 if (opts->shared_claim_key == 0) { 8466 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 8467 bdev->name); 8468 return -EINVAL; 8469 } 8470 switch (bdev->internal.claim_type) { 8471 case SPDK_BDEV_CLAIM_NONE: 8472 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8473 if (open_desc == desc) { 8474 continue; 8475 } 8476 if (open_desc->write) { 8477 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 8478 "another descriptor is open for writing without a " 8479 "claim\n", bdev->name); 8480 return -EPERM; 8481 } 8482 } 8483 break; 8484 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8485 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 8486 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 8487 return -EPERM; 8488 } 8489 break; 8490 default: 8491 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8492 return -EBUSY; 8493 } 8494 8495 return 0; 8496 } 8497 8498 /* Updates desc and its bdev with a v2 claim.
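 * The new claim is linked into bdev->internal.claim.v2.claims and referenced by
 * desc->claim; bdev_desc_release_claims() undoes both when the descriptor is closed.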
*/ 8499 static int 8500 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8501 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8502 { 8503 struct spdk_bdev *bdev = desc->bdev; 8504 struct spdk_bdev_module_claim *claim; 8505 8506 assert(spdk_spin_held(&bdev->internal.spinlock)); 8507 assert(claim_type_is_v2(type)); 8508 assert(desc->claim == NULL); 8509 8510 claim = calloc(1, sizeof(*desc->claim)); 8511 if (claim == NULL) { 8512 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8513 return -ENOMEM; 8514 } 8515 claim->module = module; 8516 claim->desc = desc; 8517 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8518 memcpy(claim->name, opts->name, sizeof(claim->name)); 8519 desc->claim = claim; 8520 8521 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8522 bdev->internal.claim_type = type; 8523 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8524 bdev->internal.claim.v2.key = opts->shared_claim_key; 8525 } 8526 assert(type == bdev->internal.claim_type); 8527 8528 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8529 8530 if (!desc->write && claim_type_promotes_to_write(type)) { 8531 desc->write = true; 8532 } 8533 8534 return 0; 8535 } 8536 8537 int 8538 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8539 struct spdk_bdev_claim_opts *_opts, 8540 struct spdk_bdev_module *module) 8541 { 8542 struct spdk_bdev *bdev; 8543 struct spdk_bdev_claim_opts opts; 8544 int rc = 0; 8545 8546 if (desc == NULL) { 8547 SPDK_ERRLOG("descriptor must not be NULL\n"); 8548 return -EINVAL; 8549 } 8550 8551 bdev = desc->bdev; 8552 8553 if (_opts == NULL) { 8554 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8555 } else if (claim_opts_copy(_opts, &opts) != 0) { 8556 return -EINVAL; 8557 } 8558 8559 spdk_spin_lock(&bdev->internal.spinlock); 8560 8561 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8562 bdev->internal.claim_type != type) { 8563 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8564 spdk_spin_unlock(&bdev->internal.spinlock); 8565 return -EPERM; 8566 } 8567 8568 if (claim_type_is_v2(type) && desc->claim != NULL) { 8569 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8570 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8571 spdk_spin_unlock(&bdev->internal.spinlock); 8572 return -EPERM; 8573 } 8574 8575 switch (type) { 8576 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8577 spdk_spin_unlock(&bdev->internal.spinlock); 8578 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8579 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8580 rc = claim_verify_rwo(desc, type, &opts, module); 8581 break; 8582 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8583 rc = claim_verify_rom(desc, type, &opts, module); 8584 break; 8585 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8586 rc = claim_verify_rwm(desc, type, &opts, module); 8587 break; 8588 default: 8589 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8590 rc = -ENOTSUP; 8591 } 8592 8593 if (rc == 0) { 8594 rc = claim_bdev(desc, type, &opts, module); 8595 } 8596 8597 spdk_spin_unlock(&bdev->internal.spinlock); 8598 return rc; 8599 } 8600 8601 static void 8602 claim_reset(struct spdk_bdev *bdev) 8603 { 8604 assert(spdk_spin_held(&bdev->internal.spinlock)); 8605 assert(claim_type_is_v2(bdev->internal.claim_type)); 8606 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8607 8608 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8609 
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8610 } 8611 8612 static void 8613 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8614 { 8615 struct spdk_bdev *bdev = desc->bdev; 8616 8617 assert(spdk_spin_held(&bdev->internal.spinlock)); 8618 assert(claim_type_is_v2(bdev->internal.claim_type)); 8619 8620 if (bdev->internal.examine_in_progress == 0) { 8621 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8622 free(desc->claim); 8623 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8624 claim_reset(bdev); 8625 } 8626 } else { 8627 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8628 desc->claim->module = NULL; 8629 desc->claim->desc = NULL; 8630 } 8631 desc->claim = NULL; 8632 } 8633 8634 /* 8635 * End claims v2 8636 */ 8637 8638 struct spdk_bdev * 8639 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8640 { 8641 assert(desc != NULL); 8642 return desc->bdev; 8643 } 8644 8645 int 8646 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8647 { 8648 struct spdk_bdev *bdev, *tmp; 8649 struct spdk_bdev_desc *desc; 8650 int rc = 0; 8651 8652 assert(fn != NULL); 8653 8654 spdk_spin_lock(&g_bdev_mgr.spinlock); 8655 bdev = spdk_bdev_first(); 8656 while (bdev != NULL) { 8657 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8658 if (rc != 0) { 8659 break; 8660 } 8661 rc = bdev_open(bdev, false, desc); 8662 if (rc != 0) { 8663 bdev_desc_free(desc); 8664 if (rc == -ENODEV) { 8665 /* Ignore the error and move to the next bdev. */ 8666 rc = 0; 8667 bdev = spdk_bdev_next(bdev); 8668 continue; 8669 } 8670 break; 8671 } 8672 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8673 8674 rc = fn(ctx, bdev); 8675 8676 spdk_spin_lock(&g_bdev_mgr.spinlock); 8677 tmp = spdk_bdev_next(bdev); 8678 bdev_close(bdev, desc); 8679 if (rc != 0) { 8680 break; 8681 } 8682 bdev = tmp; 8683 } 8684 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8685 8686 return rc; 8687 } 8688 8689 int 8690 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8691 { 8692 struct spdk_bdev *bdev, *tmp; 8693 struct spdk_bdev_desc *desc; 8694 int rc = 0; 8695 8696 assert(fn != NULL); 8697 8698 spdk_spin_lock(&g_bdev_mgr.spinlock); 8699 bdev = spdk_bdev_first_leaf(); 8700 while (bdev != NULL) { 8701 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8702 if (rc != 0) { 8703 break; 8704 } 8705 rc = bdev_open(bdev, false, desc); 8706 if (rc != 0) { 8707 bdev_desc_free(desc); 8708 if (rc == -ENODEV) { 8709 /* Ignore the error and move to the next bdev. 
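 * (-ENODEV here means the bdev began unregistering between spdk_bdev_first_leaf()/
 * spdk_bdev_next_leaf() and bdev_open(); it is skipped rather than failing the walk.)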
*/ 8710 rc = 0; 8711 bdev = spdk_bdev_next_leaf(bdev); 8712 continue; 8713 } 8714 break; 8715 } 8716 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8717 8718 rc = fn(ctx, bdev); 8719 8720 spdk_spin_lock(&g_bdev_mgr.spinlock); 8721 tmp = spdk_bdev_next_leaf(bdev); 8722 bdev_close(bdev, desc); 8723 if (rc != 0) { 8724 break; 8725 } 8726 bdev = tmp; 8727 } 8728 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8729 8730 return rc; 8731 } 8732 8733 void 8734 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8735 { 8736 struct iovec *iovs; 8737 int iovcnt; 8738 8739 if (bdev_io == NULL) { 8740 return; 8741 } 8742 8743 switch (bdev_io->type) { 8744 case SPDK_BDEV_IO_TYPE_READ: 8745 case SPDK_BDEV_IO_TYPE_WRITE: 8746 case SPDK_BDEV_IO_TYPE_ZCOPY: 8747 iovs = bdev_io->u.bdev.iovs; 8748 iovcnt = bdev_io->u.bdev.iovcnt; 8749 break; 8750 default: 8751 iovs = NULL; 8752 iovcnt = 0; 8753 break; 8754 } 8755 8756 if (iovp) { 8757 *iovp = iovs; 8758 } 8759 if (iovcntp) { 8760 *iovcntp = iovcnt; 8761 } 8762 } 8763 8764 void * 8765 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8766 { 8767 if (bdev_io == NULL) { 8768 return NULL; 8769 } 8770 8771 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8772 return NULL; 8773 } 8774 8775 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8776 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8777 return bdev_io->u.bdev.md_buf; 8778 } 8779 8780 return NULL; 8781 } 8782 8783 void * 8784 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 8785 { 8786 if (bdev_io == NULL) { 8787 assert(false); 8788 return NULL; 8789 } 8790 8791 return bdev_io->internal.caller_ctx; 8792 } 8793 8794 void 8795 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 8796 { 8797 8798 if (spdk_bdev_module_list_find(bdev_module->name)) { 8799 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 8800 assert(false); 8801 } 8802 8803 spdk_spin_init(&bdev_module->internal.spinlock); 8804 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 8805 8806 /* 8807 * Modules with examine callbacks must be initialized first, so they are 8808 * ready to handle examine callbacks from later modules that will 8809 * register physical bdevs. 
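 *
 * Bdev modules normally reach this function through the SPDK_BDEV_MODULE_REGISTER()
 * constructor macro in spdk/bdev_module.h rather than calling it directly; a rough,
 * illustrative sketch with hypothetical names:
 *
 *   static struct spdk_bdev_module my_if = {
 *           .name = "my_module",
 *           .module_init = my_module_init,
 *           .examine_disk = my_examine_disk,
 *   };
 *   SPDK_BDEV_MODULE_REGISTER(my_module, &my_if)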
8810 */ 8811 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8812 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8813 } else { 8814 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8815 } 8816 } 8817 8818 struct spdk_bdev_module * 8819 spdk_bdev_module_list_find(const char *name) 8820 { 8821 struct spdk_bdev_module *bdev_module; 8822 8823 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8824 if (strcmp(name, bdev_module->name) == 0) { 8825 break; 8826 } 8827 } 8828 8829 return bdev_module; 8830 } 8831 8832 static int 8833 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 8834 { 8835 uint64_t num_blocks; 8836 void *md_buf = NULL; 8837 8838 num_blocks = bdev_io->u.bdev.num_blocks; 8839 8840 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8841 md_buf = (char *)g_bdev_mgr.zero_buffer + 8842 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8843 } 8844 8845 return bdev_write_blocks_with_md(bdev_io->internal.desc, 8846 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8847 g_bdev_mgr.zero_buffer, md_buf, 8848 bdev_io->u.bdev.offset_blocks, num_blocks, 8849 bdev_write_zero_buffer_done, bdev_io); 8850 } 8851 8852 static void 8853 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8854 { 8855 struct spdk_bdev_io *parent_io = cb_arg; 8856 8857 spdk_bdev_free_io(bdev_io); 8858 8859 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 8860 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 8861 } 8862 8863 static void 8864 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8865 { 8866 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8867 ctx->bdev->internal.qos_mod_in_progress = false; 8868 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8869 8870 if (ctx->cb_fn) { 8871 ctx->cb_fn(ctx->cb_arg, status); 8872 } 8873 free(ctx); 8874 } 8875 8876 static void 8877 bdev_disable_qos_done(void *cb_arg) 8878 { 8879 struct set_qos_limit_ctx *ctx = cb_arg; 8880 struct spdk_bdev *bdev = ctx->bdev; 8881 struct spdk_bdev_io *bdev_io; 8882 struct spdk_bdev_qos *qos; 8883 8884 spdk_spin_lock(&bdev->internal.spinlock); 8885 qos = bdev->internal.qos; 8886 bdev->internal.qos = NULL; 8887 spdk_spin_unlock(&bdev->internal.spinlock); 8888 8889 while (!TAILQ_EMPTY(&qos->queued)) { 8890 /* Send queued I/O back to their original thread for resubmission. */ 8891 bdev_io = TAILQ_FIRST(&qos->queued); 8892 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8893 8894 if (bdev_io->internal.io_submit_ch) { 8895 /* 8896 * Channel was changed when sending it to the QoS thread - change it back 8897 * before sending it back to the original thread. 
8898 */ 8899 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8900 bdev_io->internal.io_submit_ch = NULL; 8901 } 8902 8903 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8904 _bdev_io_submit, bdev_io); 8905 } 8906 8907 if (qos->thread != NULL) { 8908 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8909 spdk_poller_unregister(&qos->poller); 8910 } 8911 8912 free(qos); 8913 8914 bdev_set_qos_limit_done(ctx, 0); 8915 } 8916 8917 static void 8918 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8919 { 8920 struct set_qos_limit_ctx *ctx = _ctx; 8921 struct spdk_thread *thread; 8922 8923 spdk_spin_lock(&bdev->internal.spinlock); 8924 thread = bdev->internal.qos->thread; 8925 spdk_spin_unlock(&bdev->internal.spinlock); 8926 8927 if (thread != NULL) { 8928 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8929 } else { 8930 bdev_disable_qos_done(ctx); 8931 } 8932 } 8933 8934 static void 8935 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8936 struct spdk_io_channel *ch, void *_ctx) 8937 { 8938 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8939 8940 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8941 8942 spdk_bdev_for_each_channel_continue(i, 0); 8943 } 8944 8945 static void 8946 bdev_update_qos_rate_limit_msg(void *cb_arg) 8947 { 8948 struct set_qos_limit_ctx *ctx = cb_arg; 8949 struct spdk_bdev *bdev = ctx->bdev; 8950 8951 spdk_spin_lock(&bdev->internal.spinlock); 8952 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 8953 spdk_spin_unlock(&bdev->internal.spinlock); 8954 8955 bdev_set_qos_limit_done(ctx, 0); 8956 } 8957 8958 static void 8959 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8960 struct spdk_io_channel *ch, void *_ctx) 8961 { 8962 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8963 8964 spdk_spin_lock(&bdev->internal.spinlock); 8965 bdev_enable_qos(bdev, bdev_ch); 8966 spdk_spin_unlock(&bdev->internal.spinlock); 8967 spdk_bdev_for_each_channel_continue(i, 0); 8968 } 8969 8970 static void 8971 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 8972 { 8973 struct set_qos_limit_ctx *ctx = _ctx; 8974 8975 bdev_set_qos_limit_done(ctx, status); 8976 } 8977 8978 static void 8979 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 8980 { 8981 int i; 8982 8983 assert(bdev->internal.qos != NULL); 8984 8985 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8986 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8987 bdev->internal.qos->rate_limits[i].limit = limits[i]; 8988 8989 if (limits[i] == 0) { 8990 bdev->internal.qos->rate_limits[i].limit = 8991 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 8992 } 8993 } 8994 } 8995 } 8996 8997 void 8998 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 8999 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9000 { 9001 struct set_qos_limit_ctx *ctx; 9002 uint32_t limit_set_complement; 9003 uint64_t min_limit_per_sec; 9004 int i; 9005 bool disable_rate_limit = true; 9006 9007 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9008 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9009 continue; 9010 } 9011 9012 if (limits[i] > 0) { 9013 disable_rate_limit = false; 9014 } 9015 9016 if (bdev_qos_is_iops_rate_limit(i) == true) { 9017 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9018 } else { 9019 /* Change from megabyte to byte rate limit */ 9020 limits[i] = limits[i] * 1024 * 1024; 9021 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 
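/* Illustrative arithmetic (not in the original source): a requested value of 10 in one
 * of the MB/s slots becomes 10 * 1024 * 1024 = 10485760 bytes/sec here, and the check
 * below rounds any limit that is not a multiple of min_limit_per_sec up to the next
 * multiple (e.g. a 1500 IO/s request against a 1000 IO/s minimum rounds up to 2000). */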
9022 } 9023 9024 limit_set_complement = limits[i] % min_limit_per_sec; 9025 if (limit_set_complement) { 9026 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9027 limits[i], min_limit_per_sec); 9028 limits[i] += min_limit_per_sec - limit_set_complement; 9029 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9030 } 9031 } 9032 9033 ctx = calloc(1, sizeof(*ctx)); 9034 if (ctx == NULL) { 9035 cb_fn(cb_arg, -ENOMEM); 9036 return; 9037 } 9038 9039 ctx->cb_fn = cb_fn; 9040 ctx->cb_arg = cb_arg; 9041 ctx->bdev = bdev; 9042 9043 spdk_spin_lock(&bdev->internal.spinlock); 9044 if (bdev->internal.qos_mod_in_progress) { 9045 spdk_spin_unlock(&bdev->internal.spinlock); 9046 free(ctx); 9047 cb_fn(cb_arg, -EAGAIN); 9048 return; 9049 } 9050 bdev->internal.qos_mod_in_progress = true; 9051 9052 if (disable_rate_limit == true && bdev->internal.qos) { 9053 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9054 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9055 (bdev->internal.qos->rate_limits[i].limit > 0 && 9056 bdev->internal.qos->rate_limits[i].limit != 9057 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9058 disable_rate_limit = false; 9059 break; 9060 } 9061 } 9062 } 9063 9064 if (disable_rate_limit == false) { 9065 if (bdev->internal.qos == NULL) { 9066 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9067 if (!bdev->internal.qos) { 9068 spdk_spin_unlock(&bdev->internal.spinlock); 9069 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9070 bdev_set_qos_limit_done(ctx, -ENOMEM); 9071 return; 9072 } 9073 } 9074 9075 if (bdev->internal.qos->thread == NULL) { 9076 /* Enabling */ 9077 bdev_set_qos_rate_limits(bdev, limits); 9078 9079 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9080 bdev_enable_qos_done); 9081 } else { 9082 /* Updating */ 9083 bdev_set_qos_rate_limits(bdev, limits); 9084 9085 spdk_thread_send_msg(bdev->internal.qos->thread, 9086 bdev_update_qos_rate_limit_msg, ctx); 9087 } 9088 } else { 9089 if (bdev->internal.qos != NULL) { 9090 bdev_set_qos_rate_limits(bdev, limits); 9091 9092 /* Disabling */ 9093 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9094 bdev_disable_qos_msg_done); 9095 } else { 9096 spdk_spin_unlock(&bdev->internal.spinlock); 9097 bdev_set_qos_limit_done(ctx, 0); 9098 return; 9099 } 9100 } 9101 9102 spdk_spin_unlock(&bdev->internal.spinlock); 9103 } 9104 9105 struct spdk_bdev_histogram_ctx { 9106 spdk_bdev_histogram_status_cb cb_fn; 9107 void *cb_arg; 9108 struct spdk_bdev *bdev; 9109 int status; 9110 }; 9111 9112 static void 9113 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9114 { 9115 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9116 9117 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9118 ctx->bdev->internal.histogram_in_progress = false; 9119 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9120 ctx->cb_fn(ctx->cb_arg, ctx->status); 9121 free(ctx); 9122 } 9123 9124 static void 9125 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9126 struct spdk_io_channel *_ch, void *_ctx) 9127 { 9128 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9129 9130 if (ch->histogram != NULL) { 9131 spdk_histogram_data_free(ch->histogram); 9132 ch->histogram = NULL; 9133 } 9134 spdk_bdev_for_each_channel_continue(i, 0); 9135 } 9136 9137 static void 9138 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9139 { 9140 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9141 9142 if (status 
!= 0) { 9143 ctx->status = status; 9144 ctx->bdev->internal.histogram_enabled = false; 9145 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9146 bdev_histogram_disable_channel_cb); 9147 } else { 9148 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9149 ctx->bdev->internal.histogram_in_progress = false; 9150 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9151 ctx->cb_fn(ctx->cb_arg, ctx->status); 9152 free(ctx); 9153 } 9154 } 9155 9156 static void 9157 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9158 struct spdk_io_channel *_ch, void *_ctx) 9159 { 9160 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9161 int status = 0; 9162 9163 if (ch->histogram == NULL) { 9164 ch->histogram = spdk_histogram_data_alloc(); 9165 if (ch->histogram == NULL) { 9166 status = -ENOMEM; 9167 } 9168 } 9169 9170 spdk_bdev_for_each_channel_continue(i, status); 9171 } 9172 9173 void 9174 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9175 void *cb_arg, bool enable) 9176 { 9177 struct spdk_bdev_histogram_ctx *ctx; 9178 9179 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9180 if (ctx == NULL) { 9181 cb_fn(cb_arg, -ENOMEM); 9182 return; 9183 } 9184 9185 ctx->bdev = bdev; 9186 ctx->status = 0; 9187 ctx->cb_fn = cb_fn; 9188 ctx->cb_arg = cb_arg; 9189 9190 spdk_spin_lock(&bdev->internal.spinlock); 9191 if (bdev->internal.histogram_in_progress) { 9192 spdk_spin_unlock(&bdev->internal.spinlock); 9193 free(ctx); 9194 cb_fn(cb_arg, -EAGAIN); 9195 return; 9196 } 9197 9198 bdev->internal.histogram_in_progress = true; 9199 spdk_spin_unlock(&bdev->internal.spinlock); 9200 9201 bdev->internal.histogram_enabled = enable; 9202 9203 if (enable) { 9204 /* Allocate histogram for each channel */ 9205 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9206 bdev_histogram_enable_channel_cb); 9207 } else { 9208 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9209 bdev_histogram_disable_channel_cb); 9210 } 9211 } 9212 9213 struct spdk_bdev_histogram_data_ctx { 9214 spdk_bdev_histogram_data_cb cb_fn; 9215 void *cb_arg; 9216 struct spdk_bdev *bdev; 9217 /** merged histogram data from all channels */ 9218 struct spdk_histogram_data *histogram; 9219 }; 9220 9221 static void 9222 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9223 { 9224 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9225 9226 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9227 free(ctx); 9228 } 9229 9230 static void 9231 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9232 struct spdk_io_channel *_ch, void *_ctx) 9233 { 9234 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9235 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9236 int status = 0; 9237 9238 if (ch->histogram == NULL) { 9239 status = -EFAULT; 9240 } else { 9241 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9242 } 9243 9244 spdk_bdev_for_each_channel_continue(i, status); 9245 } 9246 9247 void 9248 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9249 spdk_bdev_histogram_data_cb cb_fn, 9250 void *cb_arg) 9251 { 9252 struct spdk_bdev_histogram_data_ctx *ctx; 9253 9254 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9255 if (ctx == NULL) { 9256 cb_fn(cb_arg, -ENOMEM, NULL); 9257 return; 9258 } 9259 9260 ctx->bdev = bdev; 9261 ctx->cb_fn = cb_fn; 9262 ctx->cb_arg = cb_arg; 9263 9264 ctx->histogram = histogram; 
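/* Walk every channel and merge its per-channel histogram into the caller-supplied
 * 'histogram'; a channel that never allocated one (histograms not enabled) causes
 * -EFAULT to be reported through cb_fn (see bdev_histogram_get_channel()). */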
9265 9266 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9267 bdev_histogram_get_channel_cb); 9268 } 9269 9270 void 9271 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9272 void *cb_arg) 9273 { 9274 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9275 int status = 0; 9276 9277 assert(cb_fn != NULL); 9278 9279 if (bdev_ch->histogram == NULL) { 9280 status = -EFAULT; 9281 } 9282 cb_fn(cb_arg, status, bdev_ch->histogram); 9283 } 9284 9285 size_t 9286 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9287 size_t max_events) 9288 { 9289 struct media_event_entry *entry; 9290 size_t num_events = 0; 9291 9292 for (; num_events < max_events; ++num_events) { 9293 entry = TAILQ_FIRST(&desc->pending_media_events); 9294 if (entry == NULL) { 9295 break; 9296 } 9297 9298 events[num_events] = entry->event; 9299 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9300 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9301 } 9302 9303 return num_events; 9304 } 9305 9306 int 9307 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9308 size_t num_events) 9309 { 9310 struct spdk_bdev_desc *desc; 9311 struct media_event_entry *entry; 9312 size_t event_id; 9313 int rc = 0; 9314 9315 assert(bdev->media_events); 9316 9317 spdk_spin_lock(&bdev->internal.spinlock); 9318 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9319 if (desc->write) { 9320 break; 9321 } 9322 } 9323 9324 if (desc == NULL || desc->media_events_buffer == NULL) { 9325 rc = -ENODEV; 9326 goto out; 9327 } 9328 9329 for (event_id = 0; event_id < num_events; ++event_id) { 9330 entry = TAILQ_FIRST(&desc->free_media_events); 9331 if (entry == NULL) { 9332 break; 9333 } 9334 9335 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9336 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9337 entry->event = events[event_id]; 9338 } 9339 9340 rc = event_id; 9341 out: 9342 spdk_spin_unlock(&bdev->internal.spinlock); 9343 return rc; 9344 } 9345 9346 static void 9347 _media_management_notify(void *arg) 9348 { 9349 struct spdk_bdev_desc *desc = arg; 9350 9351 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9352 } 9353 9354 void 9355 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9356 { 9357 struct spdk_bdev_desc *desc; 9358 9359 spdk_spin_lock(&bdev->internal.spinlock); 9360 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9361 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9362 event_notify(desc, _media_management_notify); 9363 } 9364 } 9365 spdk_spin_unlock(&bdev->internal.spinlock); 9366 } 9367 9368 struct locked_lba_range_ctx { 9369 struct lba_range range; 9370 struct lba_range *current_range; 9371 struct lba_range *owner_range; 9372 struct spdk_poller *poller; 9373 lock_range_cb cb_fn; 9374 void *cb_arg; 9375 }; 9376 9377 static void 9378 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9379 { 9380 struct locked_lba_range_ctx *ctx = _ctx; 9381 9382 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9383 free(ctx); 9384 } 9385 9386 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9387 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9388 9389 static void 9390 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9391 { 9392 struct locked_lba_range_ctx *ctx = _ctx; 9393 9394 if (status == -ENOMEM) { 9395 /* One of the channels could not allocate a 

struct locked_lba_range_ctx {
	struct lba_range range;
	struct lba_range *current_range;
	struct lba_range *owner_range;
	struct spdk_poller *poller;
	lock_range_cb cb_fn;
	void *cb_arg;
};

static void
bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM);
	free(ctx);
}

static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i,
		struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx);

static void
bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	if (status == -ENOMEM) {
		/* One of the channels could not allocate a range object.
		 * So we have to go back and clean up any ranges that were
		 * allocated successfully before we return the error status to
		 * the caller.  We can reuse the unlock function to do that cleanup.
		 */
		spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
					   bdev_lock_error_cleanup_cb);
		return;
	}

	/* All channels have locked this range and no I/O overlapping the range
	 * is outstanding!  Set the owner_ch for the range object for the
	 * locking channel, so that this channel will know that it is allowed
	 * to write to this range.
	 */
	if (ctx->owner_range != NULL) {
		ctx->owner_range->owner_ch = ctx->range.owner_ch;
	}

	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);

	/* Don't free the ctx here.  Its range is still in the bdev's global list of
	 * locked ranges, and it will be removed and freed when this range
	 * is later unlocked.
	 */
}

static int
bdev_lock_lba_range_check_io(void *_i)
{
	struct spdk_bdev_channel_iter *i = _i;
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = i->ctx;
	struct lba_range *range = ctx->current_range;
	struct spdk_bdev_io *bdev_io;

	spdk_poller_unregister(&ctx->poller);

	/* The range is now in the locked_ranges, so no new I/O can be submitted to this
	 * range.  But we need to wait until all outstanding I/O overlapping with this
	 * range have completed.
	 */
	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
		if (bdev_io_range_is_locked(bdev_io, range)) {
			ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
			return SPDK_POLLER_BUSY;
		}
	}

	spdk_bdev_for_each_channel_continue(i, 0);
	return SPDK_POLLER_BUSY;
}

static void
bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *_ch, void *_ctx)
{
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = _ctx;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->length == ctx->range.length &&
		    range->offset == ctx->range.offset &&
		    range->locked_ctx == ctx->range.locked_ctx) {
			/* This range already exists on this channel, so don't add
			 * it again.  This can happen when a new channel is created
			 * while the for_each_channel operation is in progress.
			 * Do not check for outstanding I/O in that case, since the
			 * range was locked before any I/O could be submitted to the
			 * new channel.
			 */
			spdk_bdev_for_each_channel_continue(i, 0);
			return;
		}
	}

	range = calloc(1, sizeof(*range));
	if (range == NULL) {
		spdk_bdev_for_each_channel_continue(i, -ENOMEM);
		return;
	}

	range->length = ctx->range.length;
	range->offset = ctx->range.offset;
	range->locked_ctx = ctx->range.locked_ctx;
	ctx->current_range = range;
	if (ctx->range.owner_ch == ch) {
		/* This is the range object for the channel that will hold
		 * the lock.  Store it in the ctx object so that we can easily
		 * set its owner_ch after the lock is finally acquired.
		 */
		ctx->owner_range = range;
	}
	TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
	bdev_lock_lba_range_check_io(i);
}

static void
bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
{
	assert(spdk_get_thread() == ctx->range.owner_thread);
	assert(ctx->range.owner_ch == NULL ||
	       spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread);

	/* We will add a copy of this range to each channel now. */
	spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx,
				   bdev_lock_lba_range_cb);
}

static bool
bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
{
	struct lba_range *r;

	TAILQ_FOREACH(r, tailq, tailq) {
		if (bdev_lba_range_overlapped(range, r)) {
			return true;
		}
	}
	return false;
}

static int
_bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch,
		     uint64_t offset, uint64_t length,
		     lock_range_cb cb_fn, void *cb_arg)
{
	struct locked_lba_range_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}

	ctx->range.offset = offset;
	ctx->range.length = length;
	ctx->range.owner_thread = spdk_get_thread();
	ctx->range.owner_ch = ch;
	ctx->range.locked_ctx = cb_arg;
	ctx->range.bdev = bdev;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_spin_lock(&bdev->internal.spinlock);
	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
		/* There is an active lock overlapping with this range.
		 * Put it on the pending list until this range no
		 * longer overlaps with another.
		 */
		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
	} else {
		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
		bdev_lock_lba_range_ctx(bdev, ctx);
	}
	spdk_spin_unlock(&bdev->internal.spinlock);
	return 0;
}

static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		    uint64_t offset, uint64_t length,
		    lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);

	if (cb_arg == NULL) {
		SPDK_ERRLOG("cb_arg must not be NULL\n");
		return -EINVAL;
	}

	return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg);
}
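
/*
 * Internal usage sketch: bdev_lock_lba_range()/bdev_unlock_lba_range() drain and
 * block I/O over an LBA span; the quiesce API below builds on _bdev_lock_lba_range(),
 * and other paths elsewhere in this file (e.g. compare-and-write emulation) use the
 * descriptor-based variant.  The lock_range_cb fires once every channel has installed
 * the range and all overlapping I/O has completed.  The unlock must come from the same
 * channel with the same cb_arg used to lock, since the per-channel range is keyed off
 * (offset, length, locked_ctx).  Names below are hypothetical.
 *
 *	static void
 *	example_range_locked(struct lba_range *range, void *ctx, int status)
 *	{
 *		if (status != 0) {
 *			... fail the operation ...
 *			return;
 *		}
 *
 *		... do work that must not race with I/O to
 *		    [range->offset, range->offset + range->length) ...
 *
 *		bdev_unlock_lba_range(desc, ch, range->offset, range->length,
 *				      example_range_unlocked, ctx);
 *	}
 *
 *	rc = bdev_lock_lba_range(desc, ch, offset, num_blocks, example_range_locked, ctx);
 */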

static void
bdev_lock_lba_range_ctx_msg(void *_ctx)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	bdev_lock_lba_range_ctx(ctx->range.bdev, ctx);
}

static void
bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;
	struct locked_lba_range_ctx *pending_ctx;
	struct lba_range *range, *tmp;

	spdk_spin_lock(&bdev->internal.spinlock);
	/* Check if there are any pending locked ranges that overlap with the range
	 * that was just unlocked.  If there are, check that each one does not overlap
	 * with any other locked range before calling bdev_lock_lba_range_ctx, which
	 * will start the lock process.
	 */
	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
		if (bdev_lba_range_overlapped(range, &ctx->range) &&
		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
			spdk_thread_send_msg(pending_ctx->range.owner_thread,
					     bdev_lock_lba_range_ctx_msg, pending_ctx);
		}
	}
	spdk_spin_unlock(&bdev->internal.spinlock);

	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
	free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				  struct spdk_io_channel *_ch, void *_ctx)
{
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = _ctx;
	TAILQ_HEAD(, spdk_bdev_io) io_locked;
	struct spdk_bdev_io *bdev_io;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (ctx->range.offset == range->offset &&
		    ctx->range.length == range->length &&
		    ctx->range.locked_ctx == range->locked_ctx) {
			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
			free(range);
			break;
		}
	}

	/* Note: we should almost always be able to assert that the range specified
	 * was found.  But there are some very rare corner cases where a new channel
	 * gets created simultaneously with a range unlock, where this function
	 * would execute on that new channel and wouldn't have the range.
	 * We also use this to clean up range allocations when a later allocation
	 * fails in the locking path.
	 * So we can't actually assert() here.
	 */

	/* Swap the locked I/O into a temporary list, and then try to submit it again.
	 * We could hyper-optimize this to only resubmit locked I/O that overlaps
	 * with the range that was just unlocked, but this isn't a performance path so
	 * we go for simplicity here.
	 */
	TAILQ_INIT(&io_locked);
	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
	while (!TAILQ_EMPTY(&io_locked)) {
		bdev_io = TAILQ_FIRST(&io_locked);
		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
		bdev_io_submit(bdev_io);
	}

	spdk_bdev_for_each_channel_continue(i, 0);
}

static int
_bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length,
		       lock_range_cb cb_fn, void *cb_arg)
{
	struct locked_lba_range_ctx *ctx;
	struct lba_range *range;

	spdk_spin_lock(&bdev->internal.spinlock);
	/* To start the unlock process, we find the range in the bdev's locked_ranges
	 * and remove it.  This ensures new channels don't inherit the locked range.
	 * Then we will send a message to each channel to remove the range from its
	 * per-channel list.
	 */
	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    (range->owner_ch == NULL || range->locked_ctx == cb_arg)) {
			break;
		}
	}
	if (range == NULL) {
		assert(false);
		spdk_spin_unlock(&bdev->internal.spinlock);
		return -EINVAL;
	}
	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
	spdk_spin_unlock(&bdev->internal.spinlock);

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
				   bdev_unlock_lba_range_cb);
	return 0;
}

static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct lba_range *range;
	bool range_found = false;

	/* Let's make sure the specified channel actually has a lock on
	 * the specified range.  Note that the range must match exactly.
	 */
	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
			range_found = true;
			break;
		}
	}

	if (!range_found) {
		return -EINVAL;
	}

	return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg);
}

struct bdev_quiesce_ctx {
	spdk_bdev_quiesce_cb cb_fn;
	void *cb_arg;
};

static void
bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status)
{
	struct bdev_quiesce_ctx *quiesce_ctx = ctx;

	if (quiesce_ctx->cb_fn != NULL) {
		quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
	}

	free(quiesce_ctx);
}

static void
bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status)
{
	struct bdev_quiesce_ctx *quiesce_ctx = ctx;
	struct spdk_bdev_module *module = range->bdev->module;

	if (status != 0) {
		if (quiesce_ctx->cb_fn != NULL) {
			quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
		}
		free(quiesce_ctx);
		return;
	}

	spdk_spin_lock(&module->internal.spinlock);
	TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module);
	spdk_spin_unlock(&module->internal.spinlock);

	if (quiesce_ctx->cb_fn != NULL) {
		quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
		quiesce_ctx->cb_fn = NULL;
		quiesce_ctx->cb_arg = NULL;
	}
	/* quiesce_ctx will be freed on unquiesce */
}

static int
_spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
		   uint64_t offset, uint64_t length,
		   spdk_bdev_quiesce_cb cb_fn, void *cb_arg,
		   bool unquiesce)
{
	struct bdev_quiesce_ctx *quiesce_ctx;
	int rc;

	if (module != bdev->module) {
		SPDK_ERRLOG("Bdev does not belong to specified module.\n");
		return -EINVAL;
	}

	if (!bdev_io_valid_blocks(bdev, offset, length)) {
		return -EINVAL;
	}

	if (unquiesce) {
		struct lba_range *range;

		/* Make sure the specified range is actually quiesced in the specified module and
		 * then remove it from the list.  Note that the range must match exactly.
		 */
		spdk_spin_lock(&module->internal.spinlock);
		TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) {
			if (range->bdev == bdev && range->offset == offset && range->length == length) {
				TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module);
				break;
			}
		}
		spdk_spin_unlock(&module->internal.spinlock);

		if (range == NULL) {
			SPDK_ERRLOG("The range to unquiesce was not found.\n");
			return -EINVAL;
		}

		quiesce_ctx = range->locked_ctx;
		quiesce_ctx->cb_fn = cb_fn;
		quiesce_ctx->cb_arg = cb_arg;

		rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx);
	} else {
		quiesce_ctx = malloc(sizeof(*quiesce_ctx));
		if (quiesce_ctx == NULL) {
			return -ENOMEM;
		}

		quiesce_ctx->cb_fn = cb_fn;
		quiesce_ctx->cb_arg = cb_arg;

		rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx);
		if (rc != 0) {
			free(quiesce_ctx);
		}
	}

	return rc;
}

int
spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
		  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
{
	return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false);
}

int
spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
		    spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
{
	return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true);
}

int
spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
			uint64_t offset, uint64_t length,
			spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
{
	return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false);
}

int
spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
			  uint64_t offset, uint64_t length,
			  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
{
	return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true);
}
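
/*
 * Usage sketch (illustrative): the module that registered a bdev may quiesce it
 * around an operation that must not race with user I/O, e.g. an on-disk metadata
 * update.  The callback fires once all outstanding I/O overlapping the quiesced
 * range has drained; new I/O stays queued until the matching unquiesce call.
 * example_bdev, example_if, and the callbacks are hypothetical.
 *
 *	static void
 *	example_quiesced_cb(void *cb_arg, int status)
 *	{
 *		struct example_bdev *ex_bdev = cb_arg;
 *
 *		if (status != 0) {
 *			... handle failure ...
 *			return;
 *		}
 *
 *		... update metadata while I/O is paused ...
 *
 *		spdk_bdev_unquiesce(&ex_bdev->bdev, &example_if, example_unquiesced_cb, ex_bdev);
 *	}
 *
 *	rc = spdk_bdev_quiesce(&ex_bdev->bdev, &example_if, example_quiesced_cb, ex_bdev);
 */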

int
spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
			     int array_size)
{
	if (!bdev) {
		return -EINVAL;
	}

	if (bdev->fn_table->get_memory_domains) {
		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
	}

	return 0;
}

struct spdk_bdev_for_each_io_ctx {
	void *ctx;
	spdk_bdev_io_fn fn;
	spdk_bdev_for_each_io_cb cb;
};

static void
bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
			 struct spdk_io_channel *io_ch, void *_ctx)
{
	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
	struct spdk_bdev_io *bdev_io;
	int rc = 0;

	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
		rc = ctx->fn(ctx->ctx, bdev_io);
		if (rc != 0) {
			break;
		}
	}

	spdk_bdev_for_each_channel_continue(i, rc);
}

static void
bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;

	ctx->cb(ctx->ctx, status);

	free(ctx);
}

void
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
			   spdk_bdev_for_each_io_cb cb)
{
	struct spdk_bdev_for_each_io_ctx *ctx;

	assert(fn != NULL && cb != NULL);

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		SPDK_ERRLOG("Failed to allocate context.\n");
		cb(_ctx, -ENOMEM);
		return;
	}

	ctx->ctx = _ctx;
	ctx->fn = fn;
	ctx->cb = cb;

	spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
				   bdev_for_each_io_done);
}

void
spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
{
	spdk_for_each_channel_continue(iter->i, status);
}

static struct spdk_bdev *
io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
{
	void *io_device = spdk_io_channel_iter_get_io_device(i);

	return __bdev_from_io_dev(io_device);
}

static void
bdev_each_channel_msg(struct spdk_io_channel_iter *i)
{
	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);

	iter->i = i;
	iter->fn(iter, bdev, ch, iter->ctx);
}

static void
bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);

	iter->i = i;
	iter->cpl(bdev, iter->ctx, status);

	free(iter);
}

void
spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn,
			   void *ctx, spdk_bdev_for_each_channel_done cpl)
{
	struct spdk_bdev_channel_iter *iter;

	assert(bdev != NULL && fn != NULL && ctx != NULL);

	iter = calloc(1, sizeof(struct spdk_bdev_channel_iter));
	if (iter == NULL) {
		SPDK_ERRLOG("Unable to allocate iterator\n");
		assert(false);
		return;
	}

	iter->fn = fn;
	iter->cpl = cpl;
	iter->ctx = ctx;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg,
			      iter, bdev_each_channel_cpl);
}
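
/*
 * Usage sketch: spdk_bdev_for_each_channel() runs `fn` once on the thread owning
 * each open channel of the bdev; `fn` must finish by calling
 * spdk_bdev_for_each_channel_continue(), and `cpl` runs once the iteration completes
 * (or is cut short by a non-zero status).  The counter below is hypothetical and is
 * only safe to update without locking because the iterator visits channels one at a
 * time.
 *
 *	static void
 *	example_count_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
 *			      struct spdk_io_channel *ch, void *ctx)
 *	{
 *		uint32_t *num_channels = ctx;
 *
 *		(*num_channels)++;
 *		spdk_bdev_for_each_channel_continue(i, 0);
 *	}
 *
 *	static void
 *	example_count_done(struct spdk_bdev *bdev, void *ctx, int status)
 *	{
 *		uint32_t *num_channels = ctx;
 *
 *		SPDK_NOTICELOG("bdev %s has %u channels\n", spdk_bdev_get_name(bdev), *num_channels);
 *	}
 *
 *	spdk_bdev_for_each_channel(bdev, example_count_channel, &num_channels, example_count_done);
 */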

static void
bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	/* Check return status of write */
	parent_io->internal.status = success ?
				     SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
}

static void
bdev_copy_do_write(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	int rc;

	/* Write blocks */
	rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc,
					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
					    bdev_io->u.bdev.iovs[0].iov_base,
					    bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks,
					    bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io);

	if (rc == -ENOMEM) {
		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write);
	} else if (rc != 0) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static void
bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	/* Check return status of read */
	if (!success) {
		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
		return;
	}

	/* Do write */
	bdev_copy_do_write(parent_io);
}

static void
bdev_copy_do_read(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	int rc;

	/* Read blocks */
	rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc,
					   spdk_io_channel_from_ctx(bdev_io->internal.ch),
					   bdev_io->u.bdev.iovs[0].iov_base,
					   bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks,
					   bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io);

	if (rc == -ENOMEM) {
		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read);
	} else if (rc != 0) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static void
bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
{
	if (!success) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
		return;
	}

	bdev_copy_do_read(bdev_io);
}

int
spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		      uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
		      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (num_blocks == 0) {
		SPDK_ERRLOG("Can't copy 0 blocks\n");
		return -EINVAL;
	}

	if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
	    !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
		SPDK_DEBUGLOG(bdev,
			      "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n",
			      dst_offset_blocks, src_offset_blocks, num_blocks);
		return -EINVAL;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;

	bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
	bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.memory_domain = NULL;
	bdev_io->u.bdev.memory_domain_ctx = NULL;
	bdev_io->u.bdev.iovs = NULL;
	bdev_io->u.bdev.iovcnt = 0;
	bdev_io->u.bdev.md_buf = NULL;
	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	if (dst_offset_blocks == src_offset_blocks) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
		bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);

		return 0;
	}

	/* If the copy size is large and should be split, use the generic split logic
	 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not.
	 *
	 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or
	 * emulate it using regular read and write requests otherwise.
	 */
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) ||
	    bdev_io->internal.split) {
		bdev_io_submit(bdev_io);
		return 0;
	}

	spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev));

	return 0;
}
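
/*
 * Usage sketch: copy 16 blocks within one bdev, from block 0 to block 1024.  The
 * descriptor must be open for writing, and the completion callback is responsible
 * for freeing the bdev_io.  On -ENOMEM the caller can retry later, for example via
 * spdk_bdev_queue_io_wait().  Names other than the public API are hypothetical.
 *
 *	static void
 *	example_copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		if (!success) {
 *			SPDK_ERRLOG("copy failed\n");
 *		}
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	rc = spdk_bdev_copy_blocks(desc, io_ch, 1024, 0, 16, example_copy_done, NULL);
 */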

SPDK_LOG_REGISTER_COMPONENT(bdev)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_IO_START", TRACE_BDEV_IO_START,
			OWNER_BDEV, OBJECT_BDEV_IO, 1,
			{
				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 }
			}
		},
		{
			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
			OWNER_BDEV, OBJECT_BDEV_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
			OWNER_BDEV, OBJECT_NONE, 1,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
		{
			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
			OWNER_BDEV, OBJECT_NONE, 0,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
	};

	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
}