1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/accel.h" 12 #include "spdk/config.h" 13 #include "spdk/env.h" 14 #include "spdk/thread.h" 15 #include "spdk/likely.h" 16 #include "spdk/queue.h" 17 #include "spdk/nvme_spec.h" 18 #include "spdk/scsi_spec.h" 19 #include "spdk/notify.h" 20 #include "spdk/util.h" 21 #include "spdk/trace.h" 22 #include "spdk/dma.h" 23 24 #include "spdk/bdev_module.h" 25 #include "spdk/log.h" 26 #include "spdk/string.h" 27 28 #include "bdev_internal.h" 29 #include "spdk_internal/trace_defs.h" 30 #include "spdk_internal/assert.h" 31 32 #ifdef SPDK_CONFIG_VTUNE 33 #include "ittnotify.h" 34 #include "ittnotify_types.h" 35 int __itt_init_ittlib(const char *, __itt_group_id); 36 #endif 37 38 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 39 #define SPDK_BDEV_IO_CACHE_SIZE 256 40 #define SPDK_BDEV_AUTO_EXAMINE true 41 #define BUF_SMALL_POOL_SIZE 8191 42 #define BUF_LARGE_POOL_SIZE 1023 43 #define BUF_SMALL_CACHE_SIZE 128 44 #define BUF_LARGE_CACHE_SIZE 16 45 #define NOMEM_THRESHOLD_COUNT 8 46 47 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 48 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 49 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 50 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 51 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 52 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 53 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 54 55 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 56 * when splitting into children requests at a time. 57 */ 58 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 59 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 60 61 /* The maximum number of children requests for a COPY command 62 * when splitting into children requests at a time. 
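 * Like SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS above, this bounds how many child
 * requests a split COPY keeps outstanding at once.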
63 */ 64 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 65 66 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 67 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 68 #ifdef DEBUG 69 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 70 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 71 #else 72 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 73 #endif 74 75 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 76 const char *detail, struct spdk_bdev *bdev); 77 78 SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "v23.09", 0); 79 80 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 81 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 82 }; 83 84 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 85 86 RB_HEAD(bdev_name_tree, spdk_bdev_name); 87 88 static int 89 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 90 { 91 return strcmp(name1->name, name2->name); 92 } 93 94 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 95 96 struct spdk_bdev_mgr { 97 struct spdk_mempool *bdev_io_pool; 98 99 void *zero_buffer; 100 101 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 102 103 struct spdk_bdev_list bdevs; 104 struct bdev_name_tree bdev_names; 105 106 bool init_complete; 107 bool module_init_complete; 108 109 struct spdk_spinlock spinlock; 110 111 TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens; 112 113 #ifdef SPDK_CONFIG_VTUNE 114 __itt_domain *domain; 115 #endif 116 }; 117 118 static struct spdk_bdev_mgr g_bdev_mgr = { 119 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 120 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 121 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 122 .init_complete = false, 123 .module_init_complete = false, 124 .async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens), 125 }; 126 127 static void 128 __attribute__((constructor)) 129 _bdev_init(void) 130 { 131 spdk_spin_init(&g_bdev_mgr.spinlock); 132 } 133 134 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status); 135 136 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 137 138 struct lba_range { 139 struct spdk_bdev *bdev; 140 uint64_t offset; 141 uint64_t length; 142 void *locked_ctx; 143 struct spdk_thread *owner_thread; 144 struct spdk_bdev_channel *owner_ch; 145 TAILQ_ENTRY(lba_range) tailq; 146 TAILQ_ENTRY(lba_range) tailq_module; 147 }; 148 149 static struct spdk_bdev_opts g_bdev_opts = { 150 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 151 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 152 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 153 }; 154 155 static spdk_bdev_init_cb g_init_cb_fn = NULL; 156 static void *g_init_cb_arg = NULL; 157 158 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 159 static void *g_fini_cb_arg = NULL; 160 static struct spdk_thread *g_fini_thread = NULL; 161 162 struct spdk_bdev_qos_limit { 163 /** IOs or bytes allowed per second (i.e., 1s). */ 164 uint64_t limit; 165 166 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 167 * For remaining bytes, allowed to run negative if an I/O is submitted when 168 * some bytes are remaining, but the I/O is bigger than that amount. The 169 * excess will be deducted from the next timeslice. 170 */ 171 int64_t remaining_this_timeslice; 172 173 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). 
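	 * Normally derived from the SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE and
	 * SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE defines above, so that even a very small rate
	 * limit still makes some forward progress in every timeslice.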
*/ 174 uint32_t min_per_timeslice; 175 176 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 177 uint32_t max_per_timeslice; 178 179 /** Function to check whether to queue the IO. */ 180 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 181 182 /** Function to update for the submitted IO. */ 183 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 184 }; 185 186 struct spdk_bdev_qos { 187 /** Types of structure of rate limits. */ 188 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 189 190 /** The channel that all I/O are funneled through. */ 191 struct spdk_bdev_channel *ch; 192 193 /** The thread on which the poller is running. */ 194 struct spdk_thread *thread; 195 196 /** Queue of I/O waiting to be issued. */ 197 bdev_io_tailq_t queued; 198 199 /** Size of a timeslice in tsc ticks. */ 200 uint64_t timeslice_size; 201 202 /** Timestamp of start of last timeslice. */ 203 uint64_t last_timeslice; 204 205 /** Poller that processes queued I/O commands each time slice. */ 206 struct spdk_poller *poller; 207 }; 208 209 struct spdk_bdev_mgmt_channel { 210 /* 211 * Each thread keeps a cache of bdev_io - this allows 212 * bdev threads which are *not* DPDK threads to still 213 * benefit from a per-thread bdev_io cache. Without 214 * this, non-DPDK threads fetching from the mempool 215 * incur a cmpxchg on get and put. 216 */ 217 bdev_io_stailq_t per_thread_cache; 218 uint32_t per_thread_cache_count; 219 uint32_t bdev_io_cache_size; 220 221 struct spdk_iobuf_channel iobuf; 222 223 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 224 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 225 }; 226 227 /* 228 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 229 * will queue here their IO that awaits retry. It makes it possible to retry sending 230 * IO to one bdev after IO from other bdev completes. 231 */ 232 struct spdk_bdev_shared_resource { 233 /* The bdev management channel */ 234 struct spdk_bdev_mgmt_channel *mgmt_ch; 235 236 /* 237 * Count of I/O submitted to bdev module and waiting for completion. 238 * Incremented before submit_request() is called on an spdk_bdev_io. 239 */ 240 uint64_t io_outstanding; 241 242 /* 243 * Queue of IO awaiting retry because of a previous NOMEM status returned 244 * on this channel. 245 */ 246 bdev_io_tailq_t nomem_io; 247 248 /* 249 * Threshold which io_outstanding must drop to before retrying nomem_io. 250 */ 251 uint64_t nomem_threshold; 252 253 /* I/O channel allocated by a bdev module */ 254 struct spdk_io_channel *shared_ch; 255 256 /* Refcount of bdev channels using this resource */ 257 uint32_t ref; 258 259 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 260 }; 261 262 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 263 #define BDEV_CH_QOS_ENABLED (1 << 1) 264 265 struct spdk_bdev_channel { 266 struct spdk_bdev *bdev; 267 268 /* The channel for the underlying device */ 269 struct spdk_io_channel *channel; 270 271 /* Accel channel */ 272 struct spdk_io_channel *accel_channel; 273 274 /* Per io_device per thread data */ 275 struct spdk_bdev_shared_resource *shared_resource; 276 277 struct spdk_bdev_io_stat *stat; 278 279 /* 280 * Count of I/O submitted to the underlying dev module through this channel 281 * and waiting for completion. 282 */ 283 uint64_t io_outstanding; 284 285 /* 286 * List of all submitted I/Os including I/O that are generated via splitting. 
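	 * Kept per channel so that abort and I/O timeout handling can walk every request that
	 * is still in flight on this channel.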
287 */ 288 bdev_io_tailq_t io_submitted; 289 290 /* 291 * List of spdk_bdev_io that are currently queued because they write to a locked 292 * LBA range. 293 */ 294 bdev_io_tailq_t io_locked; 295 296 /* List of I/Os with accel sequence being currently executed */ 297 bdev_io_tailq_t io_accel_exec; 298 299 /* List of I/Os doing memory domain pull/push */ 300 bdev_io_tailq_t io_memory_domain; 301 302 uint32_t flags; 303 304 struct spdk_histogram_data *histogram; 305 306 #ifdef SPDK_CONFIG_VTUNE 307 uint64_t start_tsc; 308 uint64_t interval_tsc; 309 __itt_string_handle *handle; 310 struct spdk_bdev_io_stat *prev_stat; 311 #endif 312 313 bdev_io_tailq_t queued_resets; 314 315 lba_range_tailq_t locked_ranges; 316 }; 317 318 struct media_event_entry { 319 struct spdk_bdev_media_event event; 320 TAILQ_ENTRY(media_event_entry) tailq; 321 }; 322 323 #define MEDIA_EVENT_POOL_SIZE 64 324 325 struct spdk_bdev_desc { 326 struct spdk_bdev *bdev; 327 struct spdk_thread *thread; 328 struct { 329 spdk_bdev_event_cb_t event_fn; 330 void *ctx; 331 } callback; 332 bool closed; 333 bool write; 334 bool memory_domains_supported; 335 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 336 struct spdk_spinlock spinlock; 337 uint32_t refs; 338 TAILQ_HEAD(, media_event_entry) pending_media_events; 339 TAILQ_HEAD(, media_event_entry) free_media_events; 340 struct media_event_entry *media_events_buffer; 341 TAILQ_ENTRY(spdk_bdev_desc) link; 342 343 uint64_t timeout_in_sec; 344 spdk_bdev_io_timeout_cb cb_fn; 345 void *cb_arg; 346 struct spdk_poller *io_timeout_poller; 347 struct spdk_bdev_module_claim *claim; 348 }; 349 350 struct spdk_bdev_iostat_ctx { 351 struct spdk_bdev_io_stat *stat; 352 spdk_bdev_get_device_stat_cb cb; 353 void *cb_arg; 354 }; 355 356 struct set_qos_limit_ctx { 357 void (*cb_fn)(void *cb_arg, int status); 358 void *cb_arg; 359 struct spdk_bdev *bdev; 360 }; 361 362 struct spdk_bdev_channel_iter { 363 spdk_bdev_for_each_channel_msg fn; 364 spdk_bdev_for_each_channel_done cpl; 365 struct spdk_io_channel_iter *i; 366 void *ctx; 367 }; 368 369 struct spdk_bdev_io_error_stat { 370 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 371 }; 372 373 enum bdev_io_retry_state { 374 BDEV_IO_RETRY_STATE_INVALID, 375 BDEV_IO_RETRY_STATE_PULL, 376 BDEV_IO_RETRY_STATE_PULL_MD, 377 BDEV_IO_RETRY_STATE_SUBMIT, 378 BDEV_IO_RETRY_STATE_PUSH, 379 BDEV_IO_RETRY_STATE_PUSH_MD, 380 }; 381 382 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 383 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 384 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 385 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 386 387 static inline void bdev_io_complete(void *ctx); 388 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 389 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io); 390 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io); 391 392 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 393 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io); 394 395 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 396 struct spdk_io_channel *ch, void *_ctx); 397 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 398 399 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 400 struct 
iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 401 uint64_t num_blocks, 402 struct spdk_memory_domain *domain, void *domain_ctx, 403 struct spdk_accel_sequence *seq, 404 spdk_bdev_io_completion_cb cb, void *cb_arg); 405 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 406 struct iovec *iov, int iovcnt, void *md_buf, 407 uint64_t offset_blocks, uint64_t num_blocks, 408 struct spdk_memory_domain *domain, void *domain_ctx, 409 struct spdk_accel_sequence *seq, 410 spdk_bdev_io_completion_cb cb, void *cb_arg); 411 412 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 413 uint64_t offset, uint64_t length, 414 lock_range_cb cb_fn, void *cb_arg); 415 416 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 417 uint64_t offset, uint64_t length, 418 lock_range_cb cb_fn, void *cb_arg); 419 420 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 421 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort); 422 423 static bool claim_type_is_v2(enum spdk_bdev_claim_type type); 424 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc); 425 static void claim_reset(struct spdk_bdev *bdev); 426 427 static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch); 428 429 #define bdev_get_ext_io_opt(opts, field, defval) \ 430 (((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \ 431 sizeof((opts)->field) <= (opts)->size) ? (opts)->field : (defval)) 432 433 void 434 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 435 { 436 if (!opts) { 437 SPDK_ERRLOG("opts should not be NULL\n"); 438 return; 439 } 440 441 if (!opts_size) { 442 SPDK_ERRLOG("opts_size should not be zero value\n"); 443 return; 444 } 445 446 opts->opts_size = opts_size; 447 448 #define SET_FIELD(field) \ 449 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 450 opts->field = g_bdev_opts.field; \ 451 } \ 452 453 SET_FIELD(bdev_io_pool_size); 454 SET_FIELD(bdev_io_cache_size); 455 SET_FIELD(bdev_auto_examine); 456 457 /* Do not remove this statement, you should always update this statement when you adding a new field, 458 * and do not forget to add the SET_FIELD statement for your added field. */ 459 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 460 461 #undef SET_FIELD 462 } 463 464 int 465 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 466 { 467 uint32_t min_pool_size; 468 469 if (!opts) { 470 SPDK_ERRLOG("opts cannot be NULL\n"); 471 return -1; 472 } 473 474 if (!opts->opts_size) { 475 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 476 return -1; 477 } 478 479 /* 480 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 481 * initialization. A second mgmt_ch will be created on the same thread when the application starts 482 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
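	 * The pool therefore has to hold at least bdev_io_cache_size * (thread count + 1)
	 * entries; smaller values are rejected below.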
483 */ 484 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 485 if (opts->bdev_io_pool_size < min_pool_size) { 486 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 487 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 488 spdk_thread_get_count()); 489 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 490 return -1; 491 } 492 493 #define SET_FIELD(field) \ 494 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 495 g_bdev_opts.field = opts->field; \ 496 } \ 497 498 SET_FIELD(bdev_io_pool_size); 499 SET_FIELD(bdev_io_cache_size); 500 SET_FIELD(bdev_auto_examine); 501 502 g_bdev_opts.opts_size = opts->opts_size; 503 504 #undef SET_FIELD 505 506 return 0; 507 } 508 509 static struct spdk_bdev * 510 bdev_get_by_name(const char *bdev_name) 511 { 512 struct spdk_bdev_name find; 513 struct spdk_bdev_name *res; 514 515 find.name = (char *)bdev_name; 516 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 517 if (res != NULL) { 518 return res->bdev; 519 } 520 521 return NULL; 522 } 523 524 struct spdk_bdev * 525 spdk_bdev_get_by_name(const char *bdev_name) 526 { 527 struct spdk_bdev *bdev; 528 529 spdk_spin_lock(&g_bdev_mgr.spinlock); 530 bdev = bdev_get_by_name(bdev_name); 531 spdk_spin_unlock(&g_bdev_mgr.spinlock); 532 533 return bdev; 534 } 535 536 struct bdev_io_status_string { 537 enum spdk_bdev_io_status status; 538 const char *str; 539 }; 540 541 static const struct bdev_io_status_string bdev_io_status_strings[] = { 542 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 543 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 544 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 545 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 546 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 547 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 548 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 549 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 550 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 551 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 552 }; 553 554 static const char * 555 bdev_io_status_get_string(enum spdk_bdev_io_status status) 556 { 557 uint32_t i; 558 559 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 560 if (bdev_io_status_strings[i].status == status) { 561 return bdev_io_status_strings[i].str; 562 } 563 } 564 565 return "reserved"; 566 } 567 568 struct spdk_bdev_wait_for_examine_ctx { 569 struct spdk_poller *poller; 570 spdk_bdev_wait_for_examine_cb cb_fn; 571 void *cb_arg; 572 }; 573 574 static bool bdev_module_all_actions_completed(void); 575 576 static int 577 bdev_wait_for_examine_cb(void *arg) 578 { 579 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 580 581 if (!bdev_module_all_actions_completed()) { 582 return SPDK_POLLER_IDLE; 583 } 584 585 spdk_poller_unregister(&ctx->poller); 586 ctx->cb_fn(ctx->cb_arg); 587 free(ctx); 588 589 return SPDK_POLLER_BUSY; 590 } 591 592 int 593 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 594 { 595 struct spdk_bdev_wait_for_examine_ctx *ctx; 596 597 ctx = calloc(1, sizeof(*ctx)); 598 if (ctx == NULL) { 599 return -ENOMEM; 600 } 601 ctx->cb_fn = cb_fn; 602 ctx->cb_arg = cb_arg; 603 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 604 605 return 0; 606 } 607 608 struct spdk_bdev_examine_item { 609 char *name; 610 TAILQ_ENTRY(spdk_bdev_examine_item) link; 611 }; 612 613 TAILQ_HEAD(spdk_bdev_examine_allowlist, 
spdk_bdev_examine_item); 614 615 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 616 g_bdev_examine_allowlist); 617 618 static inline bool 619 bdev_examine_allowlist_check(const char *name) 620 { 621 struct spdk_bdev_examine_item *item; 622 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 623 if (strcmp(name, item->name) == 0) { 624 return true; 625 } 626 } 627 return false; 628 } 629 630 static inline void 631 bdev_examine_allowlist_free(void) 632 { 633 struct spdk_bdev_examine_item *item; 634 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 635 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 636 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 637 free(item->name); 638 free(item); 639 } 640 } 641 642 static inline bool 643 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 644 { 645 struct spdk_bdev_alias *tmp; 646 if (bdev_examine_allowlist_check(bdev->name)) { 647 return true; 648 } 649 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 650 if (bdev_examine_allowlist_check(tmp->alias.name)) { 651 return true; 652 } 653 } 654 return false; 655 } 656 657 static inline bool 658 bdev_ok_to_examine(struct spdk_bdev *bdev) 659 { 660 if (g_bdev_opts.bdev_auto_examine) { 661 return true; 662 } else { 663 return bdev_in_examine_allowlist(bdev); 664 } 665 } 666 667 static void 668 bdev_examine(struct spdk_bdev *bdev) 669 { 670 struct spdk_bdev_module *module; 671 struct spdk_bdev_module_claim *claim, *tmpclaim; 672 uint32_t action; 673 674 if (!bdev_ok_to_examine(bdev)) { 675 return; 676 } 677 678 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 679 if (module->examine_config) { 680 spdk_spin_lock(&module->internal.spinlock); 681 action = module->internal.action_in_progress; 682 module->internal.action_in_progress++; 683 spdk_spin_unlock(&module->internal.spinlock); 684 module->examine_config(bdev); 685 if (action != module->internal.action_in_progress) { 686 SPDK_ERRLOG("examine_config for module %s did not call " 687 "spdk_bdev_module_examine_done()\n", module->name); 688 } 689 } 690 } 691 692 spdk_spin_lock(&bdev->internal.spinlock); 693 694 switch (bdev->internal.claim_type) { 695 case SPDK_BDEV_CLAIM_NONE: 696 /* Examine by all bdev modules */ 697 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 698 if (module->examine_disk) { 699 spdk_spin_lock(&module->internal.spinlock); 700 module->internal.action_in_progress++; 701 spdk_spin_unlock(&module->internal.spinlock); 702 spdk_spin_unlock(&bdev->internal.spinlock); 703 module->examine_disk(bdev); 704 spdk_spin_lock(&bdev->internal.spinlock); 705 } 706 } 707 break; 708 case SPDK_BDEV_CLAIM_EXCL_WRITE: 709 /* Examine by the one bdev module with a v1 claim */ 710 module = bdev->internal.claim.v1.module; 711 if (module->examine_disk) { 712 spdk_spin_lock(&module->internal.spinlock); 713 module->internal.action_in_progress++; 714 spdk_spin_unlock(&module->internal.spinlock); 715 spdk_spin_unlock(&bdev->internal.spinlock); 716 module->examine_disk(bdev); 717 return; 718 } 719 break; 720 default: 721 /* Examine by all bdev modules with a v2 claim */ 722 assert(claim_type_is_v2(bdev->internal.claim_type)); 723 /* 724 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 725 * list, perhaps accessing freed memory. Without protection, this could happen 726 * while the lock is dropped during the examine callback. 
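		 * examine_in_progress defers that removal: claims released while examine_disk runs are
		 * left on the list with desc == NULL and are reaped only after the last concurrent pass
		 * finishes below.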
727 */ 728 bdev->internal.examine_in_progress++; 729 730 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 731 module = claim->module; 732 733 if (module == NULL) { 734 /* This is a vestigial claim, held by examine_count */ 735 continue; 736 } 737 738 if (module->examine_disk == NULL) { 739 continue; 740 } 741 742 spdk_spin_lock(&module->internal.spinlock); 743 module->internal.action_in_progress++; 744 spdk_spin_unlock(&module->internal.spinlock); 745 746 /* Call examine_disk without holding internal.spinlock. */ 747 spdk_spin_unlock(&bdev->internal.spinlock); 748 module->examine_disk(bdev); 749 spdk_spin_lock(&bdev->internal.spinlock); 750 } 751 752 assert(bdev->internal.examine_in_progress > 0); 753 bdev->internal.examine_in_progress--; 754 if (bdev->internal.examine_in_progress == 0) { 755 /* Remove any claims that were released during examine_disk */ 756 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 757 if (claim->desc != NULL) { 758 continue; 759 } 760 761 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 762 free(claim); 763 } 764 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 765 claim_reset(bdev); 766 } 767 } 768 } 769 770 spdk_spin_unlock(&bdev->internal.spinlock); 771 } 772 773 int 774 spdk_bdev_examine(const char *name) 775 { 776 struct spdk_bdev *bdev; 777 struct spdk_bdev_examine_item *item; 778 struct spdk_thread *thread = spdk_get_thread(); 779 780 if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) { 781 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 782 thread ? spdk_thread_get_name(thread) : "null"); 783 return -EINVAL; 784 } 785 786 if (g_bdev_opts.bdev_auto_examine) { 787 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 788 return -EINVAL; 789 } 790 791 if (bdev_examine_allowlist_check(name)) { 792 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 793 return -EEXIST; 794 } 795 796 item = calloc(1, sizeof(*item)); 797 if (!item) { 798 return -ENOMEM; 799 } 800 item->name = strdup(name); 801 if (!item->name) { 802 free(item); 803 return -ENOMEM; 804 } 805 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 806 807 bdev = spdk_bdev_get_by_name(name); 808 if (bdev) { 809 bdev_examine(bdev); 810 } 811 return 0; 812 } 813 814 static inline void 815 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 816 { 817 struct spdk_bdev_examine_item *item; 818 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 819 spdk_json_write_object_begin(w); 820 spdk_json_write_named_string(w, "method", "bdev_examine"); 821 spdk_json_write_named_object_begin(w, "params"); 822 spdk_json_write_named_string(w, "name", item->name); 823 spdk_json_write_object_end(w); 824 spdk_json_write_object_end(w); 825 } 826 } 827 828 struct spdk_bdev * 829 spdk_bdev_first(void) 830 { 831 struct spdk_bdev *bdev; 832 833 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 834 if (bdev) { 835 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 836 } 837 838 return bdev; 839 } 840 841 struct spdk_bdev * 842 spdk_bdev_next(struct spdk_bdev *prev) 843 { 844 struct spdk_bdev *bdev; 845 846 bdev = TAILQ_NEXT(prev, internal.link); 847 if (bdev) { 848 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 849 } 850 851 return bdev; 852 } 853 854 static struct spdk_bdev * 855 _bdev_next_leaf(struct spdk_bdev *bdev) 856 { 857 while (bdev != NULL) { 858 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 859 return bdev; 860 } else { 861 bdev = 
TAILQ_NEXT(bdev, internal.link); 862 } 863 } 864 865 return bdev; 866 } 867 868 struct spdk_bdev * 869 spdk_bdev_first_leaf(void) 870 { 871 struct spdk_bdev *bdev; 872 873 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 874 875 if (bdev) { 876 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 877 } 878 879 return bdev; 880 } 881 882 struct spdk_bdev * 883 spdk_bdev_next_leaf(struct spdk_bdev *prev) 884 { 885 struct spdk_bdev *bdev; 886 887 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 888 889 if (bdev) { 890 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 891 } 892 893 return bdev; 894 } 895 896 static inline bool 897 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 898 { 899 return bdev_io->internal.memory_domain; 900 } 901 902 static inline bool 903 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 904 { 905 return bdev_io->internal.has_accel_sequence; 906 } 907 908 static inline void 909 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource, 910 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 911 { 912 /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io. 913 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth 914 * channels we will instead wait for half to complete. 915 */ 916 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 917 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 918 919 assert(state != BDEV_IO_RETRY_STATE_INVALID); 920 bdev_io->internal.retry_state = state; 921 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 922 } 923 924 static inline void 925 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource, 926 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 927 { 928 /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while 929 * the queue isn't empty, so we don't need to update the nomem_threshold here */ 930 assert(!TAILQ_EMPTY(&shared_resource->nomem_io)); 931 932 assert(state != BDEV_IO_RETRY_STATE_INVALID); 933 bdev_io->internal.retry_state = state; 934 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 935 } 936 937 void 938 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 939 { 940 struct iovec *iovs; 941 942 if (bdev_io->u.bdev.iovs == NULL) { 943 bdev_io->u.bdev.iovs = &bdev_io->iov; 944 bdev_io->u.bdev.iovcnt = 1; 945 } 946 947 iovs = bdev_io->u.bdev.iovs; 948 949 assert(iovs != NULL); 950 assert(bdev_io->u.bdev.iovcnt >= 1); 951 952 iovs[0].iov_base = buf; 953 iovs[0].iov_len = len; 954 } 955 956 void 957 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 958 { 959 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 960 bdev_io->u.bdev.md_buf = md_buf; 961 } 962 963 static bool 964 _is_buf_allocated(const struct iovec *iovs) 965 { 966 if (iovs == NULL) { 967 return false; 968 } 969 970 return iovs[0].iov_base != NULL; 971 } 972 973 static bool 974 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 975 { 976 int i; 977 uintptr_t iov_base; 978 979 if (spdk_likely(alignment == 1)) { 980 return true; 981 } 982 983 for (i = 0; i < iovcnt; i++) { 984 iov_base = (uintptr_t)iovs[i].iov_base; 985 if ((iov_base & (alignment - 1)) != 0) { 986 return false; 987 } 988 } 989 990 return true; 991 } 992 993 static inline bool 994 
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io->internal.accel_sequence) {
		return false;
	}

	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
	 * the bdev module didn't support accel sequences */
	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split;
}

static inline void
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
}

static inline void
bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static void
bdev_io_submit_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;

	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io->internal.accel_sequence = NULL;

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	bdev_io_submit(bdev_io);
}

static void
bdev_io_exec_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);

	/* Since the operations are appended during submission, they're in the opposite order than
	 * how we want to execute them for reads (i.e. we need to execute the most recently added
	 * operation first), so reverse the sequence before executing it.
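	 * For writes the appended order already matches the desired execution order, so only
	 * reads are reversed.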
1068 */ 1069 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1070 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1071 } 1072 1073 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1074 bdev_io_increment_outstanding(ch, ch->shared_resource); 1075 bdev_io->internal.data_transfer_cpl = cb_fn; 1076 1077 spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, 1078 bdev_io_exec_sequence_cb, bdev_io); 1079 } 1080 1081 static void 1082 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1083 { 1084 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1085 void *buf; 1086 1087 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1088 buf = bdev_io->internal.buf; 1089 bdev_io->internal.buf = NULL; 1090 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1091 bdev_io->internal.get_aux_buf_cb = NULL; 1092 } else { 1093 assert(bdev_io->internal.get_buf_cb != NULL); 1094 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1095 bdev_io->internal.get_buf_cb = NULL; 1096 } 1097 } 1098 1099 static void 1100 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1101 { 1102 struct spdk_bdev_io *bdev_io = ctx; 1103 1104 if (rc) { 1105 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1106 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1107 } 1108 bdev_io_get_buf_complete(bdev_io, !rc); 1109 } 1110 1111 static void 1112 bdev_io_pull_md_buf_done(void *ctx, int status) 1113 { 1114 struct spdk_bdev_io *bdev_io = ctx; 1115 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1116 1117 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1118 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1119 1120 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1121 bdev_ch_retry_io(ch); 1122 } 1123 1124 assert(bdev_io->internal.data_transfer_cpl); 1125 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1126 } 1127 1128 static void 1129 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io) 1130 { 1131 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1132 int rc = 0; 1133 1134 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1135 if (bdev_io_use_memory_domain(bdev_io)) { 1136 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1137 bdev_io_increment_outstanding(ch, ch->shared_resource); 1138 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1139 bdev_io->internal.memory_domain_ctx, 1140 &bdev_io->internal.orig_md_iov, 1, 1141 &bdev_io->internal.bounce_md_iov, 1, 1142 bdev_io_pull_md_buf_done, bdev_io); 1143 if (rc == 0) { 1144 /* Continue to submit IO in completion callback */ 1145 return; 1146 } 1147 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1148 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1149 if (rc != -ENOMEM) { 1150 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1151 spdk_memory_domain_get_dma_device_id( 1152 bdev_io->internal.memory_domain), rc); 1153 } 1154 } else { 1155 memcpy(bdev_io->internal.bounce_md_iov.iov_base, 1156 bdev_io->internal.orig_md_iov.iov_base, 1157 bdev_io->internal.orig_md_iov.iov_len); 1158 } 1159 } 1160 1161 if (spdk_unlikely(rc == -ENOMEM)) { 1162 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD); 1163 } else { 1164 assert(bdev_io->internal.data_transfer_cpl); 1165 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1166 } 1167 } 1168 1169 static void 1170 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1171 { 1172 /* save 
original md_buf */ 1173 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1174 bdev_io->internal.orig_md_iov.iov_len = len; 1175 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 1176 bdev_io->internal.bounce_md_iov.iov_len = len; 1177 /* set bounce md_buf */ 1178 bdev_io->u.bdev.md_buf = md_buf; 1179 1180 bdev_io_pull_md_buf(bdev_io); 1181 } 1182 1183 static void 1184 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1185 { 1186 struct spdk_bdev *bdev = bdev_io->bdev; 1187 uint64_t md_len; 1188 void *buf; 1189 1190 if (spdk_bdev_is_md_separate(bdev)) { 1191 assert(!bdev_io_use_accel_sequence(bdev_io)); 1192 1193 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1194 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1195 1196 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1197 1198 if (bdev_io->u.bdev.md_buf != NULL) { 1199 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1200 return; 1201 } else { 1202 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1203 } 1204 } 1205 1206 bdev_io_get_buf_complete(bdev_io, true); 1207 } 1208 1209 static inline void 1210 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc) 1211 { 1212 if (rc) { 1213 SPDK_ERRLOG("Failed to get data buffer\n"); 1214 assert(bdev_io->internal.data_transfer_cpl); 1215 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1216 return; 1217 } 1218 1219 _bdev_io_set_md_buf(bdev_io); 1220 } 1221 1222 static void 1223 bdev_io_pull_data_done_and_track(void *ctx, int status) 1224 { 1225 struct spdk_bdev_io *bdev_io = ctx; 1226 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1227 1228 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1229 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1230 1231 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1232 bdev_ch_retry_io(ch); 1233 } 1234 1235 bdev_io_pull_data_done(bdev_io, status); 1236 } 1237 1238 static void 1239 bdev_io_pull_data(struct spdk_bdev_io *bdev_io) 1240 { 1241 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1242 int rc = 0; 1243 1244 /* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a 1245 * sequence, append a copy operation making accel change the src/dst buffers of the previous 1246 * operation */ 1247 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) || 1248 (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) { 1249 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1250 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1251 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1252 NULL, NULL, 1253 bdev_io->internal.orig_iovs, 1254 bdev_io->internal.orig_iovcnt, 1255 bdev_io->internal.memory_domain, 1256 bdev_io->internal.memory_domain_ctx, 1257 0, NULL, NULL); 1258 } else { 1259 /* We need to reverse the src/dst for reads */ 1260 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1261 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1262 bdev_io->internal.orig_iovs, 1263 bdev_io->internal.orig_iovcnt, 1264 bdev_io->internal.memory_domain, 1265 bdev_io->internal.memory_domain_ctx, 1266 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1267 NULL, NULL, 0, NULL, NULL); 1268 } 1269 1270 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 1271 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1272 bdev_io->internal.accel_sequence); 1273 } 1274 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1275 /* if this is write path, 
copy data from original buffer to bounce buffer */ 1276 if (bdev_io_use_memory_domain(bdev_io)) { 1277 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1278 bdev_io_increment_outstanding(ch, ch->shared_resource); 1279 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1280 bdev_io->internal.memory_domain_ctx, 1281 bdev_io->internal.orig_iovs, 1282 (uint32_t) bdev_io->internal.orig_iovcnt, 1283 bdev_io->u.bdev.iovs, 1, 1284 bdev_io_pull_data_done_and_track, 1285 bdev_io); 1286 if (rc == 0) { 1287 /* Continue to submit IO in completion callback */ 1288 return; 1289 } 1290 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1291 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1292 if (rc != -ENOMEM) { 1293 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1294 spdk_memory_domain_get_dma_device_id( 1295 bdev_io->internal.memory_domain)); 1296 } 1297 } else { 1298 assert(bdev_io->u.bdev.iovcnt == 1); 1299 spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base, 1300 bdev_io->u.bdev.iovs[0].iov_len, 1301 bdev_io->internal.orig_iovs, 1302 bdev_io->internal.orig_iovcnt); 1303 } 1304 } 1305 1306 if (spdk_unlikely(rc == -ENOMEM)) { 1307 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1308 } else { 1309 bdev_io_pull_data_done(bdev_io, rc); 1310 } 1311 } 1312 1313 static void 1314 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1315 bdev_copy_bounce_buffer_cpl cpl_cb) 1316 { 1317 struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource; 1318 1319 bdev_io->internal.data_transfer_cpl = cpl_cb; 1320 /* save original iovec */ 1321 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 1322 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1323 /* set bounce iov */ 1324 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 1325 bdev_io->u.bdev.iovcnt = 1; 1326 /* set bounce buffer for this operation */ 1327 bdev_io->u.bdev.iovs[0].iov_base = buf; 1328 bdev_io->u.bdev.iovs[0].iov_len = len; 1329 1330 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1331 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1332 } else { 1333 bdev_io_pull_data(bdev_io); 1334 } 1335 } 1336 1337 static void 1338 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1339 { 1340 struct spdk_bdev *bdev = bdev_io->bdev; 1341 bool buf_allocated; 1342 uint64_t alignment; 1343 void *aligned_buf; 1344 1345 bdev_io->internal.buf = buf; 1346 1347 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1348 bdev_io_get_buf_complete(bdev_io, true); 1349 return; 1350 } 1351 1352 alignment = spdk_bdev_get_buf_align(bdev); 1353 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1354 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1355 1356 if (buf_allocated) { 1357 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1358 /* Continue in completion callback */ 1359 return; 1360 } else { 1361 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1362 } 1363 1364 _bdev_io_set_md_buf(bdev_io); 1365 } 1366 1367 static inline uint64_t 1368 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1369 { 1370 struct spdk_bdev *bdev = bdev_io->bdev; 1371 uint64_t md_len, alignment; 1372 1373 md_len = spdk_bdev_is_md_separate(bdev) ? 
		  bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	/* 1-byte alignment needs 0 bytes of extra space, 64-byte alignment needs 63 bytes of extra space, etc. */
	alignment = spdk_bdev_get_buf_align(bdev) - 1;

	return len + alignment + md_len;
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.buf != NULL);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
	bdev_io->internal.buf = NULL;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
		    struct spdk_bdev_io *bdev_io)
{
	/* After a request is submitted to a bdev module, the ownership of an accel sequence
	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
	 * sequence pointer to make sure we won't touch it anymore. */
	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
		bdev_io->internal.accel_sequence = NULL;
	}

	bdev->fn_table->submit_request(ioch, bdev_io);
}

static inline void
bdev_ch_resubmit_io(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;

	bdev_io_increment_outstanding(bdev_io->internal.ch, bdev_ch->shared_resource);
	bdev_io->internal.error.nvme.cdw0 = 0;
	bdev_io->num_retries++;
	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
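		 * The threshold itself is set in bdev_queue_nomem_io_head(): roughly
		 * NOMEM_THRESHOLD_COUNT below the current outstanding count, or half of it for
		 * low queue depth channels.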
1448 */ 1449 return; 1450 } 1451 1452 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1453 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1454 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1455 1456 switch (bdev_io->internal.retry_state) { 1457 case BDEV_IO_RETRY_STATE_SUBMIT: 1458 bdev_ch_resubmit_io(bdev_ch, bdev_io); 1459 break; 1460 case BDEV_IO_RETRY_STATE_PULL: 1461 bdev_io_pull_data(bdev_io); 1462 break; 1463 case BDEV_IO_RETRY_STATE_PULL_MD: 1464 bdev_io_pull_md_buf(bdev_io); 1465 break; 1466 case BDEV_IO_RETRY_STATE_PUSH: 1467 bdev_io_push_bounce_data(bdev_io); 1468 break; 1469 case BDEV_IO_RETRY_STATE_PUSH_MD: 1470 bdev_io_push_bounce_md_buf(bdev_io); 1471 break; 1472 default: 1473 assert(0 && "invalid retry state"); 1474 break; 1475 } 1476 1477 if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) { 1478 /* This IO completed again with NOMEM status, so break the loop and 1479 * don't try anymore. Note that a bdev_io that fails with NOMEM 1480 * always gets requeued at the front of the list, to maintain 1481 * ordering. 1482 */ 1483 break; 1484 } 1485 } 1486 } 1487 1488 static inline bool 1489 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 1490 { 1491 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1492 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1493 1494 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1495 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1496 bdev_queue_nomem_io_head(shared_resource, bdev_io, state); 1497 1498 /* If bdev module completed an I/O that has an accel sequence with NOMEM status, the 1499 * ownership of that sequence is transferred back to the bdev layer, so we need to 1500 * restore internal.accel_sequence to make sure that the sequence is handled 1501 * correctly in case the I/O is later aborted. */ 1502 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1503 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1504 assert(bdev_io->internal.accel_sequence == NULL); 1505 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1506 } 1507 1508 return true; 1509 } 1510 1511 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1512 bdev_ch_retry_io(bdev_ch); 1513 } 1514 1515 return false; 1516 } 1517 1518 static void 1519 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1520 { 1521 struct spdk_bdev_io *bdev_io = ctx; 1522 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1523 1524 if (rc) { 1525 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1526 } 1527 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1528 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 
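	 * Returning it to the iobuf pool right away also lets other requests that are waiting
	 * on spdk_iobuf_get() be satisfied sooner.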
1529 */ 1530 bdev_io_put_buf(bdev_io); 1531 1532 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1533 bdev_ch_retry_io(ch); 1534 } 1535 1536 /* Continue with IO completion flow */ 1537 bdev_io_complete(bdev_io); 1538 } 1539 1540 static void 1541 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1542 { 1543 struct spdk_bdev_io *bdev_io = ctx; 1544 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1545 1546 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1547 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1548 1549 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1550 bdev_ch_retry_io(ch); 1551 } 1552 1553 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1554 } 1555 1556 static inline void 1557 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1558 { 1559 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1560 int rc = 0; 1561 1562 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1563 /* do the same for metadata buffer */ 1564 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1565 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1566 1567 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1568 if (bdev_io_use_memory_domain(bdev_io)) { 1569 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1570 bdev_io_increment_outstanding(ch, ch->shared_resource); 1571 /* If memory domain is used then we need to call async push function */ 1572 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1573 bdev_io->internal.memory_domain_ctx, 1574 &bdev_io->internal.orig_md_iov, 1575 (uint32_t)bdev_io->internal.orig_iovcnt, 1576 &bdev_io->internal.bounce_md_iov, 1, 1577 bdev_io_push_bounce_md_buf_done, 1578 bdev_io); 1579 if (rc == 0) { 1580 /* Continue IO completion in async callback */ 1581 return; 1582 } 1583 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1584 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1585 if (rc != -ENOMEM) { 1586 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1587 spdk_memory_domain_get_dma_device_id( 1588 bdev_io->internal.memory_domain)); 1589 } 1590 } else { 1591 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1592 bdev_io->internal.orig_md_iov.iov_len); 1593 } 1594 } 1595 } 1596 1597 if (spdk_unlikely(rc == -ENOMEM)) { 1598 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1599 } else { 1600 assert(bdev_io->internal.data_transfer_cpl); 1601 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1602 } 1603 } 1604 1605 static inline void 1606 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc) 1607 { 1608 assert(bdev_io->internal.data_transfer_cpl); 1609 if (rc) { 1610 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1611 return; 1612 } 1613 1614 /* set original buffer for this io */ 1615 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1616 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1617 /* disable bouncing buffer for this io */ 1618 bdev_io->internal.orig_iovcnt = 0; 1619 bdev_io->internal.orig_iovs = NULL; 1620 1621 bdev_io_push_bounce_md_buf(bdev_io); 1622 } 1623 1624 static void 1625 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1626 { 1627 struct spdk_bdev_io *bdev_io = ctx; 1628 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1629 1630 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1631 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1632 1633 if 
(spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1634 bdev_ch_retry_io(ch); 1635 } 1636 1637 bdev_io_push_bounce_data_done(bdev_io, status); 1638 } 1639 1640 static inline void 1641 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1642 { 1643 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1644 int rc = 0; 1645 1646 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1647 assert(!bdev_io_use_accel_sequence(bdev_io)); 1648 1649 /* if this is read path, copy data from bounce buffer to original buffer */ 1650 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1651 if (bdev_io_use_memory_domain(bdev_io)) { 1652 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1653 bdev_io_increment_outstanding(ch, ch->shared_resource); 1654 /* If memory domain is used then we need to call async push function */ 1655 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1656 bdev_io->internal.memory_domain_ctx, 1657 bdev_io->internal.orig_iovs, 1658 (uint32_t)bdev_io->internal.orig_iovcnt, 1659 &bdev_io->internal.bounce_iov, 1, 1660 bdev_io_push_bounce_data_done_and_track, 1661 bdev_io); 1662 if (rc == 0) { 1663 /* Continue IO completion in async callback */ 1664 return; 1665 } 1666 1667 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1668 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1669 if (rc != -ENOMEM) { 1670 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1671 spdk_memory_domain_get_dma_device_id( 1672 bdev_io->internal.memory_domain)); 1673 } 1674 } else { 1675 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1676 bdev_io->internal.orig_iovcnt, 1677 bdev_io->internal.bounce_iov.iov_base, 1678 bdev_io->internal.bounce_iov.iov_len); 1679 } 1680 } 1681 1682 if (spdk_unlikely(rc == -ENOMEM)) { 1683 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1684 } else { 1685 bdev_io_push_bounce_data_done(bdev_io, rc); 1686 } 1687 } 1688 1689 static inline void 1690 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1691 { 1692 bdev_io->internal.data_transfer_cpl = cpl_cb; 1693 bdev_io_push_bounce_data(bdev_io); 1694 } 1695 1696 static void 1697 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1698 { 1699 struct spdk_bdev_io *bdev_io; 1700 1701 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1702 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1703 } 1704 1705 static void 1706 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1707 { 1708 struct spdk_bdev_mgmt_channel *mgmt_ch; 1709 uint64_t max_len; 1710 void *buf; 1711 1712 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1713 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1714 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1715 1716 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1717 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1718 bdev_io_get_buf_complete(bdev_io, false); 1719 return; 1720 } 1721 1722 bdev_io->internal.buf_len = len; 1723 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1724 bdev_io_get_iobuf_cb); 1725 if (buf != NULL) { 1726 _bdev_io_set_buf(bdev_io, buf, len); 1727 } 1728 } 1729 1730 void 1731 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1732 { 1733 struct spdk_bdev *bdev = bdev_io->bdev; 1734 uint64_t alignment; 1735 1736 assert(cb != NULL); 1737 bdev_io->internal.get_buf_cb 
= cb; 1738 1739 alignment = spdk_bdev_get_buf_align(bdev); 1740 1741 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1742 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1743 /* Buffer already present and aligned */ 1744 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1745 return; 1746 } 1747 1748 bdev_io_get_buf(bdev_io, len); 1749 } 1750 1751 static void 1752 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1753 bool success) 1754 { 1755 if (!success) { 1756 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1757 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1758 bdev_io_complete_unsubmitted(bdev_io); 1759 return; 1760 } 1761 1762 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1763 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1764 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1765 return; 1766 } 1767 /* For reads we'll execute the sequence after the data is read, so, for now, only 1768 * clear out accel_sequence pointer and submit the IO */ 1769 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1770 bdev_io->u.bdev.accel_sequence = NULL; 1771 } 1772 1773 bdev_io_submit(bdev_io); 1774 } 1775 1776 static void 1777 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1778 uint64_t len) 1779 { 1780 assert(cb != NULL); 1781 bdev_io->internal.get_buf_cb = cb; 1782 1783 bdev_io_get_buf(bdev_io, len); 1784 } 1785 1786 void 1787 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1788 { 1789 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1790 1791 assert(cb != NULL); 1792 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1793 bdev_io->internal.get_aux_buf_cb = cb; 1794 bdev_io_get_buf(bdev_io, len); 1795 } 1796 1797 static int 1798 bdev_module_get_max_ctx_size(void) 1799 { 1800 struct spdk_bdev_module *bdev_module; 1801 int max_bdev_module_size = 0; 1802 1803 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1804 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1805 max_bdev_module_size = bdev_module->get_ctx_size(); 1806 } 1807 } 1808 1809 return max_bdev_module_size; 1810 } 1811 1812 static void 1813 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1814 { 1815 int i; 1816 struct spdk_bdev_qos *qos = bdev->internal.qos; 1817 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1818 1819 if (!qos) { 1820 return; 1821 } 1822 1823 spdk_bdev_get_qos_rate_limits(bdev, limits); 1824 1825 spdk_json_write_object_begin(w); 1826 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1827 1828 spdk_json_write_named_object_begin(w, "params"); 1829 spdk_json_write_named_string(w, "name", bdev->name); 1830 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1831 if (limits[i] > 0) { 1832 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1833 } 1834 } 1835 spdk_json_write_object_end(w); 1836 1837 spdk_json_write_object_end(w); 1838 } 1839 1840 void 1841 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1842 { 1843 struct spdk_bdev_module *bdev_module; 1844 struct spdk_bdev *bdev; 1845 1846 assert(w != NULL); 1847 1848 spdk_json_write_array_begin(w); 1849 1850 spdk_json_write_object_begin(w); 1851 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1852 spdk_json_write_named_object_begin(w, "params"); 1853 spdk_json_write_named_uint32(w, 
"bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1854 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1855 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1856 spdk_json_write_object_end(w); 1857 spdk_json_write_object_end(w); 1858 1859 bdev_examine_allowlist_config_json(w); 1860 1861 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1862 if (bdev_module->config_json) { 1863 bdev_module->config_json(w); 1864 } 1865 } 1866 1867 spdk_spin_lock(&g_bdev_mgr.spinlock); 1868 1869 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1870 if (bdev->fn_table->write_config_json) { 1871 bdev->fn_table->write_config_json(bdev, w); 1872 } 1873 1874 bdev_qos_config_json(bdev, w); 1875 } 1876 1877 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1878 1879 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1880 spdk_json_write_object_begin(w); 1881 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1882 spdk_json_write_object_end(w); 1883 1884 spdk_json_write_array_end(w); 1885 } 1886 1887 static void 1888 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1889 { 1890 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1891 struct spdk_bdev_io *bdev_io; 1892 1893 spdk_iobuf_channel_fini(&ch->iobuf); 1894 1895 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1896 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1897 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1898 ch->per_thread_cache_count--; 1899 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1900 } 1901 1902 assert(ch->per_thread_cache_count == 0); 1903 } 1904 1905 static int 1906 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1907 { 1908 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1909 struct spdk_bdev_io *bdev_io; 1910 uint32_t i; 1911 int rc; 1912 1913 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE); 1914 if (rc != 0) { 1915 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1916 return -1; 1917 } 1918 1919 STAILQ_INIT(&ch->per_thread_cache); 1920 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1921 1922 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1923 ch->per_thread_cache_count = 0; 1924 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1925 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1926 if (bdev_io == NULL) { 1927 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1928 assert(false); 1929 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1930 return -1; 1931 } 1932 ch->per_thread_cache_count++; 1933 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1934 } 1935 1936 TAILQ_INIT(&ch->shared_resources); 1937 TAILQ_INIT(&ch->io_wait_queue); 1938 1939 return 0; 1940 } 1941 1942 static void 1943 bdev_init_complete(int rc) 1944 { 1945 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1946 void *cb_arg = g_init_cb_arg; 1947 struct spdk_bdev_module *m; 1948 1949 g_bdev_mgr.init_complete = true; 1950 g_init_cb_fn = NULL; 1951 g_init_cb_arg = NULL; 1952 1953 /* 1954 * For modules that need to know when subsystem init is complete, 1955 * inform them now. 
1956 */ 1957 if (rc == 0) { 1958 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1959 if (m->init_complete) { 1960 m->init_complete(); 1961 } 1962 } 1963 } 1964 1965 cb_fn(cb_arg, rc); 1966 } 1967 1968 static bool 1969 bdev_module_all_actions_completed(void) 1970 { 1971 struct spdk_bdev_module *m; 1972 1973 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1974 if (m->internal.action_in_progress > 0) { 1975 return false; 1976 } 1977 } 1978 return true; 1979 } 1980 1981 static void 1982 bdev_module_action_complete(void) 1983 { 1984 /* 1985 * Don't finish bdev subsystem initialization if 1986 * module pre-initialization is still in progress, or 1987 * the subsystem has already been initialized. 1988 */ 1989 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 1990 return; 1991 } 1992 1993 /* 1994 * Check all bdev modules for inits/examinations in progress. If any 1995 * exist, return immediately since we cannot finish bdev subsystem 1996 * initialization until all are completed. 1997 */ 1998 if (!bdev_module_all_actions_completed()) { 1999 return; 2000 } 2001 2002 /* 2003 * Modules already finished initialization - now that all 2004 * the bdev modules have finished their asynchronous I/O 2005 * processing, the entire bdev layer can be marked as complete. 2006 */ 2007 bdev_init_complete(0); 2008 } 2009 2010 static void 2011 bdev_module_action_done(struct spdk_bdev_module *module) 2012 { 2013 spdk_spin_lock(&module->internal.spinlock); 2014 assert(module->internal.action_in_progress > 0); 2015 module->internal.action_in_progress--; 2016 spdk_spin_unlock(&module->internal.spinlock); 2017 bdev_module_action_complete(); 2018 } 2019 2020 void 2021 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2022 { 2023 assert(module->async_init); 2024 bdev_module_action_done(module); 2025 } 2026 2027 void 2028 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2029 { 2030 bdev_module_action_done(module); 2031 } 2032 2033 /** The last initialized bdev module */ 2034 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2035 2036 static void 2037 bdev_init_failed(void *cb_arg) 2038 { 2039 struct spdk_bdev_module *module = cb_arg; 2040 2041 spdk_spin_lock(&module->internal.spinlock); 2042 assert(module->internal.action_in_progress > 0); 2043 module->internal.action_in_progress--; 2044 spdk_spin_unlock(&module->internal.spinlock); 2045 bdev_init_complete(-1); 2046 } 2047 2048 static int 2049 bdev_modules_init(void) 2050 { 2051 struct spdk_bdev_module *module; 2052 int rc = 0; 2053 2054 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2055 g_resume_bdev_module = module; 2056 if (module->async_init) { 2057 spdk_spin_lock(&module->internal.spinlock); 2058 module->internal.action_in_progress = 1; 2059 spdk_spin_unlock(&module->internal.spinlock); 2060 } 2061 rc = module->module_init(); 2062 if (rc != 0) { 2063 /* Bump action_in_progress to prevent other modules from completing modules_init. 2064 * Send a message to defer application shutdown until resources are cleaned up. */ 2065 spdk_spin_lock(&module->internal.spinlock); 2066 module->internal.action_in_progress = 1; 2067 spdk_spin_unlock(&module->internal.spinlock); 2068 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2069 return rc; 2070 } 2071 } 2072 2073 g_resume_bdev_module = NULL; 2074 return 0; 2075 } 2076 2077 void 2078 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2079 { 2080 int rc = 0; 2081 char mempool_name[32]; 2082 2083 assert(cb_fn !=
NULL); 2084 2085 g_init_cb_fn = cb_fn; 2086 g_init_cb_arg = cb_arg; 2087 2088 spdk_notify_type_register("bdev_register"); 2089 spdk_notify_type_register("bdev_unregister"); 2090 2091 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2092 2093 rc = spdk_iobuf_register_module("bdev"); 2094 if (rc != 0) { 2095 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2096 bdev_init_complete(-1); 2097 return; 2098 } 2099 2100 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2101 g_bdev_opts.bdev_io_pool_size, 2102 sizeof(struct spdk_bdev_io) + 2103 bdev_module_get_max_ctx_size(), 2104 0, 2105 SPDK_ENV_SOCKET_ID_ANY); 2106 2107 if (g_bdev_mgr.bdev_io_pool == NULL) { 2108 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2109 bdev_init_complete(-1); 2110 return; 2111 } 2112 2113 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2114 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2115 if (!g_bdev_mgr.zero_buffer) { 2116 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2117 bdev_init_complete(-1); 2118 return; 2119 } 2120 2121 #ifdef SPDK_CONFIG_VTUNE 2122 SPDK_LOG_DEPRECATED(vtune_support); 2123 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2124 #endif 2125 2126 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2127 bdev_mgmt_channel_destroy, 2128 sizeof(struct spdk_bdev_mgmt_channel), 2129 "bdev_mgr"); 2130 2131 rc = bdev_modules_init(); 2132 g_bdev_mgr.module_init_complete = true; 2133 if (rc != 0) { 2134 SPDK_ERRLOG("bdev modules init failed\n"); 2135 return; 2136 } 2137 2138 bdev_module_action_complete(); 2139 } 2140 2141 static void 2142 bdev_mgr_unregister_cb(void *io_device) 2143 { 2144 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2145 2146 if (g_bdev_mgr.bdev_io_pool) { 2147 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2148 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2149 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2150 g_bdev_opts.bdev_io_pool_size); 2151 } 2152 2153 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2154 } 2155 2156 spdk_free(g_bdev_mgr.zero_buffer); 2157 2158 bdev_examine_allowlist_free(); 2159 2160 cb_fn(g_fini_cb_arg); 2161 g_fini_cb_fn = NULL; 2162 g_fini_cb_arg = NULL; 2163 g_bdev_mgr.init_complete = false; 2164 g_bdev_mgr.module_init_complete = false; 2165 } 2166 2167 static void 2168 bdev_module_fini_iter(void *arg) 2169 { 2170 struct spdk_bdev_module *bdev_module; 2171 2172 /* FIXME: Handling initialization failures is broken now, 2173 * so we won't even try cleaning up after successfully 2174 * initialized modules. if module_init_complete is false, 2175 * just call spdk_bdev_mgr_unregister_cb 2176 */ 2177 if (!g_bdev_mgr.module_init_complete) { 2178 bdev_mgr_unregister_cb(NULL); 2179 return; 2180 } 2181 2182 /* Start iterating from the last touched module */ 2183 if (!g_resume_bdev_module) { 2184 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2185 } else { 2186 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2187 internal.tailq); 2188 } 2189 2190 while (bdev_module) { 2191 if (bdev_module->async_fini) { 2192 /* Save our place so we can resume later. We must 2193 * save the variable here, before calling module_fini() 2194 * below, because in some cases the module may immediately 2195 * call spdk_bdev_module_fini_done() and re-enter 2196 * this function to continue iterating. 
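 *
 * Illustrative sketch (hypothetical helper names): a module that sets
 * .async_fini typically starts its teardown in module_fini() and reports
 * completion from a callback, which is what re-enters this iterator:
 *
 *   static void my_teardown_done(void *ctx) {
 *           spdk_bdev_module_fini_done();
 *   }
 *   static void my_module_fini(void) {
 *           my_start_async_teardown(my_teardown_done, NULL);
 *   }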
*/ 2197 g_resume_bdev_module = bdev_module; 2198 } 2199 2200 if (bdev_module->module_fini) { 2201 bdev_module->module_fini(); 2202 } 2203 2204 if (bdev_module->async_fini) { 2205 return; 2206 } 2207 2208 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2209 internal.tailq); 2210 } 2211 2212 g_resume_bdev_module = NULL; 2213 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2214 } 2215 2216 void 2217 spdk_bdev_module_fini_done(void) 2218 { 2219 if (spdk_get_thread() != g_fini_thread) { 2220 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2221 } else { 2222 bdev_module_fini_iter(NULL); 2223 } 2224 } 2225 2226 static void 2227 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2228 { 2229 struct spdk_bdev *bdev = cb_arg; 2230 2231 if (bdeverrno && bdev) { 2232 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2233 bdev->name); 2234 2235 /* 2236 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2237 * bdev; try to continue by manually removing this bdev from the list and continue 2238 * with the next bdev in the list. 2239 */ 2240 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2241 } 2242 2243 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2244 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2245 /* 2246 * Bdev module finish need to be deferred as we might be in the middle of some context 2247 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2248 * after returning. 2249 */ 2250 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2251 return; 2252 } 2253 2254 /* 2255 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2256 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2257 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2258 * base bdevs. 2259 * 2260 * Also, walk the list in the reverse order. 2261 */ 2262 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2263 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2264 spdk_spin_lock(&bdev->internal.spinlock); 2265 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2266 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2267 spdk_spin_unlock(&bdev->internal.spinlock); 2268 continue; 2269 } 2270 spdk_spin_unlock(&bdev->internal.spinlock); 2271 2272 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2273 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2274 return; 2275 } 2276 2277 /* 2278 * If any bdev fails to unclaim underlying bdev properly, we may face the 2279 * case of bdev list consisting of claimed bdevs only (if claims are managed 2280 * correctly, this would mean there's a loop in the claims graph which is 2281 * clearly impossible). Warn and unregister last bdev on the list then. 
2282 */ 2283 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2284 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2285 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2286 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2287 return; 2288 } 2289 } 2290 2291 static void 2292 bdev_module_fini_start_iter(void *arg) 2293 { 2294 struct spdk_bdev_module *bdev_module; 2295 2296 if (!g_resume_bdev_module) { 2297 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2298 } else { 2299 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2300 } 2301 2302 while (bdev_module) { 2303 if (bdev_module->async_fini_start) { 2304 /* Save our place so we can resume later. We must 2305 * save the variable here, before calling fini_start() 2306 * below, because in some cases the module may immediately 2307 * call spdk_bdev_module_fini_start_done() and re-enter 2308 * this function to continue iterating. */ 2309 g_resume_bdev_module = bdev_module; 2310 } 2311 2312 if (bdev_module->fini_start) { 2313 bdev_module->fini_start(); 2314 } 2315 2316 if (bdev_module->async_fini_start) { 2317 return; 2318 } 2319 2320 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2321 } 2322 2323 g_resume_bdev_module = NULL; 2324 2325 bdev_finish_unregister_bdevs_iter(NULL, 0); 2326 } 2327 2328 void 2329 spdk_bdev_module_fini_start_done(void) 2330 { 2331 if (spdk_get_thread() != g_fini_thread) { 2332 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2333 } else { 2334 bdev_module_fini_start_iter(NULL); 2335 } 2336 } 2337 2338 static void 2339 bdev_finish_wait_for_examine_done(void *cb_arg) 2340 { 2341 bdev_module_fini_start_iter(NULL); 2342 } 2343 2344 static void bdev_open_async_fini(void); 2345 2346 void 2347 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2348 { 2349 int rc; 2350 2351 assert(cb_fn != NULL); 2352 2353 g_fini_thread = spdk_get_thread(); 2354 2355 g_fini_cb_fn = cb_fn; 2356 g_fini_cb_arg = cb_arg; 2357 2358 bdev_open_async_fini(); 2359 2360 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2361 if (rc != 0) { 2362 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2363 bdev_finish_wait_for_examine_done(NULL); 2364 } 2365 } 2366 2367 struct spdk_bdev_io * 2368 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2369 { 2370 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2371 struct spdk_bdev_io *bdev_io; 2372 2373 if (ch->per_thread_cache_count > 0) { 2374 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2375 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2376 ch->per_thread_cache_count--; 2377 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2378 /* 2379 * Don't try to look for bdev_ios in the global pool if there are 2380 * waiters on bdev_ios - we don't want this caller to jump the line. 
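 *
 * When NULL is returned here, the public submit path fails with -ENOMEM and
 * the documented recovery is for the caller to park a retry on this wait
 * queue.  Illustrative sketch (retry_read() and io_ctx are hypothetical,
 * caller-owned names; retry_read() would simply resubmit the original read):
 *
 *   io_ctx->wait_entry.bdev = bdev;
 *   io_ctx->wait_entry.cb_fn = retry_read;
 *   io_ctx->wait_entry.cb_arg = io_ctx;
 *   spdk_bdev_queue_io_wait(bdev, io_ch, &io_ctx->wait_entry);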
2381 */ 2382 bdev_io = NULL; 2383 } else { 2384 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2385 } 2386 2387 return bdev_io; 2388 } 2389 2390 void 2391 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2392 { 2393 struct spdk_bdev_mgmt_channel *ch; 2394 2395 assert(bdev_io != NULL); 2396 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2397 2398 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2399 2400 if (bdev_io->internal.buf != NULL) { 2401 bdev_io_put_buf(bdev_io); 2402 } 2403 2404 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2405 ch->per_thread_cache_count++; 2406 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2407 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2408 struct spdk_bdev_io_wait_entry *entry; 2409 2410 entry = TAILQ_FIRST(&ch->io_wait_queue); 2411 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2412 entry->cb_fn(entry->cb_arg); 2413 } 2414 } else { 2415 /* We should never have a full cache with entries on the io wait queue. */ 2416 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2417 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2418 } 2419 } 2420 2421 static bool 2422 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2423 { 2424 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2425 2426 switch (limit) { 2427 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2428 return true; 2429 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2430 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2431 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2432 return false; 2433 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2434 default: 2435 return false; 2436 } 2437 } 2438 2439 static bool 2440 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2441 { 2442 switch (bdev_io->type) { 2443 case SPDK_BDEV_IO_TYPE_NVME_IO: 2444 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2445 case SPDK_BDEV_IO_TYPE_READ: 2446 case SPDK_BDEV_IO_TYPE_WRITE: 2447 return true; 2448 case SPDK_BDEV_IO_TYPE_ZCOPY: 2449 if (bdev_io->u.bdev.zcopy.start) { 2450 return true; 2451 } else { 2452 return false; 2453 } 2454 default: 2455 return false; 2456 } 2457 } 2458 2459 static bool 2460 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2461 { 2462 switch (bdev_io->type) { 2463 case SPDK_BDEV_IO_TYPE_NVME_IO: 2464 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2465 /* Bit 1 (0x2) set for read operation */ 2466 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2467 return true; 2468 } else { 2469 return false; 2470 } 2471 case SPDK_BDEV_IO_TYPE_READ: 2472 return true; 2473 case SPDK_BDEV_IO_TYPE_ZCOPY: 2474 /* Populate to read from disk */ 2475 if (bdev_io->u.bdev.zcopy.populate) { 2476 return true; 2477 } else { 2478 return false; 2479 } 2480 default: 2481 return false; 2482 } 2483 } 2484 2485 static uint64_t 2486 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2487 { 2488 struct spdk_bdev *bdev = bdev_io->bdev; 2489 2490 switch (bdev_io->type) { 2491 case SPDK_BDEV_IO_TYPE_NVME_IO: 2492 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2493 return bdev_io->u.nvme_passthru.nbytes; 2494 case SPDK_BDEV_IO_TYPE_READ: 2495 case SPDK_BDEV_IO_TYPE_WRITE: 2496 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2497 case SPDK_BDEV_IO_TYPE_ZCOPY: 2498 /* Track the data in the start phase only */ 2499 if (bdev_io->u.bdev.zcopy.start) { 2500 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2501 } else { 2502 return 0; 2503 } 2504 default: 2505 return 0; 2506 } 2507 } 2508 2509 static bool 2510 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 
2511 { 2512 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2513 return true; 2514 } else { 2515 return false; 2516 } 2517 } 2518 2519 static bool 2520 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2521 { 2522 if (bdev_is_read_io(io) == false) { 2523 return false; 2524 } 2525 2526 return bdev_qos_rw_queue_io(limit, io); 2527 } 2528 2529 static bool 2530 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2531 { 2532 if (bdev_is_read_io(io) == true) { 2533 return false; 2534 } 2535 2536 return bdev_qos_rw_queue_io(limit, io); 2537 } 2538 2539 static void 2540 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2541 { 2542 limit->remaining_this_timeslice--; 2543 } 2544 2545 static void 2546 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2547 { 2548 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2549 } 2550 2551 static void 2552 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2553 { 2554 if (bdev_is_read_io(io) == false) { 2555 return; 2556 } 2557 2558 return bdev_qos_rw_bps_update_quota(limit, io); 2559 } 2560 2561 static void 2562 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2563 { 2564 if (bdev_is_read_io(io) == true) { 2565 return; 2566 } 2567 2568 return bdev_qos_rw_bps_update_quota(limit, io); 2569 } 2570 2571 static void 2572 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2573 { 2574 int i; 2575 2576 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2577 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2578 qos->rate_limits[i].queue_io = NULL; 2579 qos->rate_limits[i].update_quota = NULL; 2580 continue; 2581 } 2582 2583 switch (i) { 2584 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2585 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2586 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2587 break; 2588 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2589 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2590 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2591 break; 2592 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2593 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2594 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2595 break; 2596 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2597 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2598 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2599 break; 2600 default: 2601 break; 2602 } 2603 } 2604 } 2605 2606 static void 2607 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2608 struct spdk_bdev_io *bdev_io, 2609 enum spdk_bdev_io_status status) 2610 { 2611 bdev_io->internal.in_submit_request = true; 2612 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2613 spdk_bdev_io_complete(bdev_io, status); 2614 bdev_io->internal.in_submit_request = false; 2615 } 2616 2617 static inline void 2618 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2619 { 2620 struct spdk_bdev *bdev = bdev_io->bdev; 2621 struct spdk_io_channel *ch = bdev_ch->channel; 2622 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2623 2624 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2625 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2626 struct spdk_bdev_io *bio_to_abort = 
bdev_io->u.abort.bio_to_abort; 2627 2628 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2629 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2630 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2631 SPDK_BDEV_IO_STATUS_SUCCESS); 2632 return; 2633 } 2634 } 2635 2636 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2637 bdev_io->bdev->split_on_write_unit && 2638 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2639 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2640 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2641 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2642 return; 2643 } 2644 2645 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2646 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2647 bdev_io->internal.in_submit_request = true; 2648 bdev_submit_request(bdev, ch, bdev_io); 2649 bdev_io->internal.in_submit_request = false; 2650 } else { 2651 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2652 } 2653 } 2654 2655 static bool 2656 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2657 { 2658 int i; 2659 2660 if (bdev_qos_io_to_limit(bdev_io) == true) { 2661 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2662 if (!qos->rate_limits[i].queue_io) { 2663 continue; 2664 } 2665 2666 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2667 bdev_io) == true) { 2668 return true; 2669 } 2670 } 2671 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2672 if (!qos->rate_limits[i].update_quota) { 2673 continue; 2674 } 2675 2676 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2677 } 2678 } 2679 2680 return false; 2681 } 2682 2683 static inline void 2684 _bdev_io_do_submit(void *ctx) 2685 { 2686 struct spdk_bdev_io *bdev_io = ctx; 2687 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2688 2689 bdev_io_do_submit(ch, bdev_io); 2690 } 2691 2692 static int 2693 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2694 { 2695 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2696 int submitted_ios = 0; 2697 2698 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2699 if (!bdev_qos_queue_io(qos, bdev_io)) { 2700 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2701 2702 if (bdev_io->internal.io_submit_ch) { 2703 /* Send back the IO to the original thread for the actual processing. 
*/ 2704 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2705 bdev_io->internal.io_submit_ch = NULL; 2706 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2707 _bdev_io_do_submit, bdev_io); 2708 } else { 2709 bdev_io_do_submit(ch, bdev_io); 2710 } 2711 2712 submitted_ios++; 2713 } 2714 } 2715 2716 return submitted_ios; 2717 } 2718 2719 static void 2720 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2721 { 2722 int rc; 2723 2724 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2725 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2726 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2727 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2728 &bdev_io->internal.waitq_entry); 2729 if (rc != 0) { 2730 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2731 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2732 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2733 } 2734 } 2735 2736 static bool 2737 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2738 { 2739 uint32_t io_boundary; 2740 struct spdk_bdev *bdev = bdev_io->bdev; 2741 uint32_t max_size = bdev->max_segment_size; 2742 int max_segs = bdev->max_num_segments; 2743 2744 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2745 io_boundary = bdev->write_unit_size; 2746 } else if (bdev->split_on_optimal_io_boundary) { 2747 io_boundary = bdev->optimal_io_boundary; 2748 } else { 2749 io_boundary = 0; 2750 } 2751 2752 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2753 return false; 2754 } 2755 2756 if (io_boundary) { 2757 uint64_t start_stripe, end_stripe; 2758 2759 start_stripe = bdev_io->u.bdev.offset_blocks; 2760 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2761 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
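 *
 * Worked example (values chosen for illustration only): with io_boundary = 128
 * and an I/O at offset_blocks = 120 for num_blocks = 16, start_stripe is
 * 120 >> 7 = 0 and end_stripe is 135 >> 7 = 1, so the I/O crosses a boundary
 * and must be split; an I/O at offset 0 for 64 blocks stays in stripe 0 and is
 * passed through unsplit.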
*/ 2762 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2763 start_stripe >>= spdk_u32log2(io_boundary); 2764 end_stripe >>= spdk_u32log2(io_boundary); 2765 } else { 2766 start_stripe /= io_boundary; 2767 end_stripe /= io_boundary; 2768 } 2769 2770 if (start_stripe != end_stripe) { 2771 return true; 2772 } 2773 } 2774 2775 if (max_segs) { 2776 if (bdev_io->u.bdev.iovcnt > max_segs) { 2777 return true; 2778 } 2779 } 2780 2781 if (max_size) { 2782 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2783 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2784 return true; 2785 } 2786 } 2787 } 2788 2789 return false; 2790 } 2791 2792 static bool 2793 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2794 { 2795 uint32_t num_unmap_segments; 2796 2797 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2798 return false; 2799 } 2800 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2801 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2802 return true; 2803 } 2804 2805 return false; 2806 } 2807 2808 static bool 2809 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2810 { 2811 if (!bdev_io->bdev->max_write_zeroes) { 2812 return false; 2813 } 2814 2815 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2816 return true; 2817 } 2818 2819 return false; 2820 } 2821 2822 static bool 2823 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2824 { 2825 if (bdev_io->bdev->max_copy != 0 && 2826 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2827 return true; 2828 } 2829 2830 return false; 2831 } 2832 2833 static bool 2834 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2835 { 2836 switch (bdev_io->type) { 2837 case SPDK_BDEV_IO_TYPE_READ: 2838 case SPDK_BDEV_IO_TYPE_WRITE: 2839 return bdev_rw_should_split(bdev_io); 2840 case SPDK_BDEV_IO_TYPE_UNMAP: 2841 return bdev_unmap_should_split(bdev_io); 2842 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2843 return bdev_write_zeroes_should_split(bdev_io); 2844 case SPDK_BDEV_IO_TYPE_COPY: 2845 return bdev_copy_should_split(bdev_io); 2846 default: 2847 return false; 2848 } 2849 } 2850 2851 static uint32_t 2852 _to_next_boundary(uint64_t offset, uint32_t boundary) 2853 { 2854 return (boundary - (offset % boundary)); 2855 } 2856 2857 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2858 2859 static void _bdev_rw_split(void *_bdev_io); 2860 2861 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2862 2863 static void 2864 _bdev_unmap_split(void *_bdev_io) 2865 { 2866 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2867 } 2868 2869 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2870 2871 static void 2872 _bdev_write_zeroes_split(void *_bdev_io) 2873 { 2874 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2875 } 2876 2877 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2878 2879 static void 2880 _bdev_copy_split(void *_bdev_io) 2881 { 2882 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2883 } 2884 2885 static int 2886 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2887 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2888 { 2889 int rc; 2890 uint64_t current_offset, current_remaining, current_src_offset; 2891 spdk_bdev_io_wait_cb io_wait_fn; 2892 2893 current_offset = *offset; 2894 current_remaining = *remaining; 2895 2896 bdev_io->u.bdev.split_outstanding++; 2897 2898 io_wait_fn = 
_bdev_rw_split; 2899 switch (bdev_io->type) { 2900 case SPDK_BDEV_IO_TYPE_READ: 2901 assert(bdev_io->u.bdev.accel_sequence == NULL); 2902 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2903 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2904 iov, iovcnt, md_buf, current_offset, 2905 num_blocks, bdev_io->internal.memory_domain, 2906 bdev_io->internal.memory_domain_ctx, NULL, 2907 bdev_io_split_done, bdev_io); 2908 break; 2909 case SPDK_BDEV_IO_TYPE_WRITE: 2910 assert(bdev_io->u.bdev.accel_sequence == NULL); 2911 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2912 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2913 iov, iovcnt, md_buf, current_offset, 2914 num_blocks, bdev_io->internal.memory_domain, 2915 bdev_io->internal.memory_domain_ctx, NULL, 2916 bdev_io_split_done, bdev_io); 2917 break; 2918 case SPDK_BDEV_IO_TYPE_UNMAP: 2919 io_wait_fn = _bdev_unmap_split; 2920 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2921 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2922 current_offset, num_blocks, 2923 bdev_io_split_done, bdev_io); 2924 break; 2925 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2926 io_wait_fn = _bdev_write_zeroes_split; 2927 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2928 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2929 current_offset, num_blocks, 2930 bdev_io_split_done, bdev_io); 2931 break; 2932 case SPDK_BDEV_IO_TYPE_COPY: 2933 io_wait_fn = _bdev_copy_split; 2934 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2935 (current_offset - bdev_io->u.bdev.offset_blocks); 2936 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2937 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2938 current_offset, current_src_offset, num_blocks, 2939 bdev_io_split_done, bdev_io); 2940 break; 2941 default: 2942 assert(false); 2943 rc = -EINVAL; 2944 break; 2945 } 2946 2947 if (rc == 0) { 2948 current_offset += num_blocks; 2949 current_remaining -= num_blocks; 2950 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2951 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2952 *offset = current_offset; 2953 *remaining = current_remaining; 2954 } else { 2955 bdev_io->u.bdev.split_outstanding--; 2956 if (rc == -ENOMEM) { 2957 if (bdev_io->u.bdev.split_outstanding == 0) { 2958 /* No I/O is outstanding. Hence we should wait here. */ 2959 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2960 } 2961 } else { 2962 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2963 if (bdev_io->u.bdev.split_outstanding == 0) { 2964 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2965 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2966 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2967 } 2968 } 2969 } 2970 2971 return rc; 2972 } 2973 2974 static void 2975 _bdev_rw_split(void *_bdev_io) 2976 { 2977 struct iovec *parent_iov, *iov; 2978 struct spdk_bdev_io *bdev_io = _bdev_io; 2979 struct spdk_bdev *bdev = bdev_io->bdev; 2980 uint64_t parent_offset, current_offset, remaining; 2981 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2982 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2983 uint32_t iovcnt, iov_len, child_iovsize; 2984 uint32_t blocklen = bdev->blocklen; 2985 uint32_t io_boundary; 2986 uint32_t max_segment_size = bdev->max_segment_size; 2987 uint32_t max_child_iovcnt = bdev->max_num_segments; 2988 void *md_buf = NULL; 2989 int rc; 2990 2991 max_segment_size = max_segment_size ? 
max_segment_size : UINT32_MAX; 2992 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2993 SPDK_BDEV_IO_NUM_CHILD_IOV; 2994 2995 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2996 io_boundary = bdev->write_unit_size; 2997 } else if (bdev->split_on_optimal_io_boundary) { 2998 io_boundary = bdev->optimal_io_boundary; 2999 } else { 3000 io_boundary = UINT32_MAX; 3001 } 3002 3003 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3004 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 3005 parent_offset = bdev_io->u.bdev.offset_blocks; 3006 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3007 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3008 3009 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3010 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3011 if (parent_iov_offset < parent_iov->iov_len) { 3012 break; 3013 } 3014 parent_iov_offset -= parent_iov->iov_len; 3015 } 3016 3017 child_iovcnt = 0; 3018 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3019 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3020 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3021 to_next_boundary = spdk_min(remaining, to_next_boundary); 3022 to_next_boundary_bytes = to_next_boundary * blocklen; 3023 3024 iov = &bdev_io->child_iov[child_iovcnt]; 3025 iovcnt = 0; 3026 3027 if (bdev_io->u.bdev.md_buf) { 3028 md_buf = (char *)bdev_io->u.bdev.md_buf + 3029 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3030 } 3031 3032 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3033 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3034 iovcnt < child_iovsize) { 3035 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3036 iov_len = parent_iov->iov_len - parent_iov_offset; 3037 3038 iov_len = spdk_min(iov_len, max_segment_size); 3039 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3040 to_next_boundary_bytes -= iov_len; 3041 3042 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3043 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3044 3045 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3046 parent_iov_offset += iov_len; 3047 } else { 3048 parent_iovpos++; 3049 parent_iov_offset = 0; 3050 } 3051 child_iovcnt++; 3052 iovcnt++; 3053 } 3054 3055 if (to_next_boundary_bytes > 0) { 3056 /* We had to stop this child I/O early because we ran out of 3057 * child_iov space or were limited by max_num_segments. 3058 * Ensure the iovs to be aligned with block size and 3059 * then adjust to_next_boundary before starting the 3060 * child I/O. 
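 *
 * Worked example (illustrative numbers): with blocklen = 512 and 1300 bytes of
 * the child left uncovered, to_last_block_bytes is first 1300 % 512 = 276 and
 * then 512 - 276 = 236, so 236 bytes are trimmed from the tail of the child
 * iovs and to_next_boundary is reduced by (1300 + 236) / 512 = 3 blocks,
 * leaving the child I/O block-aligned.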
3061 */ 3062 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 3063 iovcnt == child_iovsize); 3064 to_last_block_bytes = to_next_boundary_bytes % blocklen; 3065 if (to_last_block_bytes != 0) { 3066 uint32_t child_iovpos = child_iovcnt - 1; 3067 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV, 3068 * so the loop will naturally end 3069 */ 3070 3071 to_last_block_bytes = blocklen - to_last_block_bytes; 3072 to_next_boundary_bytes += to_last_block_bytes; 3073 while (to_last_block_bytes > 0 && iovcnt > 0) { 3074 iov_len = spdk_min(to_last_block_bytes, 3075 bdev_io->child_iov[child_iovpos].iov_len); 3076 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 3077 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 3078 child_iovpos--; 3079 if (--iovcnt == 0) { 3080 /* If the child IO is less than a block size, just return. 3081 * If the first child IO of any split round is less than 3082 * a block size, fail the parent I/O and exit with an error. 3083 */ 3084 if (bdev_io->u.bdev.split_outstanding == 0) { 3085 SPDK_ERRLOG("The first child io was less than a block size\n"); 3086 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3087 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 3088 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 3089 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3090 } 3091 3092 return; 3093 } 3094 } 3095 3096 to_last_block_bytes -= iov_len; 3097 3098 if (parent_iov_offset == 0) { 3099 parent_iovpos--; 3100 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 3101 } 3102 parent_iov_offset -= iov_len; 3103 } 3104 3105 assert(to_last_block_bytes == 0); 3106 } 3107 to_next_boundary -= to_next_boundary_bytes / blocklen; 3108 } 3109 3110 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 3111 &current_offset, &remaining); 3112 if (spdk_unlikely(rc)) { 3113 return; 3114 } 3115 } 3116 } 3117 3118 static void 3119 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 3120 { 3121 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 3122 uint32_t num_children_reqs = 0; 3123 int rc; 3124 3125 offset = bdev_io->u.bdev.split_current_offset_blocks; 3126 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3127 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 3128 3129 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3130 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 3131 3132 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 3133 &offset, &remaining); 3134 if (spdk_likely(rc == 0)) { 3135 num_children_reqs++; 3136 } else { 3137 return; 3138 } 3139 } 3140 } 3141 3142 static void 3143 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3144 { 3145 uint64_t offset, write_zeroes_blocks, remaining; 3146 uint32_t num_children_reqs = 0; 3147 int rc; 3148 3149 offset = bdev_io->u.bdev.split_current_offset_blocks; 3150 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3151 3152 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3153 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3154 3155 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3156 &offset, &remaining); 3157 if (spdk_likely(rc == 0)) { 3158 num_children_reqs++; 3159 } else { 3160 return; 3161 } 3162 } 3163 } 3164 3165 static void 3166 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3167 { 3168 uint64_t offset,
copy_blocks, remaining; 3169 uint32_t num_children_reqs = 0; 3170 int rc; 3171 3172 offset = bdev_io->u.bdev.split_current_offset_blocks; 3173 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3174 3175 assert(bdev_io->bdev->max_copy != 0); 3176 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3177 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3178 3179 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3180 &offset, &remaining); 3181 if (spdk_likely(rc == 0)) { 3182 num_children_reqs++; 3183 } else { 3184 return; 3185 } 3186 } 3187 } 3188 3189 static void 3190 parent_bdev_io_complete(void *ctx, int rc) 3191 { 3192 struct spdk_bdev_io *parent_io = ctx; 3193 3194 if (rc) { 3195 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3196 } 3197 3198 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3199 parent_io->internal.caller_ctx); 3200 } 3201 3202 static void 3203 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3204 { 3205 struct spdk_bdev_io *bdev_io = ctx; 3206 3207 /* u.bdev.accel_sequence should have already been cleared at this point */ 3208 assert(bdev_io->u.bdev.accel_sequence == NULL); 3209 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3210 bdev_io->internal.accel_sequence = NULL; 3211 3212 if (spdk_unlikely(status != 0)) { 3213 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3214 } 3215 3216 parent_bdev_io_complete(bdev_io, status); 3217 } 3218 3219 static void 3220 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3221 { 3222 struct spdk_bdev_io *parent_io = cb_arg; 3223 3224 spdk_bdev_free_io(bdev_io); 3225 3226 if (!success) { 3227 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3228 /* If any child I/O failed, stop further splitting process. */ 3229 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3230 parent_io->u.bdev.split_remaining_num_blocks = 0; 3231 } 3232 parent_io->u.bdev.split_outstanding--; 3233 if (parent_io->u.bdev.split_outstanding != 0) { 3234 return; 3235 } 3236 3237 /* 3238 * Parent I/O finishes when all blocks are consumed. 3239 */ 3240 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3241 assert(parent_io->internal.cb != bdev_io_split_done); 3242 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3243 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3244 3245 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3246 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3247 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3248 return; 3249 } else if (parent_io->internal.orig_iovcnt != 0 && 3250 !bdev_io_use_accel_sequence(bdev_io)) { 3251 /* bdev IO will be completed in the callback */ 3252 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3253 return; 3254 } 3255 } 3256 3257 parent_bdev_io_complete(parent_io, 0); 3258 return; 3259 } 3260 3261 /* 3262 * Continue with the splitting process. This function will complete the parent I/O if the 3263 * splitting is done. 
3264 */ 3265 switch (parent_io->type) { 3266 case SPDK_BDEV_IO_TYPE_READ: 3267 case SPDK_BDEV_IO_TYPE_WRITE: 3268 _bdev_rw_split(parent_io); 3269 break; 3270 case SPDK_BDEV_IO_TYPE_UNMAP: 3271 bdev_unmap_split(parent_io); 3272 break; 3273 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3274 bdev_write_zeroes_split(parent_io); 3275 break; 3276 case SPDK_BDEV_IO_TYPE_COPY: 3277 bdev_copy_split(parent_io); 3278 break; 3279 default: 3280 assert(false); 3281 break; 3282 } 3283 } 3284 3285 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3286 bool success); 3287 3288 static void 3289 bdev_io_split(struct spdk_bdev_io *bdev_io) 3290 { 3291 assert(bdev_io_should_split(bdev_io)); 3292 3293 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3294 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3295 bdev_io->u.bdev.split_outstanding = 0; 3296 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3297 3298 switch (bdev_io->type) { 3299 case SPDK_BDEV_IO_TYPE_READ: 3300 case SPDK_BDEV_IO_TYPE_WRITE: 3301 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3302 _bdev_rw_split(bdev_io); 3303 } else { 3304 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3305 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3306 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3307 } 3308 break; 3309 case SPDK_BDEV_IO_TYPE_UNMAP: 3310 bdev_unmap_split(bdev_io); 3311 break; 3312 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3313 bdev_write_zeroes_split(bdev_io); 3314 break; 3315 case SPDK_BDEV_IO_TYPE_COPY: 3316 bdev_copy_split(bdev_io); 3317 break; 3318 default: 3319 assert(false); 3320 break; 3321 } 3322 } 3323 3324 static void 3325 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3326 { 3327 if (!success) { 3328 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3329 return; 3330 } 3331 3332 _bdev_rw_split(bdev_io); 3333 } 3334 3335 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3336 * be inlined, at least on some compilers. 
3337 */ 3338 static inline void 3339 _bdev_io_submit(void *ctx) 3340 { 3341 struct spdk_bdev_io *bdev_io = ctx; 3342 struct spdk_bdev *bdev = bdev_io->bdev; 3343 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3344 3345 if (spdk_likely(bdev_ch->flags == 0)) { 3346 bdev_io_do_submit(bdev_ch, bdev_io); 3347 return; 3348 } 3349 3350 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3351 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3352 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3353 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3354 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 3355 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3356 } else { 3357 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 3358 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3359 } 3360 } else { 3361 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3362 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3363 } 3364 } 3365 3366 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3367 3368 bool 3369 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3370 { 3371 if (range1->length == 0 || range2->length == 0) { 3372 return false; 3373 } 3374 3375 if (range1->offset + range1->length <= range2->offset) { 3376 return false; 3377 } 3378 3379 if (range2->offset + range2->length <= range1->offset) { 3380 return false; 3381 } 3382 3383 return true; 3384 } 3385 3386 static bool 3387 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3388 { 3389 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3390 struct lba_range r; 3391 3392 switch (bdev_io->type) { 3393 case SPDK_BDEV_IO_TYPE_NVME_IO: 3394 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3395 /* Don't try to decode the NVMe command - just assume worst-case and that 3396 * it overlaps a locked range. 3397 */ 3398 return true; 3399 case SPDK_BDEV_IO_TYPE_WRITE: 3400 case SPDK_BDEV_IO_TYPE_UNMAP: 3401 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3402 case SPDK_BDEV_IO_TYPE_ZCOPY: 3403 case SPDK_BDEV_IO_TYPE_COPY: 3404 r.offset = bdev_io->u.bdev.offset_blocks; 3405 r.length = bdev_io->u.bdev.num_blocks; 3406 if (!bdev_lba_range_overlapped(range, &r)) { 3407 /* This I/O doesn't overlap the specified LBA range. */ 3408 return false; 3409 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3410 /* This I/O overlaps, but the I/O is on the same channel that locked this 3411 * range, and the caller_ctx is the same as the locked_ctx. This means 3412 * that this I/O is associated with the lock, and is allowed to execute. 
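 *
 * For example, a module that locked blocks 0-1023 on this channel to perform
 * a rebuild keeps writing inside that range with the same caller_ctx, while
 * writes to the same blocks arriving on other channels (or with a different
 * context) are parked on io_locked until the range is unlocked.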
3413 */ 3414 return false; 3415 } else { 3416 return true; 3417 } 3418 default: 3419 return false; 3420 } 3421 } 3422 3423 void 3424 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3425 { 3426 struct spdk_bdev *bdev = bdev_io->bdev; 3427 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3428 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3429 3430 assert(thread != NULL); 3431 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3432 3433 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3434 struct lba_range *range; 3435 3436 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3437 if (bdev_io_range_is_locked(bdev_io, range)) { 3438 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3439 return; 3440 } 3441 } 3442 } 3443 3444 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3445 3446 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3447 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3448 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3449 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3450 spdk_bdev_get_name(bdev)); 3451 3452 if (bdev_io->internal.split) { 3453 bdev_io_split(bdev_io); 3454 return; 3455 } 3456 3457 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3458 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3459 _bdev_io_submit(bdev_io); 3460 } else { 3461 bdev_io->internal.io_submit_ch = ch; 3462 bdev_io->internal.ch = bdev->internal.qos->ch; 3463 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3464 } 3465 } else { 3466 _bdev_io_submit(bdev_io); 3467 } 3468 } 3469 3470 static inline void 3471 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3472 { 3473 /* The bdev doesn't support memory domains, so the buffers in this IO request can't 3474 * be accessed directly. We need to allocate bounce buffers before issuing the IO operation. 3475 * For a write operation, the buffers are pulled from the memory domain before submitting the IO. 3476 * Once a read operation completes, the memory domain push functionality is used to 3477 * update the data in the original memory domain IO buffers. 3478 * This IO request will go through the regular IO flow, so clear the memory domain pointers. */ 3479 bdev_io->u.bdev.memory_domain = NULL; 3480 bdev_io->u.bdev.memory_domain_ctx = NULL; 3481 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3482 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3483 } 3484 3485 static inline void 3486 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3487 { 3488 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3489 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3490 3491 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3492 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3493 bdev_io_complete_unsubmitted(bdev_io); 3494 return; 3495 } 3496 3497 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3498 * support them, but we need to execute an accel sequence and the data buffer is from accel 3499 * memory domain (to avoid doing a push/pull from that domain).
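 *
 * This path is reached through the *_ext() variants when the caller passed a
 * memory domain.  Illustrative sketch (my_domain, my_domain_ctx and
 * read_done_cb are caller-provided, hypothetical names; other opts fields are
 * left zeroed):
 *
 *   struct spdk_bdev_ext_io_opts opts = { .size = sizeof(opts) };
 *
 *   opts.memory_domain = my_domain;
 *   opts.memory_domain_ctx = my_domain_ctx;
 *   spdk_bdev_readv_blocks_ext(desc, io_ch, iov, iovcnt, offset_blocks,
 *                              num_blocks, read_done_cb, cb_arg, &opts);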
3500 */ 3501 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3502 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3503 _bdev_io_ext_use_bounce_buffer(bdev_io); 3504 return; 3505 } 3506 3507 if (needs_exec) { 3508 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3509 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3510 return; 3511 } 3512 /* For reads we'll execute the sequence after the data is read, so, for now, only 3513 * clear out accel_sequence pointer and submit the IO */ 3514 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3515 bdev_io->u.bdev.accel_sequence = NULL; 3516 } 3517 3518 bdev_io_submit(bdev_io); 3519 } 3520 3521 static void 3522 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3523 { 3524 struct spdk_bdev *bdev = bdev_io->bdev; 3525 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3526 struct spdk_io_channel *ch = bdev_ch->channel; 3527 3528 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3529 3530 bdev_io->internal.in_submit_request = true; 3531 bdev_submit_request(bdev, ch, bdev_io); 3532 bdev_io->internal.in_submit_request = false; 3533 } 3534 3535 void 3536 bdev_io_init(struct spdk_bdev_io *bdev_io, 3537 struct spdk_bdev *bdev, void *cb_arg, 3538 spdk_bdev_io_completion_cb cb) 3539 { 3540 bdev_io->bdev = bdev; 3541 bdev_io->internal.caller_ctx = cb_arg; 3542 bdev_io->internal.cb = cb; 3543 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3544 bdev_io->internal.in_submit_request = false; 3545 bdev_io->internal.buf = NULL; 3546 bdev_io->internal.io_submit_ch = NULL; 3547 bdev_io->internal.orig_iovs = NULL; 3548 bdev_io->internal.orig_iovcnt = 0; 3549 bdev_io->internal.orig_md_iov.iov_base = NULL; 3550 bdev_io->internal.error.nvme.cdw0 = 0; 3551 bdev_io->num_retries = 0; 3552 bdev_io->internal.get_buf_cb = NULL; 3553 bdev_io->internal.get_aux_buf_cb = NULL; 3554 bdev_io->internal.memory_domain = NULL; 3555 bdev_io->internal.memory_domain_ctx = NULL; 3556 bdev_io->internal.data_transfer_cpl = NULL; 3557 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3558 bdev_io->internal.accel_sequence = NULL; 3559 bdev_io->internal.has_accel_sequence = false; 3560 } 3561 3562 static bool 3563 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3564 { 3565 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3566 } 3567 3568 bool 3569 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3570 { 3571 bool supported; 3572 3573 supported = bdev_io_type_supported(bdev, io_type); 3574 3575 if (!supported) { 3576 switch (io_type) { 3577 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3578 /* The bdev layer will emulate write zeroes as long as write is supported. 
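 *
 * A caller can therefore just probe for the type before issuing the command;
 * illustrative sketch (write_zeroes_done() is a caller-supplied completion
 * callback):
 *
 *   if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
 *           rc = spdk_bdev_write_zeroes_blocks(desc, io_ch, offset_blocks,
 *                                              num_blocks, write_zeroes_done,
 *                                              cb_arg);
 *   }
 *
 * If the backing module lacks native support, the bdev layer services the
 * request by writing from its internal zero buffer instead.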
*/ 3579 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3580 break; 3581 default: 3582 break; 3583 } 3584 } 3585 3586 return supported; 3587 } 3588 3589 uint64_t 3590 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3591 { 3592 return bdev_io->internal.submit_tsc; 3593 } 3594 3595 int 3596 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3597 { 3598 if (bdev->fn_table->dump_info_json) { 3599 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3600 } 3601 3602 return 0; 3603 } 3604 3605 static void 3606 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3607 { 3608 uint32_t max_per_timeslice = 0; 3609 int i; 3610 3611 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3612 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3613 qos->rate_limits[i].max_per_timeslice = 0; 3614 continue; 3615 } 3616 3617 max_per_timeslice = qos->rate_limits[i].limit * 3618 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3619 3620 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3621 qos->rate_limits[i].min_per_timeslice); 3622 3623 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3624 } 3625 3626 bdev_qos_set_ops(qos); 3627 } 3628 3629 static int 3630 bdev_channel_poll_qos(void *arg) 3631 { 3632 struct spdk_bdev_qos *qos = arg; 3633 uint64_t now = spdk_get_ticks(); 3634 int i; 3635 3636 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3637 /* We received our callback earlier than expected - return 3638 * immediately and wait to do accounting until at least one 3639 * timeslice has actually expired. This should never happen 3640 * with a well-behaved timer implementation. 3641 */ 3642 return SPDK_POLLER_IDLE; 3643 } 3644 3645 /* Reset for next round of rate limiting */ 3646 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3647 /* We may have allowed the IOs or bytes to slightly overrun in the last 3648 * timeslice. remaining_this_timeslice is signed, so if it's negative 3649 * here, we'll account for the overrun so that the next timeslice will 3650 * be appropriately reduced. 
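 *
 * Worked example (illustrative numbers): with a 10 MiB/s byte limit and the
 * default 1 ms timeslice, max_per_timeslice is 10485760 * 1000 / 1000000 =
 * 10485 bytes.  A single 65536-byte I/O submitted while 10485 bytes remain
 * drives remaining_this_timeslice to -55051, so roughly the next six
 * timeslices are spent paying back the overrun before new I/O is admitted.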
3651 */ 3652 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3653 qos->rate_limits[i].remaining_this_timeslice = 0; 3654 } 3655 } 3656 3657 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3658 qos->last_timeslice += qos->timeslice_size; 3659 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3660 qos->rate_limits[i].remaining_this_timeslice += 3661 qos->rate_limits[i].max_per_timeslice; 3662 } 3663 } 3664 3665 return bdev_qos_io_submit(qos->ch, qos); 3666 } 3667 3668 static void 3669 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3670 { 3671 struct spdk_bdev_shared_resource *shared_resource; 3672 struct lba_range *range; 3673 3674 bdev_free_io_stat(ch->stat); 3675 #ifdef SPDK_CONFIG_VTUNE 3676 bdev_free_io_stat(ch->prev_stat); 3677 #endif 3678 3679 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3680 range = TAILQ_FIRST(&ch->locked_ranges); 3681 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3682 free(range); 3683 } 3684 3685 spdk_put_io_channel(ch->channel); 3686 spdk_put_io_channel(ch->accel_channel); 3687 3688 shared_resource = ch->shared_resource; 3689 3690 assert(TAILQ_EMPTY(&ch->io_locked)); 3691 assert(TAILQ_EMPTY(&ch->io_submitted)); 3692 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3693 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3694 assert(ch->io_outstanding == 0); 3695 assert(shared_resource->ref > 0); 3696 shared_resource->ref--; 3697 if (shared_resource->ref == 0) { 3698 assert(shared_resource->io_outstanding == 0); 3699 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3700 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3701 free(shared_resource); 3702 } 3703 } 3704 3705 static void 3706 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3707 { 3708 struct spdk_bdev_qos *qos = bdev->internal.qos; 3709 int i; 3710 3711 assert(spdk_spin_held(&bdev->internal.spinlock)); 3712 3713 /* Rate limiting on this bdev enabled */ 3714 if (qos) { 3715 if (qos->ch == NULL) { 3716 struct spdk_io_channel *io_ch; 3717 3718 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3719 bdev->name, spdk_get_thread()); 3720 3721 /* No qos channel has been selected, so set one up */ 3722 3723 /* Take another reference to ch */ 3724 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3725 assert(io_ch != NULL); 3726 qos->ch = ch; 3727 3728 qos->thread = spdk_io_channel_get_thread(io_ch); 3729 3730 TAILQ_INIT(&qos->queued); 3731 3732 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3733 if (bdev_qos_is_iops_rate_limit(i) == true) { 3734 qos->rate_limits[i].min_per_timeslice = 3735 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3736 } else { 3737 qos->rate_limits[i].min_per_timeslice = 3738 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3739 } 3740 3741 if (qos->rate_limits[i].limit == 0) { 3742 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3743 } 3744 } 3745 bdev_qos_update_max_quota_per_timeslice(qos); 3746 qos->timeslice_size = 3747 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3748 qos->last_timeslice = spdk_get_ticks(); 3749 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3750 qos, 3751 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3752 } 3753 3754 ch->flags |= BDEV_CH_QOS_ENABLED; 3755 } 3756 } 3757 3758 struct poll_timeout_ctx { 3759 struct spdk_bdev_desc *desc; 3760 uint64_t timeout_in_sec; 3761 spdk_bdev_io_timeout_cb cb_fn; 3762 void *cb_arg; 3763 }; 3764 3765 static void 3766 bdev_desc_free(struct spdk_bdev_desc 
*desc) 3767 { 3768 spdk_spin_destroy(&desc->spinlock); 3769 free(desc->media_events_buffer); 3770 free(desc); 3771 } 3772 3773 static void 3774 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3775 { 3776 struct poll_timeout_ctx *ctx = _ctx; 3777 struct spdk_bdev_desc *desc = ctx->desc; 3778 3779 free(ctx); 3780 3781 spdk_spin_lock(&desc->spinlock); 3782 desc->refs--; 3783 if (desc->closed == true && desc->refs == 0) { 3784 spdk_spin_unlock(&desc->spinlock); 3785 bdev_desc_free(desc); 3786 return; 3787 } 3788 spdk_spin_unlock(&desc->spinlock); 3789 } 3790 3791 static void 3792 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3793 struct spdk_io_channel *io_ch, void *_ctx) 3794 { 3795 struct poll_timeout_ctx *ctx = _ctx; 3796 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3797 struct spdk_bdev_desc *desc = ctx->desc; 3798 struct spdk_bdev_io *bdev_io; 3799 uint64_t now; 3800 3801 spdk_spin_lock(&desc->spinlock); 3802 if (desc->closed == true) { 3803 spdk_spin_unlock(&desc->spinlock); 3804 spdk_bdev_for_each_channel_continue(i, -1); 3805 return; 3806 } 3807 spdk_spin_unlock(&desc->spinlock); 3808 3809 now = spdk_get_ticks(); 3810 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3811 /* Exclude any I/O that are generated via splitting. */ 3812 if (bdev_io->internal.cb == bdev_io_split_done) { 3813 continue; 3814 } 3815 3816 /* Once we find an I/O that has not timed out, we can immediately 3817 * exit the loop. 3818 */ 3819 if (now < (bdev_io->internal.submit_tsc + 3820 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3821 goto end; 3822 } 3823 3824 if (bdev_io->internal.desc == desc) { 3825 ctx->cb_fn(ctx->cb_arg, bdev_io); 3826 } 3827 } 3828 3829 end: 3830 spdk_bdev_for_each_channel_continue(i, 0); 3831 } 3832 3833 static int 3834 bdev_poll_timeout_io(void *arg) 3835 { 3836 struct spdk_bdev_desc *desc = arg; 3837 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3838 struct poll_timeout_ctx *ctx; 3839 3840 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3841 if (!ctx) { 3842 SPDK_ERRLOG("failed to allocate memory\n"); 3843 return SPDK_POLLER_BUSY; 3844 } 3845 ctx->desc = desc; 3846 ctx->cb_arg = desc->cb_arg; 3847 ctx->cb_fn = desc->cb_fn; 3848 ctx->timeout_in_sec = desc->timeout_in_sec; 3849 3850 /* Take a ref on the descriptor in case it gets closed while we are checking 3851 * all of the channels. 
3852 */ 3853 spdk_spin_lock(&desc->spinlock); 3854 desc->refs++; 3855 spdk_spin_unlock(&desc->spinlock); 3856 3857 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3858 bdev_channel_poll_timeout_io_done); 3859 3860 return SPDK_POLLER_BUSY; 3861 } 3862 3863 int 3864 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3865 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3866 { 3867 assert(desc->thread == spdk_get_thread()); 3868 3869 spdk_poller_unregister(&desc->io_timeout_poller); 3870 3871 if (timeout_in_sec) { 3872 assert(cb_fn != NULL); 3873 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3874 desc, 3875 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3876 1000); 3877 if (desc->io_timeout_poller == NULL) { 3878 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3879 return -1; 3880 } 3881 } 3882 3883 desc->cb_fn = cb_fn; 3884 desc->cb_arg = cb_arg; 3885 desc->timeout_in_sec = timeout_in_sec; 3886 3887 return 0; 3888 } 3889 3890 static int 3891 bdev_channel_create(void *io_device, void *ctx_buf) 3892 { 3893 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3894 struct spdk_bdev_channel *ch = ctx_buf; 3895 struct spdk_io_channel *mgmt_io_ch; 3896 struct spdk_bdev_mgmt_channel *mgmt_ch; 3897 struct spdk_bdev_shared_resource *shared_resource; 3898 struct lba_range *range; 3899 3900 ch->bdev = bdev; 3901 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3902 if (!ch->channel) { 3903 return -1; 3904 } 3905 3906 ch->accel_channel = spdk_accel_get_io_channel(); 3907 if (!ch->accel_channel) { 3908 spdk_put_io_channel(ch->channel); 3909 return -1; 3910 } 3911 3912 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3913 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3914 3915 assert(ch->histogram == NULL); 3916 if (bdev->internal.histogram_enabled) { 3917 ch->histogram = spdk_histogram_data_alloc(); 3918 if (ch->histogram == NULL) { 3919 SPDK_ERRLOG("Could not allocate histogram\n"); 3920 } 3921 } 3922 3923 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3924 if (!mgmt_io_ch) { 3925 spdk_put_io_channel(ch->channel); 3926 spdk_put_io_channel(ch->accel_channel); 3927 return -1; 3928 } 3929 3930 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3931 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3932 if (shared_resource->shared_ch == ch->channel) { 3933 spdk_put_io_channel(mgmt_io_ch); 3934 shared_resource->ref++; 3935 break; 3936 } 3937 } 3938 3939 if (shared_resource == NULL) { 3940 shared_resource = calloc(1, sizeof(*shared_resource)); 3941 if (shared_resource == NULL) { 3942 spdk_put_io_channel(ch->channel); 3943 spdk_put_io_channel(ch->accel_channel); 3944 spdk_put_io_channel(mgmt_io_ch); 3945 return -1; 3946 } 3947 3948 shared_resource->mgmt_ch = mgmt_ch; 3949 shared_resource->io_outstanding = 0; 3950 TAILQ_INIT(&shared_resource->nomem_io); 3951 shared_resource->nomem_threshold = 0; 3952 shared_resource->shared_ch = ch->channel; 3953 shared_resource->ref = 1; 3954 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3955 } 3956 3957 ch->io_outstanding = 0; 3958 TAILQ_INIT(&ch->queued_resets); 3959 TAILQ_INIT(&ch->locked_ranges); 3960 ch->flags = 0; 3961 ch->shared_resource = shared_resource; 3962 3963 TAILQ_INIT(&ch->io_submitted); 3964 TAILQ_INIT(&ch->io_locked); 3965 TAILQ_INIT(&ch->io_accel_exec); 3966 TAILQ_INIT(&ch->io_memory_domain); 3967 3968 ch->stat = bdev_alloc_io_stat(false); 3969 if (ch->stat == NULL) { 3970 
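/* The shared_resource is already wired up at this point, so a single
 * bdev_channel_destroy_resource() call unwinds everything acquired so far
 * (the I/O and accel channels plus the shared resource reference) rather
 * than the piecemeal spdk_put_io_channel() cleanup used for the earlier
 * failure paths in this function. */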
bdev_channel_destroy_resource(ch); 3971 return -1; 3972 } 3973 3974 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3975 3976 #ifdef SPDK_CONFIG_VTUNE 3977 { 3978 char *name; 3979 __itt_init_ittlib(NULL, 0); 3980 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3981 if (!name) { 3982 bdev_channel_destroy_resource(ch); 3983 return -1; 3984 } 3985 ch->handle = __itt_string_handle_create(name); 3986 free(name); 3987 ch->start_tsc = spdk_get_ticks(); 3988 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3989 ch->prev_stat = bdev_alloc_io_stat(false); 3990 if (ch->prev_stat == NULL) { 3991 bdev_channel_destroy_resource(ch); 3992 return -1; 3993 } 3994 } 3995 #endif 3996 3997 spdk_spin_lock(&bdev->internal.spinlock); 3998 bdev_enable_qos(bdev, ch); 3999 4000 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4001 struct lba_range *new_range; 4002 4003 new_range = calloc(1, sizeof(*new_range)); 4004 if (new_range == NULL) { 4005 spdk_spin_unlock(&bdev->internal.spinlock); 4006 bdev_channel_destroy_resource(ch); 4007 return -1; 4008 } 4009 new_range->length = range->length; 4010 new_range->offset = range->offset; 4011 new_range->locked_ctx = range->locked_ctx; 4012 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4013 } 4014 4015 spdk_spin_unlock(&bdev->internal.spinlock); 4016 4017 return 0; 4018 } 4019 4020 static int 4021 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4022 void *cb_ctx) 4023 { 4024 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4025 struct spdk_bdev_io *bdev_io; 4026 uint64_t buf_len; 4027 4028 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4029 if (bdev_io->internal.ch == bdev_ch) { 4030 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4031 spdk_iobuf_entry_abort(ch, entry, buf_len); 4032 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4033 } 4034 4035 return 0; 4036 } 4037 4038 /* 4039 * Abort I/O that are waiting on a data buffer. 4040 */ 4041 static void 4042 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4043 { 4044 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4045 bdev_abort_all_buf_io_cb, ch); 4046 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4047 bdev_abort_all_buf_io_cb, ch); 4048 } 4049 4050 /* 4051 * Abort I/O that are queued waiting for submission. These types of I/O are 4052 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4053 */ 4054 static void 4055 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4056 { 4057 struct spdk_bdev_io *bdev_io, *tmp; 4058 4059 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4060 if (bdev_io->internal.ch == ch) { 4061 TAILQ_REMOVE(queue, bdev_io, internal.link); 4062 /* 4063 * spdk_bdev_io_complete() assumes that the completed I/O had 4064 * been submitted to the bdev module. Since in this case it 4065 * hadn't, bump io_outstanding to account for the decrement 4066 * that spdk_bdev_io_complete() will do. 
4067 */ 4068 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4069 bdev_io_increment_outstanding(ch, ch->shared_resource); 4070 } 4071 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4072 } 4073 } 4074 } 4075 4076 static bool 4077 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4078 { 4079 struct spdk_bdev_io *bdev_io; 4080 4081 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4082 if (bdev_io == bio_to_abort) { 4083 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4084 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4085 return true; 4086 } 4087 } 4088 4089 return false; 4090 } 4091 4092 static int 4093 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4094 { 4095 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4096 uint64_t buf_len; 4097 4098 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4099 if (bdev_io == bio_to_abort) { 4100 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4101 spdk_iobuf_entry_abort(ch, entry, buf_len); 4102 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4103 return 1; 4104 } 4105 4106 return 0; 4107 } 4108 4109 static bool 4110 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4111 { 4112 int rc; 4113 4114 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4115 bdev_abort_buf_io_cb, bio_to_abort); 4116 if (rc == 1) { 4117 return true; 4118 } 4119 4120 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4121 bdev_abort_buf_io_cb, bio_to_abort); 4122 return rc == 1; 4123 } 4124 4125 static void 4126 bdev_qos_channel_destroy(void *cb_arg) 4127 { 4128 struct spdk_bdev_qos *qos = cb_arg; 4129 4130 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4131 spdk_poller_unregister(&qos->poller); 4132 4133 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4134 4135 free(qos); 4136 } 4137 4138 static int 4139 bdev_qos_destroy(struct spdk_bdev *bdev) 4140 { 4141 int i; 4142 4143 /* 4144 * Cleanly shutting down the QoS poller is tricky, because 4145 * during the asynchronous operation the user could open 4146 * a new descriptor and create a new channel, spawning 4147 * a new QoS poller. 4148 * 4149 * The strategy is to create a new QoS structure here and swap it 4150 * in. The shutdown path then continues to refer to the old one 4151 * until it completes and then releases it. 4152 */ 4153 struct spdk_bdev_qos *new_qos, *old_qos; 4154 4155 old_qos = bdev->internal.qos; 4156 4157 new_qos = calloc(1, sizeof(*new_qos)); 4158 if (!new_qos) { 4159 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4160 return -ENOMEM; 4161 } 4162 4163 /* Copy the old QoS data into the newly allocated structure */ 4164 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4165 4166 /* Zero out the key parts of the QoS structure */ 4167 new_qos->ch = NULL; 4168 new_qos->thread = NULL; 4169 new_qos->poller = NULL; 4170 TAILQ_INIT(&new_qos->queued); 4171 /* 4172 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4173 * It will be used later for the new QoS structure. 
4174 */ 4175 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4176 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4177 new_qos->rate_limits[i].min_per_timeslice = 0; 4178 new_qos->rate_limits[i].max_per_timeslice = 0; 4179 } 4180 4181 bdev->internal.qos = new_qos; 4182 4183 if (old_qos->thread == NULL) { 4184 free(old_qos); 4185 } else { 4186 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4187 } 4188 4189 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4190 * been destroyed yet. The destruction path will end up waiting for the final 4191 * channel to be put before it releases resources. */ 4192 4193 return 0; 4194 } 4195 4196 void 4197 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4198 { 4199 total->bytes_read += add->bytes_read; 4200 total->num_read_ops += add->num_read_ops; 4201 total->bytes_written += add->bytes_written; 4202 total->num_write_ops += add->num_write_ops; 4203 total->bytes_unmapped += add->bytes_unmapped; 4204 total->num_unmap_ops += add->num_unmap_ops; 4205 total->bytes_copied += add->bytes_copied; 4206 total->num_copy_ops += add->num_copy_ops; 4207 total->read_latency_ticks += add->read_latency_ticks; 4208 total->write_latency_ticks += add->write_latency_ticks; 4209 total->unmap_latency_ticks += add->unmap_latency_ticks; 4210 total->copy_latency_ticks += add->copy_latency_ticks; 4211 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4212 total->max_read_latency_ticks = add->max_read_latency_ticks; 4213 } 4214 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4215 total->min_read_latency_ticks = add->min_read_latency_ticks; 4216 } 4217 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4218 total->max_write_latency_ticks = add->max_write_latency_ticks; 4219 } 4220 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4221 total->min_write_latency_ticks = add->min_write_latency_ticks; 4222 } 4223 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4224 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4225 } 4226 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4227 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4228 } 4229 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4230 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4231 } 4232 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4233 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4234 } 4235 } 4236 4237 static void 4238 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4239 { 4240 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4241 4242 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4243 memcpy(to_stat->io_error, from_stat->io_error, 4244 sizeof(struct spdk_bdev_io_error_stat)); 4245 } 4246 } 4247 4248 void 4249 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4250 { 4251 stat->max_read_latency_ticks = 0; 4252 stat->min_read_latency_ticks = UINT64_MAX; 4253 stat->max_write_latency_ticks = 0; 4254 stat->min_write_latency_ticks = UINT64_MAX; 4255 stat->max_unmap_latency_ticks = 0; 4256 stat->min_unmap_latency_ticks = UINT64_MAX; 4257 stat->max_copy_latency_ticks = 0; 4258 stat->min_copy_latency_ticks = UINT64_MAX; 4259 4260 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4261 return; 4262 } 
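/* Only SPDK_BDEV_RESET_STAT_ALL reaches this point: besides the min/max
 * latency fields reset above, the cumulative byte, operation and latency
 * counters below are cleared as well.  Illustrative sketch (hypothetical
 * caller, not part of this file) of re-zeroing an aggregated stat between
 * two sampling intervals:
 *
 *     struct spdk_bdev_io_stat *agg = bdev_alloc_io_stat(false);
 *
 *     if (agg != NULL) {
 *             spdk_bdev_add_io_stat(agg, ch->stat);
 *             spdk_bdev_reset_io_stat(agg, SPDK_BDEV_RESET_STAT_ALL);
 *     }
 */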
4263 4264 stat->bytes_read = 0; 4265 stat->num_read_ops = 0; 4266 stat->bytes_written = 0; 4267 stat->num_write_ops = 0; 4268 stat->bytes_unmapped = 0; 4269 stat->num_unmap_ops = 0; 4270 stat->bytes_copied = 0; 4271 stat->num_copy_ops = 0; 4272 stat->read_latency_ticks = 0; 4273 stat->write_latency_ticks = 0; 4274 stat->unmap_latency_ticks = 0; 4275 stat->copy_latency_ticks = 0; 4276 4277 if (stat->io_error != NULL) { 4278 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4279 } 4280 } 4281 4282 struct spdk_bdev_io_stat * 4283 bdev_alloc_io_stat(bool io_error_stat) 4284 { 4285 struct spdk_bdev_io_stat *stat; 4286 4287 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4288 if (stat == NULL) { 4289 return NULL; 4290 } 4291 4292 if (io_error_stat) { 4293 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4294 if (stat->io_error == NULL) { 4295 free(stat); 4296 return NULL; 4297 } 4298 } else { 4299 stat->io_error = NULL; 4300 } 4301 4302 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4303 4304 return stat; 4305 } 4306 4307 void 4308 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4309 { 4310 if (stat != NULL) { 4311 free(stat->io_error); 4312 free(stat); 4313 } 4314 } 4315 4316 void 4317 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4318 { 4319 int i; 4320 4321 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4322 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4323 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4324 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4325 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4326 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4327 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4328 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4329 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4330 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4331 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4332 stat->min_read_latency_ticks != UINT64_MAX ? 4333 stat->min_read_latency_ticks : 0); 4334 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4335 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4336 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4337 stat->min_write_latency_ticks != UINT64_MAX ? 4338 stat->min_write_latency_ticks : 0); 4339 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4340 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4341 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4342 stat->min_unmap_latency_ticks != UINT64_MAX ? 4343 stat->min_unmap_latency_ticks : 0); 4344 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4345 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4346 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4347 stat->min_copy_latency_ticks != UINT64_MAX ? 
4348 stat->min_copy_latency_ticks : 0); 4349 4350 if (stat->io_error != NULL) { 4351 spdk_json_write_named_object_begin(w, "io_error"); 4352 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4353 if (stat->io_error->error_status[i] != 0) { 4354 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4355 stat->io_error->error_status[i]); 4356 } 4357 } 4358 spdk_json_write_object_end(w); 4359 } 4360 } 4361 4362 static void 4363 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4364 { 4365 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4366 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4367 4368 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4369 bdev_abort_all_buf_io(mgmt_ch, ch); 4370 } 4371 4372 static void 4373 bdev_channel_destroy(void *io_device, void *ctx_buf) 4374 { 4375 struct spdk_bdev_channel *ch = ctx_buf; 4376 4377 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4378 spdk_get_thread()); 4379 4380 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 4381 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4382 4383 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4384 spdk_spin_lock(&ch->bdev->internal.spinlock); 4385 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4386 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4387 4388 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4389 4390 bdev_channel_abort_queued_ios(ch); 4391 4392 if (ch->histogram) { 4393 spdk_histogram_data_free(ch->histogram); 4394 } 4395 4396 bdev_channel_destroy_resource(ch); 4397 } 4398 4399 /* 4400 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4401 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
4402 */ 4403 static int 4404 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4405 { 4406 struct spdk_bdev_name *tmp; 4407 4408 bdev_name->name = strdup(name); 4409 if (bdev_name->name == NULL) { 4410 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4411 return -ENOMEM; 4412 } 4413 4414 bdev_name->bdev = bdev; 4415 4416 spdk_spin_lock(&g_bdev_mgr.spinlock); 4417 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4418 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4419 4420 if (tmp != NULL) { 4421 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4422 free(bdev_name->name); 4423 return -EEXIST; 4424 } 4425 4426 return 0; 4427 } 4428 4429 static void 4430 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4431 { 4432 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4433 free(bdev_name->name); 4434 } 4435 4436 static void 4437 bdev_name_del(struct spdk_bdev_name *bdev_name) 4438 { 4439 spdk_spin_lock(&g_bdev_mgr.spinlock); 4440 bdev_name_del_unsafe(bdev_name); 4441 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4442 } 4443 4444 int 4445 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4446 { 4447 struct spdk_bdev_alias *tmp; 4448 int ret; 4449 4450 if (alias == NULL) { 4451 SPDK_ERRLOG("Empty alias passed\n"); 4452 return -EINVAL; 4453 } 4454 4455 tmp = calloc(1, sizeof(*tmp)); 4456 if (tmp == NULL) { 4457 SPDK_ERRLOG("Unable to allocate alias\n"); 4458 return -ENOMEM; 4459 } 4460 4461 ret = bdev_name_add(&tmp->alias, bdev, alias); 4462 if (ret != 0) { 4463 free(tmp); 4464 return ret; 4465 } 4466 4467 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4468 4469 return 0; 4470 } 4471 4472 static int 4473 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4474 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4475 { 4476 struct spdk_bdev_alias *tmp; 4477 4478 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4479 if (strcmp(alias, tmp->alias.name) == 0) { 4480 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4481 alias_del_fn(&tmp->alias); 4482 free(tmp); 4483 return 0; 4484 } 4485 } 4486 4487 return -ENOENT; 4488 } 4489 4490 int 4491 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4492 { 4493 int rc; 4494 4495 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4496 if (rc == -ENOENT) { 4497 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4498 } 4499 4500 return rc; 4501 } 4502 4503 void 4504 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4505 { 4506 struct spdk_bdev_alias *p, *tmp; 4507 4508 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4509 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4510 bdev_name_del(&p->alias); 4511 free(p); 4512 } 4513 } 4514 4515 struct spdk_io_channel * 4516 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4517 { 4518 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4519 } 4520 4521 void * 4522 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4523 { 4524 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4525 void *ctx = NULL; 4526 4527 if (bdev->fn_table->get_module_ctx) { 4528 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4529 } 4530 4531 return ctx; 4532 } 4533 4534 const char * 4535 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4536 { 4537 return bdev->module->name; 4538 } 4539 4540 const char * 4541 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4542 { 4543 return bdev->name; 4544 } 4545 4546 const char * 4547 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4548 { 4549 return bdev->product_name; 4550 } 4551 4552 
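/* Illustrative sketch (hypothetical caller, not part of this file) of the
 * alias helpers defined above.  "my_alias" is an arbitrary example name;
 * the return codes match the implementations: -EEXIST if the name already
 * exists in the global bdev name tree and -ENOENT if the alias is not
 * found on removal.
 *
 *     int rc;
 *
 *     rc = spdk_bdev_alias_add(bdev, "my_alias");
 *     if (rc == 0) {
 *             rc = spdk_bdev_alias_del(bdev, "my_alias");
 *     }
 */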
const struct spdk_bdev_aliases_list * 4553 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4554 { 4555 return &bdev->aliases; 4556 } 4557 4558 uint32_t 4559 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4560 { 4561 return bdev->blocklen; 4562 } 4563 4564 uint32_t 4565 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4566 { 4567 return bdev->write_unit_size; 4568 } 4569 4570 uint64_t 4571 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4572 { 4573 return bdev->blockcnt; 4574 } 4575 4576 const char * 4577 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4578 { 4579 return qos_rpc_type[type]; 4580 } 4581 4582 void 4583 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4584 { 4585 int i; 4586 4587 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4588 4589 spdk_spin_lock(&bdev->internal.spinlock); 4590 if (bdev->internal.qos) { 4591 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4592 if (bdev->internal.qos->rate_limits[i].limit != 4593 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4594 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4595 if (bdev_qos_is_iops_rate_limit(i) == false) { 4596 /* Change from Byte to Megabyte which is user visible. */ 4597 limits[i] = limits[i] / 1024 / 1024; 4598 } 4599 } 4600 } 4601 } 4602 spdk_spin_unlock(&bdev->internal.spinlock); 4603 } 4604 4605 size_t 4606 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4607 { 4608 return 1 << bdev->required_alignment; 4609 } 4610 4611 uint32_t 4612 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4613 { 4614 return bdev->optimal_io_boundary; 4615 } 4616 4617 bool 4618 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4619 { 4620 return bdev->write_cache; 4621 } 4622 4623 const struct spdk_uuid * 4624 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4625 { 4626 return &bdev->uuid; 4627 } 4628 4629 uint16_t 4630 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4631 { 4632 return bdev->acwu; 4633 } 4634 4635 uint32_t 4636 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4637 { 4638 return bdev->md_len; 4639 } 4640 4641 bool 4642 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4643 { 4644 return (bdev->md_len != 0) && bdev->md_interleave; 4645 } 4646 4647 bool 4648 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4649 { 4650 return (bdev->md_len != 0) && !bdev->md_interleave; 4651 } 4652 4653 bool 4654 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4655 { 4656 return bdev->zoned; 4657 } 4658 4659 uint32_t 4660 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4661 { 4662 if (spdk_bdev_is_md_interleaved(bdev)) { 4663 return bdev->blocklen - bdev->md_len; 4664 } else { 4665 return bdev->blocklen; 4666 } 4667 } 4668 4669 uint32_t 4670 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4671 { 4672 return bdev->phys_blocklen; 4673 } 4674 4675 static uint32_t 4676 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4677 { 4678 if (!spdk_bdev_is_md_interleaved(bdev)) { 4679 return bdev->blocklen + bdev->md_len; 4680 } else { 4681 return bdev->blocklen; 4682 } 4683 } 4684 4685 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4686 typedef enum spdk_dif_type spdk_dif_type_t; 4687 4688 spdk_dif_type_t 4689 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4690 { 4691 if (bdev->md_len != 0) { 4692 return bdev->dif_type; 4693 } else { 4694 return SPDK_DIF_DISABLE; 4695 } 4696 } 4697 4698 bool 4699 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4700 { 4701 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4702 return bdev->dif_is_head_of_md; 4703 } else { 4704 return false; 4705 } 4706 } 4707 4708 bool 4709 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4710 enum spdk_dif_check_type check_type) 4711 { 4712 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4713 return false; 4714 } 4715 4716 switch (check_type) { 4717 case SPDK_DIF_CHECK_TYPE_REFTAG: 4718 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4719 case SPDK_DIF_CHECK_TYPE_APPTAG: 4720 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4721 case SPDK_DIF_CHECK_TYPE_GUARD: 4722 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4723 default: 4724 return false; 4725 } 4726 } 4727 4728 static uint32_t 4729 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4730 { 4731 uint64_t aligned_length, max_write_blocks; 4732 4733 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4734 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4735 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4736 4737 return max_write_blocks; 4738 } 4739 4740 uint32_t 4741 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4742 { 4743 return bdev->max_copy; 4744 } 4745 4746 uint64_t 4747 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4748 { 4749 return bdev->internal.measured_queue_depth; 4750 } 4751 4752 uint64_t 4753 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4754 { 4755 return bdev->internal.period; 4756 } 4757 4758 uint64_t 4759 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4760 { 4761 return bdev->internal.weighted_io_time; 4762 } 4763 4764 uint64_t 4765 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4766 { 4767 return bdev->internal.io_time; 4768 } 4769 4770 static void bdev_update_qd_sampling_period(void *ctx); 4771 4772 static void 4773 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4774 { 4775 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4776 4777 if (bdev->internal.measured_queue_depth) { 4778 bdev->internal.io_time += bdev->internal.period; 4779 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4780 } 4781 4782 bdev->internal.qd_poll_in_progress = false; 4783 4784 bdev_update_qd_sampling_period(bdev); 4785 } 4786 4787 static void 4788 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4789 struct spdk_io_channel *io_ch, void *_ctx) 4790 { 4791 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4792 4793 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4794 spdk_bdev_for_each_channel_continue(i, 0); 4795 } 4796 4797 static int 4798 bdev_calculate_measured_queue_depth(void *ctx) 4799 { 4800 struct spdk_bdev *bdev = ctx; 4801 4802 bdev->internal.qd_poll_in_progress = true; 4803 bdev->internal.temporary_queue_depth = 0; 4804 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4805 return SPDK_POLLER_BUSY; 4806 } 4807 4808 static void 4809 bdev_update_qd_sampling_period(void *ctx) 4810 { 4811 
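/* Applies a pending change to the queue depth sampling period.  The switch
 * is deferred while a measurement round is still in flight
 * (qd_poll_in_progress); once applied, the poller is re-registered with the
 * new period, or unregistered and the internal descriptor closed when the
 * new period is 0.  Illustrative sketch of the public entry point
 * (hypothetical caller, period in microseconds):
 *
 *     spdk_bdev_set_qd_sampling_period(bdev, 1000);
 *     spdk_bdev_set_qd_sampling_period(bdev, 0);
 */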
struct spdk_bdev *bdev = ctx; 4812 4813 if (bdev->internal.period == bdev->internal.new_period) { 4814 return; 4815 } 4816 4817 if (bdev->internal.qd_poll_in_progress) { 4818 return; 4819 } 4820 4821 bdev->internal.period = bdev->internal.new_period; 4822 4823 spdk_poller_unregister(&bdev->internal.qd_poller); 4824 if (bdev->internal.period != 0) { 4825 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4826 bdev, bdev->internal.period); 4827 } else { 4828 spdk_bdev_close(bdev->internal.qd_desc); 4829 bdev->internal.qd_desc = NULL; 4830 } 4831 } 4832 4833 static void 4834 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4835 { 4836 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4837 } 4838 4839 void 4840 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4841 { 4842 int rc; 4843 4844 if (bdev->internal.new_period == period) { 4845 return; 4846 } 4847 4848 bdev->internal.new_period = period; 4849 4850 if (bdev->internal.qd_desc != NULL) { 4851 assert(bdev->internal.period != 0); 4852 4853 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4854 bdev_update_qd_sampling_period, bdev); 4855 return; 4856 } 4857 4858 assert(bdev->internal.period == 0); 4859 4860 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4861 NULL, &bdev->internal.qd_desc); 4862 if (rc != 0) { 4863 return; 4864 } 4865 4866 bdev->internal.period = period; 4867 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4868 bdev, period); 4869 } 4870 4871 struct bdev_get_current_qd_ctx { 4872 uint64_t current_qd; 4873 spdk_bdev_get_current_qd_cb cb_fn; 4874 void *cb_arg; 4875 }; 4876 4877 static void 4878 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4879 { 4880 struct bdev_get_current_qd_ctx *ctx = _ctx; 4881 4882 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4883 4884 free(ctx); 4885 } 4886 4887 static void 4888 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4889 struct spdk_io_channel *io_ch, void *_ctx) 4890 { 4891 struct bdev_get_current_qd_ctx *ctx = _ctx; 4892 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4893 4894 ctx->current_qd += bdev_ch->io_outstanding; 4895 4896 spdk_bdev_for_each_channel_continue(i, 0); 4897 } 4898 4899 void 4900 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4901 void *cb_arg) 4902 { 4903 struct bdev_get_current_qd_ctx *ctx; 4904 4905 assert(cb_fn != NULL); 4906 4907 ctx = calloc(1, sizeof(*ctx)); 4908 if (ctx == NULL) { 4909 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4910 return; 4911 } 4912 4913 ctx->cb_fn = cb_fn; 4914 ctx->cb_arg = cb_arg; 4915 4916 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4917 } 4918 4919 static void 4920 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4921 { 4922 assert(desc->thread == spdk_get_thread()); 4923 4924 spdk_spin_lock(&desc->spinlock); 4925 desc->refs--; 4926 if (!desc->closed) { 4927 spdk_spin_unlock(&desc->spinlock); 4928 desc->callback.event_fn(type, 4929 desc->bdev, 4930 desc->callback.ctx); 4931 return; 4932 } else if (desc->refs == 0) { 4933 /* This descriptor was closed after this event_notify message was sent. 4934 * spdk_bdev_close() could not free the descriptor since this message was 4935 * in flight, so we free it now using bdev_desc_free(). 
4936 */ 4937 spdk_spin_unlock(&desc->spinlock); 4938 bdev_desc_free(desc); 4939 return; 4940 } 4941 spdk_spin_unlock(&desc->spinlock); 4942 } 4943 4944 static void 4945 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 4946 { 4947 spdk_spin_lock(&desc->spinlock); 4948 desc->refs++; 4949 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 4950 spdk_spin_unlock(&desc->spinlock); 4951 } 4952 4953 static void 4954 _resize_notify(void *ctx) 4955 { 4956 struct spdk_bdev_desc *desc = ctx; 4957 4958 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 4959 } 4960 4961 int 4962 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4963 { 4964 struct spdk_bdev_desc *desc; 4965 int ret; 4966 4967 if (size == bdev->blockcnt) { 4968 return 0; 4969 } 4970 4971 spdk_spin_lock(&bdev->internal.spinlock); 4972 4973 /* bdev has open descriptors */ 4974 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4975 bdev->blockcnt > size) { 4976 ret = -EBUSY; 4977 } else { 4978 bdev->blockcnt = size; 4979 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4980 event_notify(desc, _resize_notify); 4981 } 4982 ret = 0; 4983 } 4984 4985 spdk_spin_unlock(&bdev->internal.spinlock); 4986 4987 return ret; 4988 } 4989 4990 /* 4991 * Convert I/O offset and length from bytes to blocks. 4992 * 4993 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4994 */ 4995 static uint64_t 4996 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4997 uint64_t num_bytes, uint64_t *num_blocks) 4998 { 4999 uint32_t block_size = bdev->blocklen; 5000 uint8_t shift_cnt; 5001 5002 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 5003 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5004 shift_cnt = spdk_u32log2(block_size); 5005 *offset_blocks = offset_bytes >> shift_cnt; 5006 *num_blocks = num_bytes >> shift_cnt; 5007 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5008 (num_bytes - (*num_blocks << shift_cnt)); 5009 } else { 5010 *offset_blocks = offset_bytes / block_size; 5011 *num_blocks = num_bytes / block_size; 5012 return (offset_bytes % block_size) | (num_bytes % block_size); 5013 } 5014 } 5015 5016 static bool 5017 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5018 { 5019 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5020 * has been an overflow and hence the offset has been wrapped around */ 5021 if (offset_blocks + num_blocks < offset_blocks) { 5022 return false; 5023 } 5024 5025 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5026 if (offset_blocks + num_blocks > bdev->blockcnt) { 5027 return false; 5028 } 5029 5030 return true; 5031 } 5032 5033 static void 5034 bdev_seek_complete_cb(void *ctx) 5035 { 5036 struct spdk_bdev_io *bdev_io = ctx; 5037 5038 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5039 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5040 } 5041 5042 static int 5043 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5044 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5045 spdk_bdev_io_completion_cb cb, void *cb_arg) 5046 { 5047 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5048 struct spdk_bdev_io *bdev_io; 5049 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5050 5051 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == 
SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5052 5053 /* Check if offset_blocks is valid looking at the validity of one block */ 5054 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5055 return -EINVAL; 5056 } 5057 5058 bdev_io = bdev_channel_get_io(channel); 5059 if (!bdev_io) { 5060 return -ENOMEM; 5061 } 5062 5063 bdev_io->internal.ch = channel; 5064 bdev_io->internal.desc = desc; 5065 bdev_io->type = io_type; 5066 bdev_io->u.bdev.offset_blocks = offset_blocks; 5067 bdev_io->u.bdev.memory_domain = NULL; 5068 bdev_io->u.bdev.memory_domain_ctx = NULL; 5069 bdev_io->u.bdev.accel_sequence = NULL; 5070 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5071 5072 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5073 /* In case bdev doesn't support seek to next data/hole offset, 5074 * it is assumed that only data and no holes are present */ 5075 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5076 bdev_io->u.bdev.seek.offset = offset_blocks; 5077 } else { 5078 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5079 } 5080 5081 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5082 return 0; 5083 } 5084 5085 bdev_io_submit(bdev_io); 5086 return 0; 5087 } 5088 5089 int 5090 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5091 uint64_t offset_blocks, 5092 spdk_bdev_io_completion_cb cb, void *cb_arg) 5093 { 5094 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5095 } 5096 5097 int 5098 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5099 uint64_t offset_blocks, 5100 spdk_bdev_io_completion_cb cb, void *cb_arg) 5101 { 5102 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5103 } 5104 5105 uint64_t 5106 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 5107 { 5108 return bdev_io->u.bdev.seek.offset; 5109 } 5110 5111 static int 5112 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5113 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5114 spdk_bdev_io_completion_cb cb, void *cb_arg) 5115 { 5116 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5117 struct spdk_bdev_io *bdev_io; 5118 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5119 5120 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5121 return -EINVAL; 5122 } 5123 5124 bdev_io = bdev_channel_get_io(channel); 5125 if (!bdev_io) { 5126 return -ENOMEM; 5127 } 5128 5129 bdev_io->internal.ch = channel; 5130 bdev_io->internal.desc = desc; 5131 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5132 bdev_io->u.bdev.iovs = &bdev_io->iov; 5133 bdev_io->u.bdev.iovs[0].iov_base = buf; 5134 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5135 bdev_io->u.bdev.iovcnt = 1; 5136 bdev_io->u.bdev.md_buf = md_buf; 5137 bdev_io->u.bdev.num_blocks = num_blocks; 5138 bdev_io->u.bdev.offset_blocks = offset_blocks; 5139 bdev_io->u.bdev.memory_domain = NULL; 5140 bdev_io->u.bdev.memory_domain_ctx = NULL; 5141 bdev_io->u.bdev.accel_sequence = NULL; 5142 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5143 5144 bdev_io_submit(bdev_io); 5145 return 0; 5146 } 5147 5148 int 5149 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5150 void *buf, uint64_t offset, uint64_t nbytes, 5151 spdk_bdev_io_completion_cb cb, void *cb_arg) 5152 { 5153 uint64_t offset_blocks, num_blocks; 5154 5155 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5156 nbytes, &num_blocks) != 0) { 5157 return -EINVAL; 5158 } 5159 5160 
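/* offset and nbytes were validated above as multiples of the block size;
 * the byte-based call now simply delegates to the block-based variant.
 * Illustrative arithmetic (hypothetical 512-byte block size): reading 4096
 * bytes at byte offset 4096 is equivalent to
 *
 *     spdk_bdev_read_blocks(desc, ch, buf, 8, 8, cb, cb_arg);
 *
 * since offset_blocks = 4096 / 512 = 8 and num_blocks = 4096 / 512 = 8.
 */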
return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5161 } 5162 5163 int 5164 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5165 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5166 spdk_bdev_io_completion_cb cb, void *cb_arg) 5167 { 5168 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5169 } 5170 5171 int 5172 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5173 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5174 spdk_bdev_io_completion_cb cb, void *cb_arg) 5175 { 5176 struct iovec iov = { 5177 .iov_base = buf, 5178 }; 5179 5180 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5181 return -EINVAL; 5182 } 5183 5184 if (md_buf && !_is_buf_allocated(&iov)) { 5185 return -EINVAL; 5186 } 5187 5188 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5189 cb, cb_arg); 5190 } 5191 5192 int 5193 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5194 struct iovec *iov, int iovcnt, 5195 uint64_t offset, uint64_t nbytes, 5196 spdk_bdev_io_completion_cb cb, void *cb_arg) 5197 { 5198 uint64_t offset_blocks, num_blocks; 5199 5200 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5201 nbytes, &num_blocks) != 0) { 5202 return -EINVAL; 5203 } 5204 5205 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5206 } 5207 5208 static int 5209 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5210 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5211 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5212 struct spdk_accel_sequence *seq, 5213 spdk_bdev_io_completion_cb cb, void *cb_arg) 5214 { 5215 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5216 struct spdk_bdev_io *bdev_io; 5217 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5218 5219 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5220 return -EINVAL; 5221 } 5222 5223 bdev_io = bdev_channel_get_io(channel); 5224 if (!bdev_io) { 5225 return -ENOMEM; 5226 } 5227 5228 bdev_io->internal.ch = channel; 5229 bdev_io->internal.desc = desc; 5230 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5231 bdev_io->u.bdev.iovs = iov; 5232 bdev_io->u.bdev.iovcnt = iovcnt; 5233 bdev_io->u.bdev.md_buf = md_buf; 5234 bdev_io->u.bdev.num_blocks = num_blocks; 5235 bdev_io->u.bdev.offset_blocks = offset_blocks; 5236 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5237 bdev_io->internal.memory_domain = domain; 5238 bdev_io->internal.memory_domain_ctx = domain_ctx; 5239 bdev_io->internal.accel_sequence = seq; 5240 bdev_io->internal.has_accel_sequence = seq != NULL; 5241 bdev_io->u.bdev.memory_domain = domain; 5242 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5243 bdev_io->u.bdev.accel_sequence = seq; 5244 5245 _bdev_io_submit_ext(desc, bdev_io); 5246 5247 return 0; 5248 } 5249 5250 int 5251 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5252 struct iovec *iov, int iovcnt, 5253 uint64_t offset_blocks, uint64_t num_blocks, 5254 spdk_bdev_io_completion_cb cb, void *cb_arg) 5255 { 5256 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5257 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5258 } 5259 5260 int 5261 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5262 struct 
iovec *iov, int iovcnt, void *md_buf, 5263 uint64_t offset_blocks, uint64_t num_blocks, 5264 spdk_bdev_io_completion_cb cb, void *cb_arg) 5265 { 5266 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5267 return -EINVAL; 5268 } 5269 5270 if (md_buf && !_is_buf_allocated(iov)) { 5271 return -EINVAL; 5272 } 5273 5274 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5275 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5276 } 5277 5278 static inline bool 5279 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5280 { 5281 /* 5282 * We check if opts size is at least of size when we first introduced 5283 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5284 * are not checked internal. 5285 */ 5286 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5287 sizeof(opts->metadata) && 5288 opts->size <= sizeof(*opts) && 5289 /* When memory domain is used, the user must provide data buffers */ 5290 (!opts->memory_domain || (iov && iov[0].iov_base)); 5291 } 5292 5293 int 5294 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5295 struct iovec *iov, int iovcnt, 5296 uint64_t offset_blocks, uint64_t num_blocks, 5297 spdk_bdev_io_completion_cb cb, void *cb_arg, 5298 struct spdk_bdev_ext_io_opts *opts) 5299 { 5300 void *md = NULL; 5301 5302 if (opts) { 5303 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5304 return -EINVAL; 5305 } 5306 md = opts->metadata; 5307 } 5308 5309 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5310 return -EINVAL; 5311 } 5312 5313 if (md && !_is_buf_allocated(iov)) { 5314 return -EINVAL; 5315 } 5316 5317 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5318 num_blocks, 5319 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5320 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5321 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5322 cb, cb_arg); 5323 } 5324 5325 static int 5326 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5327 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5328 spdk_bdev_io_completion_cb cb, void *cb_arg) 5329 { 5330 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5331 struct spdk_bdev_io *bdev_io; 5332 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5333 5334 if (!desc->write) { 5335 return -EBADF; 5336 } 5337 5338 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5339 return -EINVAL; 5340 } 5341 5342 bdev_io = bdev_channel_get_io(channel); 5343 if (!bdev_io) { 5344 return -ENOMEM; 5345 } 5346 5347 bdev_io->internal.ch = channel; 5348 bdev_io->internal.desc = desc; 5349 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5350 bdev_io->u.bdev.iovs = &bdev_io->iov; 5351 bdev_io->u.bdev.iovs[0].iov_base = buf; 5352 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5353 bdev_io->u.bdev.iovcnt = 1; 5354 bdev_io->u.bdev.md_buf = md_buf; 5355 bdev_io->u.bdev.num_blocks = num_blocks; 5356 bdev_io->u.bdev.offset_blocks = offset_blocks; 5357 bdev_io->u.bdev.memory_domain = NULL; 5358 bdev_io->u.bdev.memory_domain_ctx = NULL; 5359 bdev_io->u.bdev.accel_sequence = NULL; 5360 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5361 5362 bdev_io_submit(bdev_io); 5363 return 0; 5364 } 5365 5366 int 5367 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5368 void *buf, uint64_t offset, uint64_t nbytes, 5369 spdk_bdev_io_completion_cb cb, void *cb_arg) 5370 { 5371 uint64_t 
offset_blocks, num_blocks; 5372 5373 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5374 nbytes, &num_blocks) != 0) { 5375 return -EINVAL; 5376 } 5377 5378 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5379 } 5380 5381 int 5382 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5383 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5384 spdk_bdev_io_completion_cb cb, void *cb_arg) 5385 { 5386 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5387 cb, cb_arg); 5388 } 5389 5390 int 5391 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5392 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5393 spdk_bdev_io_completion_cb cb, void *cb_arg) 5394 { 5395 struct iovec iov = { 5396 .iov_base = buf, 5397 }; 5398 5399 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5400 return -EINVAL; 5401 } 5402 5403 if (md_buf && !_is_buf_allocated(&iov)) { 5404 return -EINVAL; 5405 } 5406 5407 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5408 cb, cb_arg); 5409 } 5410 5411 static int 5412 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5413 struct iovec *iov, int iovcnt, void *md_buf, 5414 uint64_t offset_blocks, uint64_t num_blocks, 5415 struct spdk_memory_domain *domain, void *domain_ctx, 5416 struct spdk_accel_sequence *seq, 5417 spdk_bdev_io_completion_cb cb, void *cb_arg) 5418 { 5419 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5420 struct spdk_bdev_io *bdev_io; 5421 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5422 5423 if (!desc->write) { 5424 return -EBADF; 5425 } 5426 5427 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5428 return -EINVAL; 5429 } 5430 5431 bdev_io = bdev_channel_get_io(channel); 5432 if (!bdev_io) { 5433 return -ENOMEM; 5434 } 5435 5436 bdev_io->internal.ch = channel; 5437 bdev_io->internal.desc = desc; 5438 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5439 bdev_io->u.bdev.iovs = iov; 5440 bdev_io->u.bdev.iovcnt = iovcnt; 5441 bdev_io->u.bdev.md_buf = md_buf; 5442 bdev_io->u.bdev.num_blocks = num_blocks; 5443 bdev_io->u.bdev.offset_blocks = offset_blocks; 5444 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5445 bdev_io->internal.memory_domain = domain; 5446 bdev_io->internal.memory_domain_ctx = domain_ctx; 5447 bdev_io->internal.accel_sequence = seq; 5448 bdev_io->internal.has_accel_sequence = seq != NULL; 5449 bdev_io->u.bdev.memory_domain = domain; 5450 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5451 bdev_io->u.bdev.accel_sequence = seq; 5452 5453 _bdev_io_submit_ext(desc, bdev_io); 5454 5455 return 0; 5456 } 5457 5458 int 5459 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5460 struct iovec *iov, int iovcnt, 5461 uint64_t offset, uint64_t len, 5462 spdk_bdev_io_completion_cb cb, void *cb_arg) 5463 { 5464 uint64_t offset_blocks, num_blocks; 5465 5466 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5467 len, &num_blocks) != 0) { 5468 return -EINVAL; 5469 } 5470 5471 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5472 } 5473 5474 int 5475 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5476 struct iovec *iov, int iovcnt, 5477 uint64_t offset_blocks, uint64_t num_blocks, 5478 spdk_bdev_io_completion_cb cb, void *cb_arg) 5479 { 
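/* Plain (non-extended) vectored write: no separate metadata buffer, memory
 * domain or accel sequence is passed down.  Illustrative sketch of a
 * hypothetical caller where the two buffers together cover exactly
 * num_blocks * blocklen bytes:
 *
 *     struct iovec iov[2] = {
 *             { .iov_base = buf_a, .iov_len = len_a },
 *             { .iov_base = buf_b, .iov_len = len_b },
 *     };
 *
 *     spdk_bdev_writev_blocks(desc, ch, iov, 2, offset_blocks, num_blocks,
 *                             write_done_cb, cb_arg);
 */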
5480 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5481 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5482 } 5483 5484 int 5485 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5486 struct iovec *iov, int iovcnt, void *md_buf, 5487 uint64_t offset_blocks, uint64_t num_blocks, 5488 spdk_bdev_io_completion_cb cb, void *cb_arg) 5489 { 5490 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5491 return -EINVAL; 5492 } 5493 5494 if (md_buf && !_is_buf_allocated(iov)) { 5495 return -EINVAL; 5496 } 5497 5498 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5499 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5500 } 5501 5502 int 5503 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5504 struct iovec *iov, int iovcnt, 5505 uint64_t offset_blocks, uint64_t num_blocks, 5506 spdk_bdev_io_completion_cb cb, void *cb_arg, 5507 struct spdk_bdev_ext_io_opts *opts) 5508 { 5509 void *md = NULL; 5510 5511 if (opts) { 5512 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5513 return -EINVAL; 5514 } 5515 md = opts->metadata; 5516 } 5517 5518 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5519 return -EINVAL; 5520 } 5521 5522 if (md && !_is_buf_allocated(iov)) { 5523 return -EINVAL; 5524 } 5525 5526 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5527 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5528 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5529 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5530 cb, cb_arg); 5531 } 5532 5533 static void 5534 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5535 { 5536 struct spdk_bdev_io *parent_io = cb_arg; 5537 struct spdk_bdev *bdev = parent_io->bdev; 5538 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5539 int i, rc = 0; 5540 5541 if (!success) { 5542 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5543 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5544 spdk_bdev_free_io(bdev_io); 5545 return; 5546 } 5547 5548 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5549 rc = memcmp(read_buf, 5550 parent_io->u.bdev.iovs[i].iov_base, 5551 parent_io->u.bdev.iovs[i].iov_len); 5552 if (rc) { 5553 break; 5554 } 5555 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5556 } 5557 5558 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5559 rc = memcmp(bdev_io->u.bdev.md_buf, 5560 parent_io->u.bdev.md_buf, 5561 spdk_bdev_get_md_size(bdev)); 5562 } 5563 5564 spdk_bdev_free_io(bdev_io); 5565 5566 if (rc == 0) { 5567 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5568 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5569 } else { 5570 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5571 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5572 } 5573 } 5574 5575 static void 5576 bdev_compare_do_read(void *_bdev_io) 5577 { 5578 struct spdk_bdev_io *bdev_io = _bdev_io; 5579 int rc; 5580 5581 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5582 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5583 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5584 bdev_compare_do_read_done, bdev_io); 5585 5586 if (rc == -ENOMEM) { 5587 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5588 } else if (rc != 0) { 5589 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5590 
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5591 } 5592 } 5593 5594 static int 5595 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5596 struct iovec *iov, int iovcnt, void *md_buf, 5597 uint64_t offset_blocks, uint64_t num_blocks, 5598 spdk_bdev_io_completion_cb cb, void *cb_arg) 5599 { 5600 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5601 struct spdk_bdev_io *bdev_io; 5602 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5603 5604 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5605 return -EINVAL; 5606 } 5607 5608 bdev_io = bdev_channel_get_io(channel); 5609 if (!bdev_io) { 5610 return -ENOMEM; 5611 } 5612 5613 bdev_io->internal.ch = channel; 5614 bdev_io->internal.desc = desc; 5615 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5616 bdev_io->u.bdev.iovs = iov; 5617 bdev_io->u.bdev.iovcnt = iovcnt; 5618 bdev_io->u.bdev.md_buf = md_buf; 5619 bdev_io->u.bdev.num_blocks = num_blocks; 5620 bdev_io->u.bdev.offset_blocks = offset_blocks; 5621 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5622 bdev_io->u.bdev.memory_domain = NULL; 5623 bdev_io->u.bdev.memory_domain_ctx = NULL; 5624 bdev_io->u.bdev.accel_sequence = NULL; 5625 5626 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5627 bdev_io_submit(bdev_io); 5628 return 0; 5629 } 5630 5631 bdev_compare_do_read(bdev_io); 5632 5633 return 0; 5634 } 5635 5636 int 5637 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5638 struct iovec *iov, int iovcnt, 5639 uint64_t offset_blocks, uint64_t num_blocks, 5640 spdk_bdev_io_completion_cb cb, void *cb_arg) 5641 { 5642 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5643 num_blocks, cb, cb_arg); 5644 } 5645 5646 int 5647 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5648 struct iovec *iov, int iovcnt, void *md_buf, 5649 uint64_t offset_blocks, uint64_t num_blocks, 5650 spdk_bdev_io_completion_cb cb, void *cb_arg) 5651 { 5652 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5653 return -EINVAL; 5654 } 5655 5656 if (md_buf && !_is_buf_allocated(iov)) { 5657 return -EINVAL; 5658 } 5659 5660 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5661 num_blocks, cb, cb_arg); 5662 } 5663 5664 static int 5665 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5666 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5667 spdk_bdev_io_completion_cb cb, void *cb_arg) 5668 { 5669 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5670 struct spdk_bdev_io *bdev_io; 5671 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5672 5673 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5674 return -EINVAL; 5675 } 5676 5677 bdev_io = bdev_channel_get_io(channel); 5678 if (!bdev_io) { 5679 return -ENOMEM; 5680 } 5681 5682 bdev_io->internal.ch = channel; 5683 bdev_io->internal.desc = desc; 5684 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5685 bdev_io->u.bdev.iovs = &bdev_io->iov; 5686 bdev_io->u.bdev.iovs[0].iov_base = buf; 5687 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5688 bdev_io->u.bdev.iovcnt = 1; 5689 bdev_io->u.bdev.md_buf = md_buf; 5690 bdev_io->u.bdev.num_blocks = num_blocks; 5691 bdev_io->u.bdev.offset_blocks = offset_blocks; 5692 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5693 bdev_io->u.bdev.memory_domain = NULL; 5694 bdev_io->u.bdev.memory_domain_ctx = 
NULL; 5695 bdev_io->u.bdev.accel_sequence = NULL; 5696 5697 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5698 bdev_io_submit(bdev_io); 5699 return 0; 5700 } 5701 5702 bdev_compare_do_read(bdev_io); 5703 5704 return 0; 5705 } 5706 5707 int 5708 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5709 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5710 spdk_bdev_io_completion_cb cb, void *cb_arg) 5711 { 5712 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5713 cb, cb_arg); 5714 } 5715 5716 int 5717 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5718 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5719 spdk_bdev_io_completion_cb cb, void *cb_arg) 5720 { 5721 struct iovec iov = { 5722 .iov_base = buf, 5723 }; 5724 5725 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5726 return -EINVAL; 5727 } 5728 5729 if (md_buf && !_is_buf_allocated(&iov)) { 5730 return -EINVAL; 5731 } 5732 5733 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5734 cb, cb_arg); 5735 } 5736 5737 static void 5738 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 5739 { 5740 struct spdk_bdev_io *bdev_io = ctx; 5741 5742 if (unlock_status) { 5743 SPDK_ERRLOG("LBA range unlock failed\n"); 5744 } 5745 5746 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5747 false, bdev_io->internal.caller_ctx); 5748 } 5749 5750 static void 5751 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5752 { 5753 bdev_io->internal.status = status; 5754 5755 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5756 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5757 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5758 } 5759 5760 static void 5761 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5762 { 5763 struct spdk_bdev_io *parent_io = cb_arg; 5764 5765 if (!success) { 5766 SPDK_ERRLOG("Compare and write operation failed\n"); 5767 } 5768 5769 spdk_bdev_free_io(bdev_io); 5770 5771 bdev_comparev_and_writev_blocks_unlock(parent_io, 5772 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5773 } 5774 5775 static void 5776 bdev_compare_and_write_do_write(void *_bdev_io) 5777 { 5778 struct spdk_bdev_io *bdev_io = _bdev_io; 5779 int rc; 5780 5781 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5782 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5783 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5784 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5785 bdev_compare_and_write_do_write_done, bdev_io); 5786 5787 5788 if (rc == -ENOMEM) { 5789 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5790 } else if (rc != 0) { 5791 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5792 } 5793 } 5794 5795 static void 5796 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5797 { 5798 struct spdk_bdev_io *parent_io = cb_arg; 5799 5800 spdk_bdev_free_io(bdev_io); 5801 5802 if (!success) { 5803 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5804 return; 5805 } 5806 5807 bdev_compare_and_write_do_write(parent_io); 5808 } 5809 5810 static void 5811 bdev_compare_and_write_do_compare(void *_bdev_io) 5812 { 5813 struct spdk_bdev_io *bdev_io = _bdev_io; 5814 int rc; 5815 5816 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5817 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5818 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5819 bdev_compare_and_write_do_compare_done, bdev_io); 5820 5821 if (rc == -ENOMEM) { 5822 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5823 } else if (rc != 0) { 5824 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5825 } 5826 } 5827 5828 static void 5829 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 5830 { 5831 struct spdk_bdev_io *bdev_io = ctx; 5832 5833 if (status) { 5834 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5835 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5836 return; 5837 } 5838 5839 bdev_compare_and_write_do_compare(bdev_io); 5840 } 5841 5842 int 5843 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5844 struct iovec *compare_iov, int compare_iovcnt, 5845 struct iovec *write_iov, int write_iovcnt, 5846 uint64_t offset_blocks, uint64_t num_blocks, 5847 spdk_bdev_io_completion_cb cb, void *cb_arg) 5848 { 5849 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5850 struct spdk_bdev_io *bdev_io; 5851 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5852 5853 if (!desc->write) { 5854 return -EBADF; 5855 } 5856 5857 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5858 return -EINVAL; 5859 } 5860 5861 if (num_blocks > bdev->acwu) { 5862 return -EINVAL; 5863 } 5864 5865 bdev_io = bdev_channel_get_io(channel); 5866 if (!bdev_io) { 5867 return -ENOMEM; 5868 } 5869 5870 bdev_io->internal.ch = channel; 5871 bdev_io->internal.desc = desc; 5872 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5873 bdev_io->u.bdev.iovs = compare_iov; 5874 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5875 bdev_io->u.bdev.fused_iovs = write_iov; 5876 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5877 bdev_io->u.bdev.md_buf = NULL; 5878 bdev_io->u.bdev.num_blocks = num_blocks; 5879 bdev_io->u.bdev.offset_blocks = offset_blocks; 5880 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5881 
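	/* If the backing module supports COMPARE_AND_WRITE natively, the I/O is submitted
	 * as-is below; otherwise the LBA range is locked and the operation is emulated as
	 * a COMPARE followed by a WRITE (see bdev_comparev_and_writev_blocks_locked()).
	 * Illustrative caller-side sketch (not part of the original source): when the
	 * completion callback reports success == false, a miscompare can be told apart
	 * from other failures with spdk_bdev_io_get_nvme_fused_status():
	 *
	 *   static void
	 *   example_caw_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
	 *   {
	 *           uint32_t cdw0;
	 *           int first_sct, first_sc, second_sct, second_sc;
	 *
	 *           if (!success) {
	 *                   spdk_bdev_io_get_nvme_fused_status(bdev_io, &cdw0, &first_sct,
	 *                                                      &first_sc, &second_sct, &second_sc);
	 *           }
	 *           spdk_bdev_free_io(bdev_io);
	 *   }
	 */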
bdev_io->u.bdev.memory_domain = NULL; 5882 bdev_io->u.bdev.memory_domain_ctx = NULL; 5883 bdev_io->u.bdev.accel_sequence = NULL; 5884 5885 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5886 bdev_io_submit(bdev_io); 5887 return 0; 5888 } 5889 5890 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5891 bdev_comparev_and_writev_blocks_locked, bdev_io); 5892 } 5893 5894 int 5895 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5896 struct iovec *iov, int iovcnt, 5897 uint64_t offset_blocks, uint64_t num_blocks, 5898 bool populate, 5899 spdk_bdev_io_completion_cb cb, void *cb_arg) 5900 { 5901 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5902 struct spdk_bdev_io *bdev_io; 5903 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5904 5905 if (!desc->write) { 5906 return -EBADF; 5907 } 5908 5909 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5910 return -EINVAL; 5911 } 5912 5913 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5914 return -ENOTSUP; 5915 } 5916 5917 bdev_io = bdev_channel_get_io(channel); 5918 if (!bdev_io) { 5919 return -ENOMEM; 5920 } 5921 5922 bdev_io->internal.ch = channel; 5923 bdev_io->internal.desc = desc; 5924 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5925 bdev_io->u.bdev.num_blocks = num_blocks; 5926 bdev_io->u.bdev.offset_blocks = offset_blocks; 5927 bdev_io->u.bdev.iovs = iov; 5928 bdev_io->u.bdev.iovcnt = iovcnt; 5929 bdev_io->u.bdev.md_buf = NULL; 5930 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5931 bdev_io->u.bdev.zcopy.commit = 0; 5932 bdev_io->u.bdev.zcopy.start = 1; 5933 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5934 bdev_io->u.bdev.memory_domain = NULL; 5935 bdev_io->u.bdev.memory_domain_ctx = NULL; 5936 bdev_io->u.bdev.accel_sequence = NULL; 5937 5938 bdev_io_submit(bdev_io); 5939 5940 return 0; 5941 } 5942 5943 int 5944 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5945 spdk_bdev_io_completion_cb cb, void *cb_arg) 5946 { 5947 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5948 return -EINVAL; 5949 } 5950 5951 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 5952 bdev_io->u.bdev.zcopy.start = 0; 5953 bdev_io->internal.caller_ctx = cb_arg; 5954 bdev_io->internal.cb = cb; 5955 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5956 5957 bdev_io_submit(bdev_io); 5958 5959 return 0; 5960 } 5961 5962 int 5963 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5964 uint64_t offset, uint64_t len, 5965 spdk_bdev_io_completion_cb cb, void *cb_arg) 5966 { 5967 uint64_t offset_blocks, num_blocks; 5968 5969 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5970 len, &num_blocks) != 0) { 5971 return -EINVAL; 5972 } 5973 5974 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5975 } 5976 5977 int 5978 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5979 uint64_t offset_blocks, uint64_t num_blocks, 5980 spdk_bdev_io_completion_cb cb, void *cb_arg) 5981 { 5982 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5983 struct spdk_bdev_io *bdev_io; 5984 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5985 5986 if (!desc->write) { 5987 return -EBADF; 5988 } 5989 5990 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5991 return -EINVAL; 5992 } 5993 5994 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5995 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5996 return -ENOTSUP; 5997 } 5998 5999 bdev_io = bdev_channel_get_io(channel); 6000 6001 if (!bdev_io) { 6002 return -ENOMEM; 6003 } 6004 6005 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 6006 bdev_io->internal.ch = channel; 6007 bdev_io->internal.desc = desc; 6008 bdev_io->u.bdev.offset_blocks = offset_blocks; 6009 bdev_io->u.bdev.num_blocks = num_blocks; 6010 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6011 bdev_io->u.bdev.memory_domain = NULL; 6012 bdev_io->u.bdev.memory_domain_ctx = NULL; 6013 bdev_io->u.bdev.accel_sequence = NULL; 6014 6015 /* If the write_zeroes size is large and should be split, use the generic split 6016 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not. 6017 * 6018 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 6019 * or emulate it using a regular write request otherwise. 
6020 */ 6021 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6022 bdev_io->internal.split) { 6023 bdev_io_submit(bdev_io); 6024 return 0; 6025 } 6026 6027 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6028 6029 return bdev_write_zero_buffer(bdev_io); 6030 } 6031 6032 int 6033 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6034 uint64_t offset, uint64_t nbytes, 6035 spdk_bdev_io_completion_cb cb, void *cb_arg) 6036 { 6037 uint64_t offset_blocks, num_blocks; 6038 6039 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6040 nbytes, &num_blocks) != 0) { 6041 return -EINVAL; 6042 } 6043 6044 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6045 } 6046 6047 int 6048 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6049 uint64_t offset_blocks, uint64_t num_blocks, 6050 spdk_bdev_io_completion_cb cb, void *cb_arg) 6051 { 6052 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6053 struct spdk_bdev_io *bdev_io; 6054 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6055 6056 if (!desc->write) { 6057 return -EBADF; 6058 } 6059 6060 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6061 return -EINVAL; 6062 } 6063 6064 if (num_blocks == 0) { 6065 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 6066 return -EINVAL; 6067 } 6068 6069 bdev_io = bdev_channel_get_io(channel); 6070 if (!bdev_io) { 6071 return -ENOMEM; 6072 } 6073 6074 bdev_io->internal.ch = channel; 6075 bdev_io->internal.desc = desc; 6076 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6077 6078 bdev_io->u.bdev.iovs = &bdev_io->iov; 6079 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6080 bdev_io->u.bdev.iovs[0].iov_len = 0; 6081 bdev_io->u.bdev.iovcnt = 1; 6082 6083 bdev_io->u.bdev.offset_blocks = offset_blocks; 6084 bdev_io->u.bdev.num_blocks = num_blocks; 6085 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6086 bdev_io->u.bdev.memory_domain = NULL; 6087 bdev_io->u.bdev.memory_domain_ctx = NULL; 6088 bdev_io->u.bdev.accel_sequence = NULL; 6089 6090 bdev_io_submit(bdev_io); 6091 return 0; 6092 } 6093 6094 int 6095 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6096 uint64_t offset, uint64_t length, 6097 spdk_bdev_io_completion_cb cb, void *cb_arg) 6098 { 6099 uint64_t offset_blocks, num_blocks; 6100 6101 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6102 length, &num_blocks) != 0) { 6103 return -EINVAL; 6104 } 6105 6106 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6107 } 6108 6109 int 6110 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6111 uint64_t offset_blocks, uint64_t num_blocks, 6112 spdk_bdev_io_completion_cb cb, void *cb_arg) 6113 { 6114 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6115 struct spdk_bdev_io *bdev_io; 6116 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6117 6118 if (!desc->write) { 6119 return -EBADF; 6120 } 6121 6122 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6123 return -EINVAL; 6124 } 6125 6126 bdev_io = bdev_channel_get_io(channel); 6127 if (!bdev_io) { 6128 return -ENOMEM; 6129 } 6130 6131 bdev_io->internal.ch = channel; 6132 bdev_io->internal.desc = desc; 6133 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6134 bdev_io->u.bdev.iovs = NULL; 6135 bdev_io->u.bdev.iovcnt = 0; 6136 bdev_io->u.bdev.offset_blocks = offset_blocks; 6137 bdev_io->u.bdev.num_blocks = num_blocks; 6138 
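	/* A flush carries no data buffers (iovs were cleared above); the ext I/O fields
	 * are cleared below before submission. Illustrative caller-side sketch (not part
	 * of the original source; 'desc', 'io_ch' and 'flush_done_cb' are a caller's open
	 * descriptor, I/O channel and spdk_bdev_io_completion_cb): flushing the whole
	 * device could look like
	 *
	 *   spdk_bdev_flush_blocks(desc, io_ch, 0,
	 *                          spdk_bdev_get_num_blocks(spdk_bdev_desc_get_bdev(desc)),
	 *                          flush_done_cb, NULL);
	 */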
bdev_io->u.bdev.memory_domain = NULL; 6139 bdev_io->u.bdev.memory_domain_ctx = NULL; 6140 bdev_io->u.bdev.accel_sequence = NULL; 6141 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6142 6143 bdev_io_submit(bdev_io); 6144 return 0; 6145 } 6146 6147 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6148 6149 static void 6150 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6151 { 6152 struct spdk_bdev_channel *ch = _ctx; 6153 struct spdk_bdev_io *bdev_io; 6154 6155 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6156 6157 if (status == -EBUSY) { 6158 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6159 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6160 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6161 } else { 6162 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6163 6164 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6165 /* If outstanding IOs are still present and reset_io_drain_timeout 6166 * seconds have passed, start the reset. */ 6167 bdev_io_submit_reset(bdev_io); 6168 } else { 6169 /* We still have an in-progress memory domain pull/push or we're 6170 * executing an accel sequence. Since we cannot abort either of those 6171 * operations, fail the reset request. */ 6172 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6173 } 6174 } 6175 } else { 6176 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6177 SPDK_DEBUGLOG(bdev, 6178 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6179 ch->bdev->name); 6180 /* Mark the completion status as SUCCESS and complete the reset. */ 6181 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6182 } 6183 } 6184 6185 static void 6186 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6187 struct spdk_io_channel *io_ch, void *_ctx) 6188 { 6189 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6190 int status = 0; 6191 6192 if (cur_ch->io_outstanding > 0 || 6193 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6194 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6195 /* If a channel has outstanding IO, set the status to -EBUSY. This will stop 6196 * further iteration over the rest of the channels and pass a non-zero status 6197 * to the callback function. 
*/ 6198 status = -EBUSY; 6199 } 6200 spdk_bdev_for_each_channel_continue(i, status); 6201 } 6202 6203 static int 6204 bdev_reset_poll_for_outstanding_io(void *ctx) 6205 { 6206 struct spdk_bdev_channel *ch = ctx; 6207 struct spdk_bdev_io *bdev_io; 6208 6209 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6210 6211 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6212 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6213 bdev_reset_check_outstanding_io_done); 6214 6215 return SPDK_POLLER_BUSY; 6216 } 6217 6218 static void 6219 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6220 { 6221 struct spdk_bdev_channel *ch = _ctx; 6222 struct spdk_bdev_io *bdev_io; 6223 6224 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6225 6226 if (bdev->reset_io_drain_timeout == 0) { 6227 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6228 6229 bdev_io_submit_reset(bdev_io); 6230 return; 6231 } 6232 6233 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6234 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6235 6236 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6237 * submit the reset to the underlying module only if outstanding I/O 6238 * remain after reset_io_drain_timeout seconds have passed. */ 6239 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6240 bdev_reset_check_outstanding_io_done); 6241 } 6242 6243 static void 6244 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6245 struct spdk_io_channel *ch, void *_ctx) 6246 { 6247 struct spdk_bdev_channel *channel; 6248 struct spdk_bdev_mgmt_channel *mgmt_channel; 6249 struct spdk_bdev_shared_resource *shared_resource; 6250 bdev_io_tailq_t tmp_queued; 6251 6252 TAILQ_INIT(&tmp_queued); 6253 6254 channel = __io_ch_to_bdev_ch(ch); 6255 shared_resource = channel->shared_resource; 6256 mgmt_channel = shared_resource->mgmt_ch; 6257 6258 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6259 6260 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6261 /* The QoS object is always valid and readable while 6262 * the channel flag is set, so the lock here should not 6263 * be necessary. We're not in the fast path though, so 6264 * just take it anyway. */ 6265 spdk_spin_lock(&channel->bdev->internal.spinlock); 6266 if (channel->bdev->internal.qos->ch == channel) { 6267 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 6268 } 6269 spdk_spin_unlock(&channel->bdev->internal.spinlock); 6270 } 6271 6272 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6273 bdev_abort_all_buf_io(mgmt_channel, channel); 6274 bdev_abort_all_queued_io(&tmp_queued, channel); 6275 6276 spdk_bdev_for_each_channel_continue(i, 0); 6277 } 6278 6279 static void 6280 bdev_start_reset(void *ctx) 6281 { 6282 struct spdk_bdev_channel *ch = ctx; 6283 6284 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6285 bdev_reset_freeze_channel_done); 6286 } 6287 6288 static void 6289 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6290 { 6291 struct spdk_bdev *bdev = ch->bdev; 6292 6293 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6294 6295 spdk_spin_lock(&bdev->internal.spinlock); 6296 if (bdev->internal.reset_in_progress == NULL) { 6297 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6298 /* 6299 * Take a channel reference for the target bdev for the life of this 6300 * reset. 
This guards against the channel getting destroyed while 6301 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6302 * progress. We will release the reference when this reset is 6303 * completed. 6304 */ 6305 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6306 bdev_start_reset(ch); 6307 } 6308 spdk_spin_unlock(&bdev->internal.spinlock); 6309 } 6310 6311 int 6312 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6313 spdk_bdev_io_completion_cb cb, void *cb_arg) 6314 { 6315 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6316 struct spdk_bdev_io *bdev_io; 6317 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6318 6319 bdev_io = bdev_channel_get_io(channel); 6320 if (!bdev_io) { 6321 return -ENOMEM; 6322 } 6323 6324 bdev_io->internal.ch = channel; 6325 bdev_io->internal.desc = desc; 6326 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6327 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6328 bdev_io->u.reset.ch_ref = NULL; 6329 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6330 6331 spdk_spin_lock(&bdev->internal.spinlock); 6332 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6333 spdk_spin_unlock(&bdev->internal.spinlock); 6334 6335 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6336 internal.ch_link); 6337 6338 bdev_channel_start_reset(channel); 6339 6340 return 0; 6341 } 6342 6343 void 6344 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6345 struct spdk_bdev_io_stat *stat) 6346 { 6347 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6348 6349 bdev_get_io_stat(stat, channel->stat); 6350 } 6351 6352 static void 6353 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6354 { 6355 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6356 6357 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6358 bdev_iostat_ctx->cb_arg, 0); 6359 free(bdev_iostat_ctx); 6360 } 6361 6362 static void 6363 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6364 struct spdk_io_channel *ch, void *_ctx) 6365 { 6366 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6367 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6368 6369 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6370 spdk_bdev_for_each_channel_continue(i, 0); 6371 } 6372 6373 void 6374 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6375 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6376 { 6377 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6378 6379 assert(bdev != NULL); 6380 assert(stat != NULL); 6381 assert(cb != NULL); 6382 6383 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6384 if (bdev_iostat_ctx == NULL) { 6385 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6386 cb(bdev, stat, cb_arg, -ENOMEM); 6387 return; 6388 } 6389 6390 bdev_iostat_ctx->stat = stat; 6391 bdev_iostat_ctx->cb = cb; 6392 bdev_iostat_ctx->cb_arg = cb_arg; 6393 6394 /* Start with the statistics from previously deleted channels. */ 6395 spdk_spin_lock(&bdev->internal.spinlock); 6396 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6397 spdk_spin_unlock(&bdev->internal.spinlock); 6398 6399 /* Then iterate and add the statistics from each existing channel. 
*/ 6400 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6401 bdev_get_device_stat_done); 6402 } 6403 6404 struct bdev_iostat_reset_ctx { 6405 enum spdk_bdev_reset_stat_mode mode; 6406 bdev_reset_device_stat_cb cb; 6407 void *cb_arg; 6408 }; 6409 6410 static void 6411 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6412 { 6413 struct bdev_iostat_reset_ctx *ctx = _ctx; 6414 6415 ctx->cb(bdev, ctx->cb_arg, 0); 6416 6417 free(ctx); 6418 } 6419 6420 static void 6421 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6422 struct spdk_io_channel *ch, void *_ctx) 6423 { 6424 struct bdev_iostat_reset_ctx *ctx = _ctx; 6425 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6426 6427 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6428 6429 spdk_bdev_for_each_channel_continue(i, 0); 6430 } 6431 6432 void 6433 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6434 bdev_reset_device_stat_cb cb, void *cb_arg) 6435 { 6436 struct bdev_iostat_reset_ctx *ctx; 6437 6438 assert(bdev != NULL); 6439 assert(cb != NULL); 6440 6441 ctx = calloc(1, sizeof(*ctx)); 6442 if (ctx == NULL) { 6443 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6444 cb(bdev, cb_arg, -ENOMEM); 6445 return; 6446 } 6447 6448 ctx->mode = mode; 6449 ctx->cb = cb; 6450 ctx->cb_arg = cb_arg; 6451 6452 spdk_spin_lock(&bdev->internal.spinlock); 6453 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6454 spdk_spin_unlock(&bdev->internal.spinlock); 6455 6456 spdk_bdev_for_each_channel(bdev, 6457 bdev_reset_each_channel_stat, 6458 ctx, 6459 bdev_reset_device_stat_done); 6460 } 6461 6462 int 6463 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6464 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6465 spdk_bdev_io_completion_cb cb, void *cb_arg) 6466 { 6467 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6468 struct spdk_bdev_io *bdev_io; 6469 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6470 6471 if (!desc->write) { 6472 return -EBADF; 6473 } 6474 6475 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6476 return -ENOTSUP; 6477 } 6478 6479 bdev_io = bdev_channel_get_io(channel); 6480 if (!bdev_io) { 6481 return -ENOMEM; 6482 } 6483 6484 bdev_io->internal.ch = channel; 6485 bdev_io->internal.desc = desc; 6486 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6487 bdev_io->u.nvme_passthru.cmd = *cmd; 6488 bdev_io->u.nvme_passthru.buf = buf; 6489 bdev_io->u.nvme_passthru.nbytes = nbytes; 6490 bdev_io->u.nvme_passthru.md_buf = NULL; 6491 bdev_io->u.nvme_passthru.md_len = 0; 6492 6493 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6494 6495 bdev_io_submit(bdev_io); 6496 return 0; 6497 } 6498 6499 int 6500 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6501 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6502 spdk_bdev_io_completion_cb cb, void *cb_arg) 6503 { 6504 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6505 struct spdk_bdev_io *bdev_io; 6506 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6507 6508 if (!desc->write) { 6509 /* 6510 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6511 * to easily determine if the command is a read or write, but for now just 6512 * do not allow io_passthru with a read-only descriptor. 
6513 */ 6514 return -EBADF; 6515 } 6516 6517 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6518 return -ENOTSUP; 6519 } 6520 6521 bdev_io = bdev_channel_get_io(channel); 6522 if (!bdev_io) { 6523 return -ENOMEM; 6524 } 6525 6526 bdev_io->internal.ch = channel; 6527 bdev_io->internal.desc = desc; 6528 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6529 bdev_io->u.nvme_passthru.cmd = *cmd; 6530 bdev_io->u.nvme_passthru.buf = buf; 6531 bdev_io->u.nvme_passthru.nbytes = nbytes; 6532 bdev_io->u.nvme_passthru.md_buf = NULL; 6533 bdev_io->u.nvme_passthru.md_len = 0; 6534 6535 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6536 6537 bdev_io_submit(bdev_io); 6538 return 0; 6539 } 6540 6541 int 6542 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6543 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6544 spdk_bdev_io_completion_cb cb, void *cb_arg) 6545 { 6546 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6547 struct spdk_bdev_io *bdev_io; 6548 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6549 6550 if (!desc->write) { 6551 /* 6552 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6553 * to easily determine if the command is a read or write, but for now just 6554 * do not allow io_passthru with a read-only descriptor. 6555 */ 6556 return -EBADF; 6557 } 6558 6559 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6560 return -ENOTSUP; 6561 } 6562 6563 bdev_io = bdev_channel_get_io(channel); 6564 if (!bdev_io) { 6565 return -ENOMEM; 6566 } 6567 6568 bdev_io->internal.ch = channel; 6569 bdev_io->internal.desc = desc; 6570 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6571 bdev_io->u.nvme_passthru.cmd = *cmd; 6572 bdev_io->u.nvme_passthru.buf = buf; 6573 bdev_io->u.nvme_passthru.nbytes = nbytes; 6574 bdev_io->u.nvme_passthru.md_buf = md_buf; 6575 bdev_io->u.nvme_passthru.md_len = md_len; 6576 6577 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6578 6579 bdev_io_submit(bdev_io); 6580 return 0; 6581 } 6582 6583 static void bdev_abort_retry(void *ctx); 6584 static void bdev_abort(struct spdk_bdev_io *parent_io); 6585 6586 static void 6587 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6588 { 6589 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6590 struct spdk_bdev_io *parent_io = cb_arg; 6591 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6592 6593 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6594 6595 spdk_bdev_free_io(bdev_io); 6596 6597 if (!success) { 6598 /* Check if the target I/O completed in the meantime. */ 6599 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6600 if (tmp_io == bio_to_abort) { 6601 break; 6602 } 6603 } 6604 6605 /* If the target I/O still exists, set the parent to failed. 
*/ 6606 if (tmp_io != NULL) { 6607 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6608 } 6609 } 6610 6611 parent_io->u.bdev.split_outstanding--; 6612 if (parent_io->u.bdev.split_outstanding == 0) { 6613 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6614 bdev_abort_retry(parent_io); 6615 } else { 6616 bdev_io_complete(parent_io); 6617 } 6618 } 6619 } 6620 6621 static int 6622 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6623 struct spdk_bdev_io *bio_to_abort, 6624 spdk_bdev_io_completion_cb cb, void *cb_arg) 6625 { 6626 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6627 struct spdk_bdev_io *bdev_io; 6628 6629 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6630 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6631 /* TODO: Abort reset or abort request. */ 6632 return -ENOTSUP; 6633 } 6634 6635 bdev_io = bdev_channel_get_io(channel); 6636 if (bdev_io == NULL) { 6637 return -ENOMEM; 6638 } 6639 6640 bdev_io->internal.ch = channel; 6641 bdev_io->internal.desc = desc; 6642 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6643 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6644 6645 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6646 assert(bdev_io_should_split(bio_to_abort)); 6647 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6648 6649 /* Parent abort request is not submitted directly, but to manage its 6650 * execution add it to the submitted list here. 6651 */ 6652 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6653 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6654 6655 bdev_abort(bdev_io); 6656 6657 return 0; 6658 } 6659 6660 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6661 6662 /* Submit the abort request to the underlying bdev module. */ 6663 bdev_io_submit(bdev_io); 6664 6665 return 0; 6666 } 6667 6668 static bool 6669 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6670 { 6671 struct spdk_bdev_io *iter; 6672 6673 TAILQ_FOREACH(iter, tailq, internal.link) { 6674 if (iter == bdev_io) { 6675 return true; 6676 } 6677 } 6678 6679 return false; 6680 } 6681 6682 static uint32_t 6683 _bdev_abort(struct spdk_bdev_io *parent_io) 6684 { 6685 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6686 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6687 void *bio_cb_arg; 6688 struct spdk_bdev_io *bio_to_abort; 6689 uint32_t matched_ios; 6690 int rc; 6691 6692 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6693 6694 /* matched_ios is returned and will be kept by the caller. 6695 * 6696 * This function will be used for two cases, 1) the same cb_arg is used for 6697 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6698 * Incrementing split_outstanding directly here may confuse readers especially 6699 * for the 1st case. 6700 * 6701 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6702 * works as expected. 6703 */ 6704 matched_ios = 0; 6705 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6706 6707 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6708 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6709 continue; 6710 } 6711 6712 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6713 /* Any I/O which was submitted after this abort command should be excluded. 
*/ 6714 continue; 6715 } 6716 6717 /* We can't abort a request that's being pushed/pulled or executed by accel */ 6718 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 6719 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 6720 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6721 break; 6722 } 6723 6724 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6725 if (rc != 0) { 6726 if (rc == -ENOMEM) { 6727 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6728 } else { 6729 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6730 } 6731 break; 6732 } 6733 matched_ios++; 6734 } 6735 6736 return matched_ios; 6737 } 6738 6739 static void 6740 bdev_abort_retry(void *ctx) 6741 { 6742 struct spdk_bdev_io *parent_io = ctx; 6743 uint32_t matched_ios; 6744 6745 matched_ios = _bdev_abort(parent_io); 6746 6747 if (matched_ios == 0) { 6748 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6749 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6750 } else { 6751 /* For retry, the case that no target I/O was found is a success 6752 * because it means the target I/Os completed in the meantime. 6753 */ 6754 bdev_io_complete(parent_io); 6755 } 6756 return; 6757 } 6758 6759 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6760 parent_io->u.bdev.split_outstanding = matched_ios; 6761 } 6762 6763 static void 6764 bdev_abort(struct spdk_bdev_io *parent_io) 6765 { 6766 uint32_t matched_ios; 6767 6768 matched_ios = _bdev_abort(parent_io); 6769 6770 if (matched_ios == 0) { 6771 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6772 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6773 } else { 6774 /* The case where no target I/O was found is a failure. */ 6775 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6776 bdev_io_complete(parent_io); 6777 } 6778 return; 6779 } 6780 6781 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6782 parent_io->u.bdev.split_outstanding = matched_ios; 6783 } 6784 6785 int 6786 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6787 void *bio_cb_arg, 6788 spdk_bdev_io_completion_cb cb, void *cb_arg) 6789 { 6790 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6791 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6792 struct spdk_bdev_io *bdev_io; 6793 6794 if (bio_cb_arg == NULL) { 6795 return -EINVAL; 6796 } 6797 6798 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6799 return -ENOTSUP; 6800 } 6801 6802 bdev_io = bdev_channel_get_io(channel); 6803 if (bdev_io == NULL) { 6804 return -ENOMEM; 6805 } 6806 6807 bdev_io->internal.ch = channel; 6808 bdev_io->internal.desc = desc; 6809 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6810 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6811 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6812 6813 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6814 6815 /* Parent abort request is not submitted directly, but to manage its execution, 6816 * add it to the submitted list here. 
6817 */ 6818 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6819 6820 bdev_abort(bdev_io); 6821 6822 return 0; 6823 } 6824 6825 int 6826 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6827 struct spdk_bdev_io_wait_entry *entry) 6828 { 6829 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6830 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6831 6832 if (bdev != entry->bdev) { 6833 SPDK_ERRLOG("bdevs do not match\n"); 6834 return -EINVAL; 6835 } 6836 6837 if (mgmt_ch->per_thread_cache_count > 0) { 6838 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6839 return -EINVAL; 6840 } 6841 6842 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6843 return 0; 6844 } 6845 6846 static inline void 6847 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6848 { 6849 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6850 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6851 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6852 uint32_t blocklen = bdev_io->bdev->blocklen; 6853 6854 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6855 switch (bdev_io->type) { 6856 case SPDK_BDEV_IO_TYPE_READ: 6857 io_stat->bytes_read += num_blocks * blocklen; 6858 io_stat->num_read_ops++; 6859 io_stat->read_latency_ticks += tsc_diff; 6860 if (io_stat->max_read_latency_ticks < tsc_diff) { 6861 io_stat->max_read_latency_ticks = tsc_diff; 6862 } 6863 if (io_stat->min_read_latency_ticks > tsc_diff) { 6864 io_stat->min_read_latency_ticks = tsc_diff; 6865 } 6866 break; 6867 case SPDK_BDEV_IO_TYPE_WRITE: 6868 io_stat->bytes_written += num_blocks * blocklen; 6869 io_stat->num_write_ops++; 6870 io_stat->write_latency_ticks += tsc_diff; 6871 if (io_stat->max_write_latency_ticks < tsc_diff) { 6872 io_stat->max_write_latency_ticks = tsc_diff; 6873 } 6874 if (io_stat->min_write_latency_ticks > tsc_diff) { 6875 io_stat->min_write_latency_ticks = tsc_diff; 6876 } 6877 break; 6878 case SPDK_BDEV_IO_TYPE_UNMAP: 6879 io_stat->bytes_unmapped += num_blocks * blocklen; 6880 io_stat->num_unmap_ops++; 6881 io_stat->unmap_latency_ticks += tsc_diff; 6882 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6883 io_stat->max_unmap_latency_ticks = tsc_diff; 6884 } 6885 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6886 io_stat->min_unmap_latency_ticks = tsc_diff; 6887 } 6888 break; 6889 case SPDK_BDEV_IO_TYPE_ZCOPY: 6890 /* Track the data in the start phase only */ 6891 if (bdev_io->u.bdev.zcopy.start) { 6892 if (bdev_io->u.bdev.zcopy.populate) { 6893 io_stat->bytes_read += num_blocks * blocklen; 6894 io_stat->num_read_ops++; 6895 io_stat->read_latency_ticks += tsc_diff; 6896 if (io_stat->max_read_latency_ticks < tsc_diff) { 6897 io_stat->max_read_latency_ticks = tsc_diff; 6898 } 6899 if (io_stat->min_read_latency_ticks > tsc_diff) { 6900 io_stat->min_read_latency_ticks = tsc_diff; 6901 } 6902 } else { 6903 io_stat->bytes_written += num_blocks * blocklen; 6904 io_stat->num_write_ops++; 6905 io_stat->write_latency_ticks += tsc_diff; 6906 if (io_stat->max_write_latency_ticks < tsc_diff) { 6907 io_stat->max_write_latency_ticks = tsc_diff; 6908 } 6909 if (io_stat->min_write_latency_ticks > tsc_diff) { 6910 io_stat->min_write_latency_ticks = tsc_diff; 6911 } 6912 } 6913 } 6914 break; 6915 case SPDK_BDEV_IO_TYPE_COPY: 6916 io_stat->bytes_copied += num_blocks * blocklen; 6917 io_stat->num_copy_ops++; 6918 bdev_io->internal.ch->stat->copy_latency_ticks += 
tsc_diff; 6919 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6920 io_stat->max_copy_latency_ticks = tsc_diff; 6921 } 6922 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6923 io_stat->min_copy_latency_ticks = tsc_diff; 6924 } 6925 break; 6926 default: 6927 break; 6928 } 6929 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6930 io_stat = bdev_io->bdev->internal.stat; 6931 assert(io_stat->io_error != NULL); 6932 6933 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6934 io_stat->io_error->error_status[-io_status - 1]++; 6935 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6936 } 6937 6938 #ifdef SPDK_CONFIG_VTUNE 6939 uint64_t now_tsc = spdk_get_ticks(); 6940 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6941 uint64_t data[5]; 6942 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6943 6944 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6945 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6946 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6947 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6948 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6949 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6950 6951 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6952 __itt_metadata_u64, 5, data); 6953 6954 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6955 bdev_io->internal.ch->start_tsc = now_tsc; 6956 } 6957 #endif 6958 } 6959 6960 static inline void 6961 _bdev_io_complete(void *ctx) 6962 { 6963 struct spdk_bdev_io *bdev_io = ctx; 6964 6965 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 6966 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 6967 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 6968 } 6969 6970 assert(bdev_io->internal.cb != NULL); 6971 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6972 6973 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6974 bdev_io->internal.caller_ctx); 6975 } 6976 6977 static inline void 6978 bdev_io_complete(void *ctx) 6979 { 6980 struct spdk_bdev_io *bdev_io = ctx; 6981 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6982 uint64_t tsc, tsc_diff; 6983 6984 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6985 /* 6986 * Defer completion to avoid potential infinite recursion if the 6987 * user's completion callback issues a new I/O. 6988 */ 6989 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6990 bdev_io_complete, bdev_io); 6991 return; 6992 } 6993 6994 tsc = spdk_get_ticks(); 6995 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6996 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6997 bdev_io->internal.caller_ctx); 6998 6999 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 7000 7001 if (bdev_io->internal.ch->histogram) { 7002 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 7003 } 7004 7005 bdev_io_update_io_stat(bdev_io, tsc_diff); 7006 _bdev_io_complete(bdev_io); 7007 } 7008 7009 /* The difference between this function and bdev_io_complete() is that this should be called to 7010 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7011 * io_submitted list and don't have submit_tsc updated. 
7012 */ 7013 static inline void 7014 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7015 { 7016 /* Since the IO hasn't been submitted it's bound to be failed */ 7017 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7018 7019 /* At this point we don't know if the IO is completed from submission context or not, but, 7020 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7021 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7022 _bdev_io_complete, bdev_io); 7023 } 7024 7025 static void bdev_destroy_cb(void *io_device); 7026 7027 static void 7028 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7029 { 7030 struct spdk_bdev_io *bdev_io = _ctx; 7031 7032 if (bdev_io->u.reset.ch_ref != NULL) { 7033 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7034 bdev_io->u.reset.ch_ref = NULL; 7035 } 7036 7037 bdev_io_complete(bdev_io); 7038 7039 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7040 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7041 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7042 } 7043 } 7044 7045 static void 7046 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7047 struct spdk_io_channel *_ch, void *_ctx) 7048 { 7049 struct spdk_bdev_io *bdev_io = _ctx; 7050 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7051 struct spdk_bdev_io *queued_reset; 7052 7053 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7054 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7055 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7056 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7057 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7058 } 7059 7060 spdk_bdev_for_each_channel_continue(i, 0); 7061 } 7062 7063 static void 7064 bdev_io_complete_sequence_cb(void *ctx, int status) 7065 { 7066 struct spdk_bdev_io *bdev_io = ctx; 7067 7068 /* u.bdev.accel_sequence should have already been cleared at this point */ 7069 assert(bdev_io->u.bdev.accel_sequence == NULL); 7070 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7071 bdev_io->internal.accel_sequence = NULL; 7072 7073 if (spdk_unlikely(status != 0)) { 7074 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7075 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7076 } 7077 7078 bdev_io_complete(bdev_io); 7079 } 7080 7081 void 7082 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7083 { 7084 struct spdk_bdev *bdev = bdev_io->bdev; 7085 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7086 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7087 7088 if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) { 7089 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7090 spdk_bdev_get_module_name(bdev), 7091 bdev_io_status_get_string(bdev_io->internal.status)); 7092 assert(false); 7093 } 7094 bdev_io->internal.status = status; 7095 7096 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7097 bool unlock_channels = false; 7098 7099 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7100 SPDK_ERRLOG("NOMEM returned for reset\n"); 7101 } 7102 spdk_spin_lock(&bdev->internal.spinlock); 7103 if (bdev_io == bdev->internal.reset_in_progress) { 7104 bdev->internal.reset_in_progress = NULL; 7105 unlock_channels = true; 7106 } 7107 spdk_spin_unlock(&bdev->internal.spinlock); 7108 7109 if (unlock_channels) { 7110 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 
7111 bdev_reset_complete); 7112 return; 7113 } 7114 } else { 7115 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7116 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7117 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7118 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7119 return; 7120 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0 && 7121 !bdev_io_use_accel_sequence(bdev_io))) { 7122 _bdev_io_push_bounce_data_buffer(bdev_io, 7123 _bdev_io_complete_push_bounce_done); 7124 /* bdev IO will be completed in the callback */ 7125 return; 7126 } 7127 } 7128 7129 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7130 return; 7131 } 7132 } 7133 7134 bdev_io_complete(bdev_io); 7135 } 7136 7137 void 7138 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7139 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7140 { 7141 enum spdk_bdev_io_status status; 7142 7143 if (sc == SPDK_SCSI_STATUS_GOOD) { 7144 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7145 } else { 7146 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7147 bdev_io->internal.error.scsi.sc = sc; 7148 bdev_io->internal.error.scsi.sk = sk; 7149 bdev_io->internal.error.scsi.asc = asc; 7150 bdev_io->internal.error.scsi.ascq = ascq; 7151 } 7152 7153 spdk_bdev_io_complete(bdev_io, status); 7154 } 7155 7156 void 7157 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7158 int *sc, int *sk, int *asc, int *ascq) 7159 { 7160 assert(sc != NULL); 7161 assert(sk != NULL); 7162 assert(asc != NULL); 7163 assert(ascq != NULL); 7164 7165 switch (bdev_io->internal.status) { 7166 case SPDK_BDEV_IO_STATUS_SUCCESS: 7167 *sc = SPDK_SCSI_STATUS_GOOD; 7168 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7169 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7170 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7171 break; 7172 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7173 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7174 break; 7175 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7176 *sc = bdev_io->internal.error.scsi.sc; 7177 *sk = bdev_io->internal.error.scsi.sk; 7178 *asc = bdev_io->internal.error.scsi.asc; 7179 *ascq = bdev_io->internal.error.scsi.ascq; 7180 break; 7181 default: 7182 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7183 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7184 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7185 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7186 break; 7187 } 7188 } 7189 7190 void 7191 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7192 { 7193 enum spdk_bdev_io_status status; 7194 7195 if (aio_result == 0) { 7196 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7197 } else { 7198 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7199 } 7200 7201 bdev_io->internal.error.aio_result = aio_result; 7202 7203 spdk_bdev_io_complete(bdev_io, status); 7204 } 7205 7206 void 7207 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7208 { 7209 assert(aio_result != NULL); 7210 7211 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7212 *aio_result = bdev_io->internal.error.aio_result; 7213 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7214 *aio_result = 0; 7215 } else { 7216 *aio_result = -EIO; 7217 } 7218 } 7219 7220 void 7221 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7222 { 7223 enum spdk_bdev_io_status status; 7224 7225 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 7226 status = 
SPDK_BDEV_IO_STATUS_SUCCESS; 7227 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7228 status = SPDK_BDEV_IO_STATUS_ABORTED; 7229 } else { 7230 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7231 } 7232 7233 bdev_io->internal.error.nvme.cdw0 = cdw0; 7234 bdev_io->internal.error.nvme.sct = sct; 7235 bdev_io->internal.error.nvme.sc = sc; 7236 7237 spdk_bdev_io_complete(bdev_io, status); 7238 } 7239 7240 void 7241 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7242 { 7243 assert(sct != NULL); 7244 assert(sc != NULL); 7245 assert(cdw0 != NULL); 7246 7247 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7248 *sct = SPDK_NVME_SCT_GENERIC; 7249 *sc = SPDK_NVME_SC_SUCCESS; 7250 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7251 *cdw0 = 0; 7252 } else { 7253 *cdw0 = 1U; 7254 } 7255 return; 7256 } 7257 7258 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7259 *sct = bdev_io->internal.error.nvme.sct; 7260 *sc = bdev_io->internal.error.nvme.sc; 7261 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7262 *sct = SPDK_NVME_SCT_GENERIC; 7263 *sc = SPDK_NVME_SC_SUCCESS; 7264 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7265 *sct = SPDK_NVME_SCT_GENERIC; 7266 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7267 } else { 7268 *sct = SPDK_NVME_SCT_GENERIC; 7269 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7270 } 7271 7272 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7273 } 7274 7275 void 7276 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7277 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7278 { 7279 assert(first_sct != NULL); 7280 assert(first_sc != NULL); 7281 assert(second_sct != NULL); 7282 assert(second_sc != NULL); 7283 assert(cdw0 != NULL); 7284 7285 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7286 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7287 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7288 *first_sct = bdev_io->internal.error.nvme.sct; 7289 *first_sc = bdev_io->internal.error.nvme.sc; 7290 *second_sct = SPDK_NVME_SCT_GENERIC; 7291 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7292 } else { 7293 *first_sct = SPDK_NVME_SCT_GENERIC; 7294 *first_sc = SPDK_NVME_SC_SUCCESS; 7295 *second_sct = bdev_io->internal.error.nvme.sct; 7296 *second_sc = bdev_io->internal.error.nvme.sc; 7297 } 7298 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7299 *first_sct = SPDK_NVME_SCT_GENERIC; 7300 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7301 *second_sct = SPDK_NVME_SCT_GENERIC; 7302 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7303 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7304 *first_sct = SPDK_NVME_SCT_GENERIC; 7305 *first_sc = SPDK_NVME_SC_SUCCESS; 7306 *second_sct = SPDK_NVME_SCT_GENERIC; 7307 *second_sc = SPDK_NVME_SC_SUCCESS; 7308 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7309 *first_sct = SPDK_NVME_SCT_GENERIC; 7310 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7311 *second_sct = SPDK_NVME_SCT_GENERIC; 7312 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7313 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7314 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7315 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7316 *second_sct = SPDK_NVME_SCT_GENERIC; 7317 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7318 } else { 
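		/* Any other completion status (e.g. FAILED or NOMEM) is reported as a generic
		 * internal device error for both halves of the fused command. */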
7319 *first_sct = SPDK_NVME_SCT_GENERIC; 7320 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7321 *second_sct = SPDK_NVME_SCT_GENERIC; 7322 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7323 } 7324 7325 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7326 } 7327 7328 struct spdk_thread * 7329 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7330 { 7331 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7332 } 7333 7334 struct spdk_io_channel * 7335 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7336 { 7337 return bdev_io->internal.ch->channel; 7338 } 7339 7340 static int 7341 bdev_register(struct spdk_bdev *bdev) 7342 { 7343 char *bdev_name; 7344 char uuid[SPDK_UUID_STRING_LEN]; 7345 struct spdk_iobuf_opts iobuf_opts; 7346 int ret, i; 7347 7348 assert(bdev->module != NULL); 7349 7350 if (!bdev->name) { 7351 SPDK_ERRLOG("Bdev name is NULL\n"); 7352 return -EINVAL; 7353 } 7354 7355 if (!strlen(bdev->name)) { 7356 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7357 return -EINVAL; 7358 } 7359 7360 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7361 if (bdev->fn_table->accel_sequence_supported == NULL) { 7362 continue; 7363 } 7364 if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7365 (enum spdk_bdev_io_type)i)) { 7366 continue; 7367 } 7368 7369 if (spdk_bdev_is_md_separate(bdev)) { 7370 SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with " 7371 "accel sequence support\n"); 7372 return -EINVAL; 7373 } 7374 } 7375 7376 /* Users often register their own I/O devices using the bdev name. In 7377 * order to avoid conflicts, prepend bdev_. */ 7378 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7379 if (!bdev_name) { 7380 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7381 return -ENOMEM; 7382 } 7383 7384 bdev->internal.stat = bdev_alloc_io_stat(true); 7385 if (!bdev->internal.stat) { 7386 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7387 free(bdev_name); 7388 return -ENOMEM; 7389 } 7390 7391 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7392 bdev->internal.measured_queue_depth = UINT64_MAX; 7393 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7394 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7395 bdev->internal.qd_poller = NULL; 7396 bdev->internal.qos = NULL; 7397 7398 TAILQ_INIT(&bdev->internal.open_descs); 7399 TAILQ_INIT(&bdev->internal.locked_ranges); 7400 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7401 TAILQ_INIT(&bdev->aliases); 7402 7403 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7404 if (ret != 0) { 7405 bdev_free_io_stat(bdev->internal.stat); 7406 free(bdev_name); 7407 return ret; 7408 } 7409 7410 /* UUID may be specified by the user or defined by bdev itself. 7411 * Otherwise it will be generated here, so this field will never be empty. 
*/ 7412 if (spdk_uuid_is_null(&bdev->uuid)) { 7413 spdk_uuid_generate(&bdev->uuid); 7414 } 7415 7416 /* Add the UUID alias only if it's different than the name */ 7417 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7418 if (strcmp(bdev->name, uuid) != 0) { 7419 ret = spdk_bdev_alias_add(bdev, uuid); 7420 if (ret != 0) { 7421 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7422 bdev_name_del(&bdev->internal.bdev_name); 7423 bdev_free_io_stat(bdev->internal.stat); 7424 free(bdev_name); 7425 return ret; 7426 } 7427 } 7428 7429 if (spdk_bdev_get_buf_align(bdev) > 1) { 7430 if (bdev->split_on_optimal_io_boundary) { 7431 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 7432 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 7433 } else { 7434 bdev->split_on_optimal_io_boundary = true; 7435 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 7436 } 7437 } 7438 7439 /* If the user didn't specify a write unit size, set it to one. */ 7440 if (bdev->write_unit_size == 0) { 7441 bdev->write_unit_size = 1; 7442 } 7443 7444 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7445 if (bdev->acwu == 0) { 7446 bdev->acwu = bdev->write_unit_size; 7447 } 7448 7449 if (bdev->phys_blocklen == 0) { 7450 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7451 } 7452 7453 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7454 spdk_iobuf_get_opts(&iobuf_opts); 7455 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7456 } 7457 7458 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7459 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7460 } 7461 7462 bdev->internal.reset_in_progress = NULL; 7463 bdev->internal.qd_poll_in_progress = false; 7464 bdev->internal.period = 0; 7465 bdev->internal.new_period = 0; 7466 7467 spdk_io_device_register(__bdev_to_io_dev(bdev), 7468 bdev_channel_create, bdev_channel_destroy, 7469 sizeof(struct spdk_bdev_channel), 7470 bdev_name); 7471 7472 free(bdev_name); 7473 7474 spdk_spin_init(&bdev->internal.spinlock); 7475 7476 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7477 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7478 7479 return 0; 7480 } 7481 7482 static void 7483 bdev_destroy_cb(void *io_device) 7484 { 7485 int rc; 7486 struct spdk_bdev *bdev; 7487 spdk_bdev_unregister_cb cb_fn; 7488 void *cb_arg; 7489 7490 bdev = __bdev_from_io_dev(io_device); 7491 7492 if (bdev->internal.unregister_td != spdk_get_thread()) { 7493 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7494 return; 7495 } 7496 7497 cb_fn = bdev->internal.unregister_cb; 7498 cb_arg = bdev->internal.unregister_ctx; 7499 7500 spdk_spin_destroy(&bdev->internal.spinlock); 7501 free(bdev->internal.qos); 7502 bdev_free_io_stat(bdev->internal.stat); 7503 7504 rc = bdev->fn_table->destruct(bdev->ctxt); 7505 if (rc < 0) { 7506 SPDK_ERRLOG("destruct failed\n"); 7507 } 7508 if (rc <= 0 && cb_fn != NULL) { 7509 cb_fn(cb_arg, rc); 7510 } 7511 } 7512 7513 void 7514 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7515 { 7516 if (bdev->internal.unregister_cb != NULL) { 7517 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7518 } 7519 } 7520 7521 static void 7522 _remove_notify(void *arg) 7523 { 7524 struct spdk_bdev_desc *desc = arg; 7525 7526 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7527 } 7528 7529 /* returns: 0 - bdev removed and 
ready to be destructed. 7530 * -EBUSY - bdev can't be destructed yet. */ 7531 static int 7532 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7533 { 7534 struct spdk_bdev_desc *desc, *tmp; 7535 int rc = 0; 7536 char uuid[SPDK_UUID_STRING_LEN]; 7537 7538 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7539 assert(spdk_spin_held(&bdev->internal.spinlock)); 7540 7541 /* Notify each descriptor about hotremoval */ 7542 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7543 rc = -EBUSY; 7544 /* 7545 * Defer invocation of the event_cb to a separate message that will 7546 * run later on its thread. This ensures this context unwinds and 7547 * we don't recursively unregister this bdev again if the event_cb 7548 * immediately closes its descriptor. 7549 */ 7550 event_notify(desc, _remove_notify); 7551 } 7552 7553 /* If there are no descriptors, proceed removing the bdev */ 7554 if (rc == 0) { 7555 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7556 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7557 7558 /* Delete the name and the UUID alias */ 7559 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7560 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7561 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7562 7563 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7564 7565 if (bdev->internal.reset_in_progress != NULL) { 7566 /* If reset is in progress, let the completion callback for reset 7567 * unregister the bdev. 7568 */ 7569 rc = -EBUSY; 7570 } 7571 } 7572 7573 return rc; 7574 } 7575 7576 static void 7577 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7578 struct spdk_io_channel *io_ch, void *_ctx) 7579 { 7580 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7581 7582 bdev_channel_abort_queued_ios(bdev_ch); 7583 spdk_bdev_for_each_channel_continue(i, 0); 7584 } 7585 7586 static void 7587 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7588 { 7589 int rc; 7590 7591 spdk_spin_lock(&g_bdev_mgr.spinlock); 7592 spdk_spin_lock(&bdev->internal.spinlock); 7593 /* 7594 * Set the status to REMOVING after completing to abort channels. Otherwise, 7595 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7596 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7597 * may fail. 7598 */ 7599 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7600 rc = bdev_unregister_unsafe(bdev); 7601 spdk_spin_unlock(&bdev->internal.spinlock); 7602 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7603 7604 if (rc == 0) { 7605 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7606 } 7607 } 7608 7609 void 7610 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7611 { 7612 struct spdk_thread *thread; 7613 7614 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7615 7616 thread = spdk_get_thread(); 7617 if (!thread) { 7618 /* The user called this from a non-SPDK thread. 
*/ 7619 if (cb_fn != NULL) { 7620 cb_fn(cb_arg, -ENOTSUP); 7621 } 7622 return; 7623 } 7624 7625 spdk_spin_lock(&g_bdev_mgr.spinlock); 7626 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7627 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7628 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7629 if (cb_fn) { 7630 cb_fn(cb_arg, -EBUSY); 7631 } 7632 return; 7633 } 7634 7635 spdk_spin_lock(&bdev->internal.spinlock); 7636 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7637 bdev->internal.unregister_cb = cb_fn; 7638 bdev->internal.unregister_ctx = cb_arg; 7639 bdev->internal.unregister_td = thread; 7640 spdk_spin_unlock(&bdev->internal.spinlock); 7641 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7642 7643 spdk_bdev_set_qd_sampling_period(bdev, 0); 7644 7645 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7646 bdev_unregister); 7647 } 7648 7649 int 7650 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7651 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7652 { 7653 struct spdk_bdev_desc *desc; 7654 struct spdk_bdev *bdev; 7655 int rc; 7656 7657 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7658 if (rc != 0) { 7659 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7660 return rc; 7661 } 7662 7663 bdev = spdk_bdev_desc_get_bdev(desc); 7664 7665 if (bdev->module != module) { 7666 spdk_bdev_close(desc); 7667 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7668 bdev_name); 7669 return -ENODEV; 7670 } 7671 7672 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7673 7674 spdk_bdev_close(desc); 7675 7676 return 0; 7677 } 7678 7679 static int 7680 bdev_start_qos(struct spdk_bdev *bdev) 7681 { 7682 struct set_qos_limit_ctx *ctx; 7683 7684 /* Enable QoS */ 7685 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7686 ctx = calloc(1, sizeof(*ctx)); 7687 if (ctx == NULL) { 7688 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7689 return -ENOMEM; 7690 } 7691 ctx->bdev = bdev; 7692 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7693 } 7694 7695 return 0; 7696 } 7697 7698 static void 7699 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7700 struct spdk_bdev *bdev) 7701 { 7702 enum spdk_bdev_claim_type type; 7703 const char *typename, *modname; 7704 extern struct spdk_log_flag SPDK_LOG_bdev; 7705 7706 assert(spdk_spin_held(&bdev->internal.spinlock)); 7707 7708 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7709 return; 7710 } 7711 7712 type = bdev->internal.claim_type; 7713 typename = spdk_bdev_claim_get_name(type); 7714 7715 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7716 modname = bdev->internal.claim.v1.module->name; 7717 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7718 bdev->name, detail, typename, modname); 7719 return; 7720 } 7721 7722 if (claim_type_is_v2(type)) { 7723 struct spdk_bdev_module_claim *claim; 7724 7725 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7726 modname = claim->module->name; 7727 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7728 bdev->name, detail, typename, modname); 7729 } 7730 return; 7731 } 7732 7733 assert(false); 7734 } 7735 7736 static int 7737 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7738 { 7739 struct spdk_thread *thread; 7740 int rc = 0; 7741 7742 thread = spdk_get_thread(); 7743 if (!thread) { 7744 
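/* The descriptor is bound to the opening SPDK thread (desc->thread below), so
 * an open from a non-SPDK thread cannot be supported.
 */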
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7745 return -ENOTSUP; 7746 } 7747 7748 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7749 spdk_get_thread()); 7750 7751 desc->bdev = bdev; 7752 desc->thread = thread; 7753 desc->write = write; 7754 7755 spdk_spin_lock(&bdev->internal.spinlock); 7756 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7757 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7758 spdk_spin_unlock(&bdev->internal.spinlock); 7759 return -ENODEV; 7760 } 7761 7762 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7763 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7764 spdk_spin_unlock(&bdev->internal.spinlock); 7765 return -EPERM; 7766 } 7767 7768 rc = bdev_start_qos(bdev); 7769 if (rc != 0) { 7770 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7771 spdk_spin_unlock(&bdev->internal.spinlock); 7772 return rc; 7773 } 7774 7775 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7776 7777 spdk_spin_unlock(&bdev->internal.spinlock); 7778 7779 return 0; 7780 } 7781 7782 static int 7783 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7784 struct spdk_bdev_desc **_desc) 7785 { 7786 struct spdk_bdev_desc *desc; 7787 unsigned int i; 7788 7789 desc = calloc(1, sizeof(*desc)); 7790 if (desc == NULL) { 7791 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7792 return -ENOMEM; 7793 } 7794 7795 TAILQ_INIT(&desc->pending_media_events); 7796 TAILQ_INIT(&desc->free_media_events); 7797 7798 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7799 desc->callback.event_fn = event_cb; 7800 desc->callback.ctx = event_ctx; 7801 spdk_spin_init(&desc->spinlock); 7802 7803 if (bdev->media_events) { 7804 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7805 sizeof(*desc->media_events_buffer)); 7806 if (desc->media_events_buffer == NULL) { 7807 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7808 bdev_desc_free(desc); 7809 return -ENOMEM; 7810 } 7811 7812 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 7813 TAILQ_INSERT_TAIL(&desc->free_media_events, 7814 &desc->media_events_buffer[i], tailq); 7815 } 7816 } 7817 7818 if (bdev->fn_table->accel_sequence_supported != NULL) { 7819 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7820 desc->accel_sequence_supported[i] = 7821 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7822 (enum spdk_bdev_io_type)i); 7823 } 7824 } 7825 7826 *_desc = desc; 7827 7828 return 0; 7829 } 7830 7831 static int 7832 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7833 void *event_ctx, struct spdk_bdev_desc **_desc) 7834 { 7835 struct spdk_bdev_desc *desc; 7836 struct spdk_bdev *bdev; 7837 int rc; 7838 7839 bdev = bdev_get_by_name(bdev_name); 7840 7841 if (bdev == NULL) { 7842 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7843 return -ENODEV; 7844 } 7845 7846 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7847 if (rc != 0) { 7848 return rc; 7849 } 7850 7851 rc = bdev_open(bdev, write, desc); 7852 if (rc != 0) { 7853 bdev_desc_free(desc); 7854 desc = NULL; 7855 } 7856 7857 *_desc = desc; 7858 7859 return rc; 7860 } 7861 7862 int 7863 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7864 void *event_ctx, struct spdk_bdev_desc **_desc) 7865 { 7866 int rc; 7867 7868 if (event_cb == NULL) { 7869 SPDK_ERRLOG("Missing event callback function\n"); 7870 return 
-EINVAL; 7871 } 7872 7873 spdk_spin_lock(&g_bdev_mgr.spinlock); 7874 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 7875 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7876 7877 return rc; 7878 } 7879 7880 struct spdk_bdev_open_async_ctx { 7881 char *bdev_name; 7882 spdk_bdev_event_cb_t event_cb; 7883 void *event_ctx; 7884 bool write; 7885 int rc; 7886 spdk_bdev_open_async_cb_t cb_fn; 7887 void *cb_arg; 7888 struct spdk_bdev_desc *desc; 7889 struct spdk_bdev_open_async_opts opts; 7890 uint64_t start_ticks; 7891 struct spdk_thread *orig_thread; 7892 struct spdk_poller *poller; 7893 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 7894 }; 7895 7896 static void 7897 bdev_open_async_done(void *arg) 7898 { 7899 struct spdk_bdev_open_async_ctx *ctx = arg; 7900 7901 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 7902 7903 free(ctx->bdev_name); 7904 free(ctx); 7905 } 7906 7907 static void 7908 bdev_open_async_cancel(void *arg) 7909 { 7910 struct spdk_bdev_open_async_ctx *ctx = arg; 7911 7912 assert(ctx->rc == -ESHUTDOWN); 7913 7914 spdk_poller_unregister(&ctx->poller); 7915 7916 bdev_open_async_done(ctx); 7917 } 7918 7919 /* This is called when the bdev library finishes at shutdown. */ 7920 static void 7921 bdev_open_async_fini(void) 7922 { 7923 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 7924 7925 spdk_spin_lock(&g_bdev_mgr.spinlock); 7926 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 7927 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 7928 /* 7929 * We have to move to ctx->orig_thread to unregister ctx->poller. 7930 * However, there is a chance that ctx->poller is executed before 7931 * message is executed, which could result in bdev_open_async_done() 7932 * being called twice. To avoid such race condition, set ctx->rc to 7933 * -ESHUTDOWN. 7934 */ 7935 ctx->rc = -ESHUTDOWN; 7936 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 7937 } 7938 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7939 } 7940 7941 static int bdev_open_async(void *arg); 7942 7943 static void 7944 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 7945 { 7946 uint64_t timeout_ticks; 7947 7948 if (ctx->rc == -ESHUTDOWN) { 7949 /* This context is being canceled. Do nothing. */ 7950 return; 7951 } 7952 7953 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 7954 &ctx->desc); 7955 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 7956 goto exit; 7957 } 7958 7959 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 7960 if (spdk_get_ticks() >= timeout_ticks) { 7961 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 7962 ctx->rc = -ETIMEDOUT; 7963 goto exit; 7964 } 7965 7966 return; 7967 7968 exit: 7969 spdk_poller_unregister(&ctx->poller); 7970 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 7971 7972 /* Completion callback is processed after stack unwinding. 
*/ 7973 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 7974 } 7975 7976 static int 7977 bdev_open_async(void *arg) 7978 { 7979 struct spdk_bdev_open_async_ctx *ctx = arg; 7980 7981 spdk_spin_lock(&g_bdev_mgr.spinlock); 7982 7983 _bdev_open_async(ctx); 7984 7985 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7986 7987 return SPDK_POLLER_BUSY; 7988 } 7989 7990 static void 7991 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 7992 struct spdk_bdev_open_async_opts *opts_src, 7993 size_t size) 7994 { 7995 assert(opts); 7996 assert(opts_src); 7997 7998 opts->size = size; 7999 8000 #define SET_FIELD(field) \ 8001 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8002 opts->field = opts_src->field; \ 8003 } \ 8004 8005 SET_FIELD(timeout_ms); 8006 8007 /* Do not remove this statement, you should always update this statement when you adding a new field, 8008 * and do not forget to add the SET_FIELD statement for your added field. */ 8009 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8010 8011 #undef SET_FIELD 8012 } 8013 8014 static void 8015 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8016 { 8017 assert(opts); 8018 8019 opts->size = size; 8020 8021 #define SET_FIELD(field, value) \ 8022 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8023 opts->field = value; \ 8024 } \ 8025 8026 SET_FIELD(timeout_ms, 0); 8027 8028 #undef SET_FIELD 8029 } 8030 8031 int 8032 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8033 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8034 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8035 { 8036 struct spdk_bdev_open_async_ctx *ctx; 8037 8038 if (event_cb == NULL) { 8039 SPDK_ERRLOG("Missing event callback function\n"); 8040 return -EINVAL; 8041 } 8042 8043 if (open_cb == NULL) { 8044 SPDK_ERRLOG("Missing open callback function\n"); 8045 return -EINVAL; 8046 } 8047 8048 if (opts != NULL && opts->size == 0) { 8049 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8050 return -EINVAL; 8051 } 8052 8053 ctx = calloc(1, sizeof(*ctx)); 8054 if (ctx == NULL) { 8055 SPDK_ERRLOG("Failed to allocate open context\n"); 8056 return -ENOMEM; 8057 } 8058 8059 ctx->bdev_name = strdup(bdev_name); 8060 if (ctx->bdev_name == NULL) { 8061 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8062 free(ctx); 8063 return -ENOMEM; 8064 } 8065 8066 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8067 if (ctx->poller == NULL) { 8068 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8069 free(ctx->bdev_name); 8070 free(ctx); 8071 return -ENOMEM; 8072 } 8073 8074 ctx->cb_fn = open_cb; 8075 ctx->cb_arg = open_cb_arg; 8076 ctx->write = write; 8077 ctx->event_cb = event_cb; 8078 ctx->event_ctx = event_ctx; 8079 ctx->orig_thread = spdk_get_thread(); 8080 ctx->start_ticks = spdk_get_ticks(); 8081 8082 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8083 if (opts != NULL) { 8084 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8085 } 8086 8087 spdk_spin_lock(&g_bdev_mgr.spinlock); 8088 8089 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8090 _bdev_open_async(ctx); 8091 8092 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8093 8094 return 0; 8095 } 8096 8097 static void 8098 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8099 { 8100 int rc; 8101 8102 
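/* Called with g_bdev_mgr.spinlock held. Removes the descriptor from the bdev's
 * open list. When the last descriptor closes, the QoS channel (if any) is torn
 * down, and if the bdev is being removed, the unregister is completed here.
 */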
spdk_spin_lock(&bdev->internal.spinlock); 8103 spdk_spin_lock(&desc->spinlock); 8104 8105 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8106 8107 desc->closed = true; 8108 8109 if (desc->claim != NULL) { 8110 bdev_desc_release_claims(desc); 8111 } 8112 8113 if (0 == desc->refs) { 8114 spdk_spin_unlock(&desc->spinlock); 8115 bdev_desc_free(desc); 8116 } else { 8117 spdk_spin_unlock(&desc->spinlock); 8118 } 8119 8120 /* If no more descriptors, kill QoS channel */ 8121 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8122 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8123 bdev->name, spdk_get_thread()); 8124 8125 if (bdev_qos_destroy(bdev)) { 8126 /* There isn't anything we can do to recover here. Just let the 8127 * old QoS poller keep running. The QoS handling won't change 8128 * cores when the user allocates a new channel, but it won't break. */ 8129 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8130 } 8131 } 8132 8133 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8134 rc = bdev_unregister_unsafe(bdev); 8135 spdk_spin_unlock(&bdev->internal.spinlock); 8136 8137 if (rc == 0) { 8138 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8139 } 8140 } else { 8141 spdk_spin_unlock(&bdev->internal.spinlock); 8142 } 8143 } 8144 8145 void 8146 spdk_bdev_close(struct spdk_bdev_desc *desc) 8147 { 8148 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8149 8150 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8151 spdk_get_thread()); 8152 8153 assert(desc->thread == spdk_get_thread()); 8154 8155 spdk_poller_unregister(&desc->io_timeout_poller); 8156 8157 spdk_spin_lock(&g_bdev_mgr.spinlock); 8158 8159 bdev_close(bdev, desc); 8160 8161 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8162 } 8163 8164 static void 8165 bdev_register_finished(void *arg) 8166 { 8167 struct spdk_bdev_desc *desc = arg; 8168 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8169 8170 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8171 8172 spdk_spin_lock(&g_bdev_mgr.spinlock); 8173 8174 bdev_close(bdev, desc); 8175 8176 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8177 } 8178 8179 int 8180 spdk_bdev_register(struct spdk_bdev *bdev) 8181 { 8182 struct spdk_bdev_desc *desc; 8183 struct spdk_thread *thread = spdk_get_thread(); 8184 int rc; 8185 8186 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8187 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 8188 thread ? 
spdk_thread_get_name(thread) : "null"); 8189 return -EINVAL; 8190 } 8191 8192 rc = bdev_register(bdev); 8193 if (rc != 0) { 8194 return rc; 8195 } 8196 8197 /* A descriptor is opened to prevent bdev deletion during examination */ 8198 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8199 if (rc != 0) { 8200 spdk_bdev_unregister(bdev, NULL, NULL); 8201 return rc; 8202 } 8203 8204 rc = bdev_open(bdev, false, desc); 8205 if (rc != 0) { 8206 bdev_desc_free(desc); 8207 spdk_bdev_unregister(bdev, NULL, NULL); 8208 return rc; 8209 } 8210 8211 /* Examine configuration before initializing I/O */ 8212 bdev_examine(bdev); 8213 8214 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8215 if (rc != 0) { 8216 bdev_close(bdev, desc); 8217 spdk_bdev_unregister(bdev, NULL, NULL); 8218 } 8219 8220 return rc; 8221 } 8222 8223 int 8224 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8225 struct spdk_bdev_module *module) 8226 { 8227 spdk_spin_lock(&bdev->internal.spinlock); 8228 8229 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8230 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8231 spdk_spin_unlock(&bdev->internal.spinlock); 8232 return -EPERM; 8233 } 8234 8235 if (desc && !desc->write) { 8236 desc->write = true; 8237 } 8238 8239 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8240 bdev->internal.claim.v1.module = module; 8241 8242 spdk_spin_unlock(&bdev->internal.spinlock); 8243 return 0; 8244 } 8245 8246 void 8247 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8248 { 8249 spdk_spin_lock(&bdev->internal.spinlock); 8250 8251 assert(bdev->internal.claim.v1.module != NULL); 8252 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8253 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8254 bdev->internal.claim.v1.module = NULL; 8255 8256 spdk_spin_unlock(&bdev->internal.spinlock); 8257 } 8258 8259 /* 8260 * Start claims v2 8261 */ 8262 8263 const char * 8264 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8265 { 8266 switch (type) { 8267 case SPDK_BDEV_CLAIM_NONE: 8268 return "not_claimed"; 8269 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8270 return "exclusive_write"; 8271 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8272 return "read_many_write_one"; 8273 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8274 return "read_many_write_none"; 8275 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8276 return "read_many_write_many"; 8277 default: 8278 break; 8279 } 8280 return "invalid_claim"; 8281 } 8282 8283 static bool 8284 claim_type_is_v2(enum spdk_bdev_claim_type type) 8285 { 8286 switch (type) { 8287 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8288 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8289 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8290 return true; 8291 default: 8292 break; 8293 } 8294 return false; 8295 } 8296 8297 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
*/ 8298 static bool 8299 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8300 { 8301 switch (type) { 8302 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8303 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8304 return true; 8305 default: 8306 break; 8307 } 8308 return false; 8309 } 8310 8311 void 8312 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8313 { 8314 if (opts == NULL) { 8315 SPDK_ERRLOG("opts should not be NULL\n"); 8316 assert(opts != NULL); 8317 return; 8318 } 8319 if (size == 0) { 8320 SPDK_ERRLOG("size should not be zero\n"); 8321 assert(size != 0); 8322 return; 8323 } 8324 8325 memset(opts, 0, size); 8326 opts->opts_size = size; 8327 8328 #define FIELD_OK(field) \ 8329 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8330 8331 #define SET_FIELD(field, value) \ 8332 if (FIELD_OK(field)) { \ 8333 opts->field = value; \ 8334 } \ 8335 8336 SET_FIELD(shared_claim_key, 0); 8337 8338 #undef FIELD_OK 8339 #undef SET_FIELD 8340 } 8341 8342 static int 8343 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8344 { 8345 if (src->opts_size == 0) { 8346 SPDK_ERRLOG("size should not be zero\n"); 8347 return -1; 8348 } 8349 8350 memset(dst, 0, sizeof(*dst)); 8351 dst->opts_size = src->opts_size; 8352 8353 #define FIELD_OK(field) \ 8354 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8355 8356 #define SET_FIELD(field) \ 8357 if (FIELD_OK(field)) { \ 8358 dst->field = src->field; \ 8359 } \ 8360 8361 if (FIELD_OK(name)) { 8362 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8363 } 8364 8365 SET_FIELD(shared_claim_key); 8366 8367 /* You should not remove this statement, but need to update the assert statement 8368 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8369 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8370 8371 #undef FIELD_OK 8372 #undef SET_FIELD 8373 return 0; 8374 } 8375 8376 /* Returns 0 if a read-write-once claim can be taken. */ 8377 static int 8378 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8379 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8380 { 8381 struct spdk_bdev *bdev = desc->bdev; 8382 struct spdk_bdev_desc *open_desc; 8383 8384 assert(spdk_spin_held(&bdev->internal.spinlock)); 8385 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8386 8387 if (opts->shared_claim_key != 0) { 8388 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8389 bdev->name); 8390 return -EINVAL; 8391 } 8392 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8393 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8394 return -EPERM; 8395 } 8396 if (desc->claim != NULL) { 8397 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8398 bdev->name, desc->claim->module->name); 8399 return -EPERM; 8400 } 8401 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8402 if (desc != open_desc && open_desc->write) { 8403 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8404 "another descriptor is open for writing\n", 8405 bdev->name); 8406 return -EPERM; 8407 } 8408 } 8409 8410 return 0; 8411 } 8412 8413 /* Returns 0 if a read-only-many claim can be taken. 
*/ 8414 static int 8415 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8416 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8417 { 8418 struct spdk_bdev *bdev = desc->bdev; 8419 struct spdk_bdev_desc *open_desc; 8420 8421 assert(spdk_spin_held(&bdev->internal.spinlock)); 8422 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 8423 assert(desc->claim == NULL); 8424 8425 if (desc->write) { 8426 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 8427 bdev->name); 8428 return -EINVAL; 8429 } 8430 if (opts->shared_claim_key != 0) { 8431 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 8432 return -EINVAL; 8433 } 8434 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8435 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8436 if (open_desc->write) { 8437 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 8438 "another descriptor is open for writing\n", 8439 bdev->name); 8440 return -EPERM; 8441 } 8442 } 8443 } 8444 8445 return 0; 8446 } 8447 8448 /* Returns 0 if a read-write-many claim can be taken. */ 8449 static int 8450 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8451 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8452 { 8453 struct spdk_bdev *bdev = desc->bdev; 8454 struct spdk_bdev_desc *open_desc; 8455 8456 assert(spdk_spin_held(&bdev->internal.spinlock)); 8457 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 8458 assert(desc->claim == NULL); 8459 8460 if (opts->shared_claim_key == 0) { 8461 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 8462 bdev->name); 8463 return -EINVAL; 8464 } 8465 switch (bdev->internal.claim_type) { 8466 case SPDK_BDEV_CLAIM_NONE: 8467 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8468 if (open_desc == desc) { 8469 continue; 8470 } 8471 if (open_desc->write) { 8472 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 8473 "another descriptor is open for writing without a " 8474 "claim\n", bdev->name); 8475 return -EPERM; 8476 } 8477 } 8478 break; 8479 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8480 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 8481 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 8482 return -EPERM; 8483 } 8484 break; 8485 default: 8486 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8487 return -EBUSY; 8488 } 8489 8490 return 0; 8491 } 8492 8493 /* Updates desc and its bdev with a v2 claim.
*/ 8494 static int 8495 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8496 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8497 { 8498 struct spdk_bdev *bdev = desc->bdev; 8499 struct spdk_bdev_module_claim *claim; 8500 8501 assert(spdk_spin_held(&bdev->internal.spinlock)); 8502 assert(claim_type_is_v2(type)); 8503 assert(desc->claim == NULL); 8504 8505 claim = calloc(1, sizeof(*desc->claim)); 8506 if (claim == NULL) { 8507 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8508 return -ENOMEM; 8509 } 8510 claim->module = module; 8511 claim->desc = desc; 8512 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8513 memcpy(claim->name, opts->name, sizeof(claim->name)); 8514 desc->claim = claim; 8515 8516 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8517 bdev->internal.claim_type = type; 8518 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8519 bdev->internal.claim.v2.key = opts->shared_claim_key; 8520 } 8521 assert(type == bdev->internal.claim_type); 8522 8523 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8524 8525 if (!desc->write && claim_type_promotes_to_write(type)) { 8526 desc->write = true; 8527 } 8528 8529 return 0; 8530 } 8531 8532 int 8533 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8534 struct spdk_bdev_claim_opts *_opts, 8535 struct spdk_bdev_module *module) 8536 { 8537 struct spdk_bdev *bdev; 8538 struct spdk_bdev_claim_opts opts; 8539 int rc = 0; 8540 8541 if (desc == NULL) { 8542 SPDK_ERRLOG("descriptor must not be NULL\n"); 8543 return -EINVAL; 8544 } 8545 8546 bdev = desc->bdev; 8547 8548 if (_opts == NULL) { 8549 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8550 } else if (claim_opts_copy(_opts, &opts) != 0) { 8551 return -EINVAL; 8552 } 8553 8554 spdk_spin_lock(&bdev->internal.spinlock); 8555 8556 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8557 bdev->internal.claim_type != type) { 8558 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8559 spdk_spin_unlock(&bdev->internal.spinlock); 8560 return -EPERM; 8561 } 8562 8563 if (claim_type_is_v2(type) && desc->claim != NULL) { 8564 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8565 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8566 spdk_spin_unlock(&bdev->internal.spinlock); 8567 return -EPERM; 8568 } 8569 8570 switch (type) { 8571 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8572 spdk_spin_unlock(&bdev->internal.spinlock); 8573 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8574 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8575 rc = claim_verify_rwo(desc, type, &opts, module); 8576 break; 8577 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8578 rc = claim_verify_rom(desc, type, &opts, module); 8579 break; 8580 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8581 rc = claim_verify_rwm(desc, type, &opts, module); 8582 break; 8583 default: 8584 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8585 rc = -ENOTSUP; 8586 } 8587 8588 if (rc == 0) { 8589 rc = claim_bdev(desc, type, &opts, module); 8590 } 8591 8592 spdk_spin_unlock(&bdev->internal.spinlock); 8593 return rc; 8594 } 8595 8596 static void 8597 claim_reset(struct spdk_bdev *bdev) 8598 { 8599 assert(spdk_spin_held(&bdev->internal.spinlock)); 8600 assert(claim_type_is_v2(bdev->internal.claim_type)); 8601 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8602 8603 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8604 
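/* All v2 claims have been released; clear the claim union and return the bdev
 * to the unclaimed state so that a claim of any type may be taken again.
 */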
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8605 } 8606 8607 static void 8608 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8609 { 8610 struct spdk_bdev *bdev = desc->bdev; 8611 8612 assert(spdk_spin_held(&bdev->internal.spinlock)); 8613 assert(claim_type_is_v2(bdev->internal.claim_type)); 8614 8615 if (bdev->internal.examine_in_progress == 0) { 8616 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8617 free(desc->claim); 8618 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8619 claim_reset(bdev); 8620 } 8621 } else { 8622 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8623 desc->claim->module = NULL; 8624 desc->claim->desc = NULL; 8625 } 8626 desc->claim = NULL; 8627 } 8628 8629 /* 8630 * End claims v2 8631 */ 8632 8633 struct spdk_bdev * 8634 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8635 { 8636 assert(desc != NULL); 8637 return desc->bdev; 8638 } 8639 8640 int 8641 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8642 { 8643 struct spdk_bdev *bdev, *tmp; 8644 struct spdk_bdev_desc *desc; 8645 int rc = 0; 8646 8647 assert(fn != NULL); 8648 8649 spdk_spin_lock(&g_bdev_mgr.spinlock); 8650 bdev = spdk_bdev_first(); 8651 while (bdev != NULL) { 8652 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8653 if (rc != 0) { 8654 break; 8655 } 8656 rc = bdev_open(bdev, false, desc); 8657 if (rc != 0) { 8658 bdev_desc_free(desc); 8659 if (rc == -ENODEV) { 8660 /* Ignore the error and move to the next bdev. */ 8661 rc = 0; 8662 bdev = spdk_bdev_next(bdev); 8663 continue; 8664 } 8665 break; 8666 } 8667 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8668 8669 rc = fn(ctx, bdev); 8670 8671 spdk_spin_lock(&g_bdev_mgr.spinlock); 8672 tmp = spdk_bdev_next(bdev); 8673 bdev_close(bdev, desc); 8674 if (rc != 0) { 8675 break; 8676 } 8677 bdev = tmp; 8678 } 8679 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8680 8681 return rc; 8682 } 8683 8684 int 8685 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8686 { 8687 struct spdk_bdev *bdev, *tmp; 8688 struct spdk_bdev_desc *desc; 8689 int rc = 0; 8690 8691 assert(fn != NULL); 8692 8693 spdk_spin_lock(&g_bdev_mgr.spinlock); 8694 bdev = spdk_bdev_first_leaf(); 8695 while (bdev != NULL) { 8696 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8697 if (rc != 0) { 8698 break; 8699 } 8700 rc = bdev_open(bdev, false, desc); 8701 if (rc != 0) { 8702 bdev_desc_free(desc); 8703 if (rc == -ENODEV) { 8704 /* Ignore the error and move to the next bdev. 
*/ 8705 rc = 0; 8706 bdev = spdk_bdev_next_leaf(bdev); 8707 continue; 8708 } 8709 break; 8710 } 8711 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8712 8713 rc = fn(ctx, bdev); 8714 8715 spdk_spin_lock(&g_bdev_mgr.spinlock); 8716 tmp = spdk_bdev_next_leaf(bdev); 8717 bdev_close(bdev, desc); 8718 if (rc != 0) { 8719 break; 8720 } 8721 bdev = tmp; 8722 } 8723 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8724 8725 return rc; 8726 } 8727 8728 void 8729 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8730 { 8731 struct iovec *iovs; 8732 int iovcnt; 8733 8734 if (bdev_io == NULL) { 8735 return; 8736 } 8737 8738 switch (bdev_io->type) { 8739 case SPDK_BDEV_IO_TYPE_READ: 8740 case SPDK_BDEV_IO_TYPE_WRITE: 8741 case SPDK_BDEV_IO_TYPE_ZCOPY: 8742 iovs = bdev_io->u.bdev.iovs; 8743 iovcnt = bdev_io->u.bdev.iovcnt; 8744 break; 8745 default: 8746 iovs = NULL; 8747 iovcnt = 0; 8748 break; 8749 } 8750 8751 if (iovp) { 8752 *iovp = iovs; 8753 } 8754 if (iovcntp) { 8755 *iovcntp = iovcnt; 8756 } 8757 } 8758 8759 void * 8760 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8761 { 8762 if (bdev_io == NULL) { 8763 return NULL; 8764 } 8765 8766 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8767 return NULL; 8768 } 8769 8770 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8771 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8772 return bdev_io->u.bdev.md_buf; 8773 } 8774 8775 return NULL; 8776 } 8777 8778 void * 8779 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 8780 { 8781 if (bdev_io == NULL) { 8782 assert(false); 8783 return NULL; 8784 } 8785 8786 return bdev_io->internal.caller_ctx; 8787 } 8788 8789 void 8790 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 8791 { 8792 8793 if (spdk_bdev_module_list_find(bdev_module->name)) { 8794 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 8795 assert(false); 8796 } 8797 8798 spdk_spin_init(&bdev_module->internal.spinlock); 8799 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 8800 8801 /* 8802 * Modules with examine callbacks must be initialized first, so they are 8803 * ready to handle examine callbacks from later modules that will 8804 * register physical bdevs. 
8805 */ 8806 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8807 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8808 } else { 8809 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8810 } 8811 } 8812 8813 struct spdk_bdev_module * 8814 spdk_bdev_module_list_find(const char *name) 8815 { 8816 struct spdk_bdev_module *bdev_module; 8817 8818 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8819 if (strcmp(name, bdev_module->name) == 0) { 8820 break; 8821 } 8822 } 8823 8824 return bdev_module; 8825 } 8826 8827 static int 8828 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 8829 { 8830 uint64_t num_blocks; 8831 void *md_buf = NULL; 8832 8833 num_blocks = bdev_io->u.bdev.num_blocks; 8834 8835 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8836 md_buf = (char *)g_bdev_mgr.zero_buffer + 8837 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8838 } 8839 8840 return bdev_write_blocks_with_md(bdev_io->internal.desc, 8841 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8842 g_bdev_mgr.zero_buffer, md_buf, 8843 bdev_io->u.bdev.offset_blocks, num_blocks, 8844 bdev_write_zero_buffer_done, bdev_io); 8845 } 8846 8847 static void 8848 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8849 { 8850 struct spdk_bdev_io *parent_io = cb_arg; 8851 8852 spdk_bdev_free_io(bdev_io); 8853 8854 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 8855 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 8856 } 8857 8858 static void 8859 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8860 { 8861 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8862 ctx->bdev->internal.qos_mod_in_progress = false; 8863 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8864 8865 if (ctx->cb_fn) { 8866 ctx->cb_fn(ctx->cb_arg, status); 8867 } 8868 free(ctx); 8869 } 8870 8871 static void 8872 bdev_disable_qos_done(void *cb_arg) 8873 { 8874 struct set_qos_limit_ctx *ctx = cb_arg; 8875 struct spdk_bdev *bdev = ctx->bdev; 8876 struct spdk_bdev_io *bdev_io; 8877 struct spdk_bdev_qos *qos; 8878 8879 spdk_spin_lock(&bdev->internal.spinlock); 8880 qos = bdev->internal.qos; 8881 bdev->internal.qos = NULL; 8882 spdk_spin_unlock(&bdev->internal.spinlock); 8883 8884 while (!TAILQ_EMPTY(&qos->queued)) { 8885 /* Send queued I/O back to their original thread for resubmission. */ 8886 bdev_io = TAILQ_FIRST(&qos->queued); 8887 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8888 8889 if (bdev_io->internal.io_submit_ch) { 8890 /* 8891 * Channel was changed when sending it to the QoS thread - change it back 8892 * before sending it back to the original thread. 
8893 */ 8894 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8895 bdev_io->internal.io_submit_ch = NULL; 8896 } 8897 8898 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8899 _bdev_io_submit, bdev_io); 8900 } 8901 8902 if (qos->thread != NULL) { 8903 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8904 spdk_poller_unregister(&qos->poller); 8905 } 8906 8907 free(qos); 8908 8909 bdev_set_qos_limit_done(ctx, 0); 8910 } 8911 8912 static void 8913 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8914 { 8915 struct set_qos_limit_ctx *ctx = _ctx; 8916 struct spdk_thread *thread; 8917 8918 spdk_spin_lock(&bdev->internal.spinlock); 8919 thread = bdev->internal.qos->thread; 8920 spdk_spin_unlock(&bdev->internal.spinlock); 8921 8922 if (thread != NULL) { 8923 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8924 } else { 8925 bdev_disable_qos_done(ctx); 8926 } 8927 } 8928 8929 static void 8930 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8931 struct spdk_io_channel *ch, void *_ctx) 8932 { 8933 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8934 8935 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8936 8937 spdk_bdev_for_each_channel_continue(i, 0); 8938 } 8939 8940 static void 8941 bdev_update_qos_rate_limit_msg(void *cb_arg) 8942 { 8943 struct set_qos_limit_ctx *ctx = cb_arg; 8944 struct spdk_bdev *bdev = ctx->bdev; 8945 8946 spdk_spin_lock(&bdev->internal.spinlock); 8947 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 8948 spdk_spin_unlock(&bdev->internal.spinlock); 8949 8950 bdev_set_qos_limit_done(ctx, 0); 8951 } 8952 8953 static void 8954 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8955 struct spdk_io_channel *ch, void *_ctx) 8956 { 8957 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8958 8959 spdk_spin_lock(&bdev->internal.spinlock); 8960 bdev_enable_qos(bdev, bdev_ch); 8961 spdk_spin_unlock(&bdev->internal.spinlock); 8962 spdk_bdev_for_each_channel_continue(i, 0); 8963 } 8964 8965 static void 8966 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 8967 { 8968 struct set_qos_limit_ctx *ctx = _ctx; 8969 8970 bdev_set_qos_limit_done(ctx, status); 8971 } 8972 8973 static void 8974 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 8975 { 8976 int i; 8977 8978 assert(bdev->internal.qos != NULL); 8979 8980 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8981 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8982 bdev->internal.qos->rate_limits[i].limit = limits[i]; 8983 8984 if (limits[i] == 0) { 8985 bdev->internal.qos->rate_limits[i].limit = 8986 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 8987 } 8988 } 8989 } 8990 } 8991 8992 void 8993 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 8994 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 8995 { 8996 struct set_qos_limit_ctx *ctx; 8997 uint32_t limit_set_complement; 8998 uint64_t min_limit_per_sec; 8999 int i; 9000 bool disable_rate_limit = true; 9001 9002 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9003 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9004 continue; 9005 } 9006 9007 if (limits[i] > 0) { 9008 disable_rate_limit = false; 9009 } 9010 9011 if (bdev_qos_is_iops_rate_limit(i) == true) { 9012 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9013 } else { 9014 /* Change from megabyte to byte rate limit */ 9015 limits[i] = limits[i] * 1024 * 1024; 9016 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 
9017 } 9018 9019 limit_set_complement = limits[i] % min_limit_per_sec; 9020 if (limit_set_complement) { 9021 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9022 limits[i], min_limit_per_sec); 9023 limits[i] += min_limit_per_sec - limit_set_complement; 9024 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9025 } 9026 } 9027 9028 ctx = calloc(1, sizeof(*ctx)); 9029 if (ctx == NULL) { 9030 cb_fn(cb_arg, -ENOMEM); 9031 return; 9032 } 9033 9034 ctx->cb_fn = cb_fn; 9035 ctx->cb_arg = cb_arg; 9036 ctx->bdev = bdev; 9037 9038 spdk_spin_lock(&bdev->internal.spinlock); 9039 if (bdev->internal.qos_mod_in_progress) { 9040 spdk_spin_unlock(&bdev->internal.spinlock); 9041 free(ctx); 9042 cb_fn(cb_arg, -EAGAIN); 9043 return; 9044 } 9045 bdev->internal.qos_mod_in_progress = true; 9046 9047 if (disable_rate_limit == true && bdev->internal.qos) { 9048 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9049 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9050 (bdev->internal.qos->rate_limits[i].limit > 0 && 9051 bdev->internal.qos->rate_limits[i].limit != 9052 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9053 disable_rate_limit = false; 9054 break; 9055 } 9056 } 9057 } 9058 9059 if (disable_rate_limit == false) { 9060 if (bdev->internal.qos == NULL) { 9061 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9062 if (!bdev->internal.qos) { 9063 spdk_spin_unlock(&bdev->internal.spinlock); 9064 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9065 bdev_set_qos_limit_done(ctx, -ENOMEM); 9066 return; 9067 } 9068 } 9069 9070 if (bdev->internal.qos->thread == NULL) { 9071 /* Enabling */ 9072 bdev_set_qos_rate_limits(bdev, limits); 9073 9074 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9075 bdev_enable_qos_done); 9076 } else { 9077 /* Updating */ 9078 bdev_set_qos_rate_limits(bdev, limits); 9079 9080 spdk_thread_send_msg(bdev->internal.qos->thread, 9081 bdev_update_qos_rate_limit_msg, ctx); 9082 } 9083 } else { 9084 if (bdev->internal.qos != NULL) { 9085 bdev_set_qos_rate_limits(bdev, limits); 9086 9087 /* Disabling */ 9088 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9089 bdev_disable_qos_msg_done); 9090 } else { 9091 spdk_spin_unlock(&bdev->internal.spinlock); 9092 bdev_set_qos_limit_done(ctx, 0); 9093 return; 9094 } 9095 } 9096 9097 spdk_spin_unlock(&bdev->internal.spinlock); 9098 } 9099 9100 struct spdk_bdev_histogram_ctx { 9101 spdk_bdev_histogram_status_cb cb_fn; 9102 void *cb_arg; 9103 struct spdk_bdev *bdev; 9104 int status; 9105 }; 9106 9107 static void 9108 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9109 { 9110 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9111 9112 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9113 ctx->bdev->internal.histogram_in_progress = false; 9114 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9115 ctx->cb_fn(ctx->cb_arg, ctx->status); 9116 free(ctx); 9117 } 9118 9119 static void 9120 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9121 struct spdk_io_channel *_ch, void *_ctx) 9122 { 9123 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9124 9125 if (ch->histogram != NULL) { 9126 spdk_histogram_data_free(ch->histogram); 9127 ch->histogram = NULL; 9128 } 9129 spdk_bdev_for_each_channel_continue(i, 0); 9130 } 9131 9132 static void 9133 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9134 { 9135 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9136 9137 if (status 
!= 0) { 9138 ctx->status = status; 9139 ctx->bdev->internal.histogram_enabled = false; 9140 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9141 bdev_histogram_disable_channel_cb); 9142 } else { 9143 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9144 ctx->bdev->internal.histogram_in_progress = false; 9145 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9146 ctx->cb_fn(ctx->cb_arg, ctx->status); 9147 free(ctx); 9148 } 9149 } 9150 9151 static void 9152 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9153 struct spdk_io_channel *_ch, void *_ctx) 9154 { 9155 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9156 int status = 0; 9157 9158 if (ch->histogram == NULL) { 9159 ch->histogram = spdk_histogram_data_alloc(); 9160 if (ch->histogram == NULL) { 9161 status = -ENOMEM; 9162 } 9163 } 9164 9165 spdk_bdev_for_each_channel_continue(i, status); 9166 } 9167 9168 void 9169 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9170 void *cb_arg, bool enable) 9171 { 9172 struct spdk_bdev_histogram_ctx *ctx; 9173 9174 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9175 if (ctx == NULL) { 9176 cb_fn(cb_arg, -ENOMEM); 9177 return; 9178 } 9179 9180 ctx->bdev = bdev; 9181 ctx->status = 0; 9182 ctx->cb_fn = cb_fn; 9183 ctx->cb_arg = cb_arg; 9184 9185 spdk_spin_lock(&bdev->internal.spinlock); 9186 if (bdev->internal.histogram_in_progress) { 9187 spdk_spin_unlock(&bdev->internal.spinlock); 9188 free(ctx); 9189 cb_fn(cb_arg, -EAGAIN); 9190 return; 9191 } 9192 9193 bdev->internal.histogram_in_progress = true; 9194 spdk_spin_unlock(&bdev->internal.spinlock); 9195 9196 bdev->internal.histogram_enabled = enable; 9197 9198 if (enable) { 9199 /* Allocate histogram for each channel */ 9200 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9201 bdev_histogram_enable_channel_cb); 9202 } else { 9203 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9204 bdev_histogram_disable_channel_cb); 9205 } 9206 } 9207 9208 struct spdk_bdev_histogram_data_ctx { 9209 spdk_bdev_histogram_data_cb cb_fn; 9210 void *cb_arg; 9211 struct spdk_bdev *bdev; 9212 /** merged histogram data from all channels */ 9213 struct spdk_histogram_data *histogram; 9214 }; 9215 9216 static void 9217 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9218 { 9219 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9220 9221 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9222 free(ctx); 9223 } 9224 9225 static void 9226 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9227 struct spdk_io_channel *_ch, void *_ctx) 9228 { 9229 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9230 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9231 int status = 0; 9232 9233 if (ch->histogram == NULL) { 9234 status = -EFAULT; 9235 } else { 9236 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9237 } 9238 9239 spdk_bdev_for_each_channel_continue(i, status); 9240 } 9241 9242 void 9243 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9244 spdk_bdev_histogram_data_cb cb_fn, 9245 void *cb_arg) 9246 { 9247 struct spdk_bdev_histogram_data_ctx *ctx; 9248 9249 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9250 if (ctx == NULL) { 9251 cb_fn(cb_arg, -ENOMEM, NULL); 9252 return; 9253 } 9254 9255 ctx->bdev = bdev; 9256 ctx->cb_fn = cb_fn; 9257 ctx->cb_arg = cb_arg; 9258 9259 ctx->histogram = histogram; 
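/* Each channel's histogram is merged into the caller-provided structure as the
 * channels are iterated below, and the data callback then runs with the merged
 * result. A minimal usage sketch (hist_done is a hypothetical caller-side
 * function, not part of this file), assuming histograms were already enabled
 * with spdk_bdev_histogram_enable():
 *
 *   static void
 *   hist_done(void *cb_arg, int status, struct spdk_histogram_data *histogram)
 *   {
 *           if (status == 0) {
 *                   ... consume the merged latency data ...
 *           }
 *           spdk_histogram_data_free(histogram);
 *   }
 *
 *   struct spdk_histogram_data *histogram = spdk_histogram_data_alloc();
 *
 *   spdk_bdev_histogram_get(bdev, histogram, hist_done, NULL);
 */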
9260 9261 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9262 bdev_histogram_get_channel_cb); 9263 } 9264 9265 void 9266 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9267 void *cb_arg) 9268 { 9269 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9270 int status = 0; 9271 9272 assert(cb_fn != NULL); 9273 9274 if (bdev_ch->histogram == NULL) { 9275 status = -EFAULT; 9276 } 9277 cb_fn(cb_arg, status, bdev_ch->histogram); 9278 } 9279 9280 size_t 9281 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9282 size_t max_events) 9283 { 9284 struct media_event_entry *entry; 9285 size_t num_events = 0; 9286 9287 for (; num_events < max_events; ++num_events) { 9288 entry = TAILQ_FIRST(&desc->pending_media_events); 9289 if (entry == NULL) { 9290 break; 9291 } 9292 9293 events[num_events] = entry->event; 9294 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9295 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9296 } 9297 9298 return num_events; 9299 } 9300 9301 int 9302 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9303 size_t num_events) 9304 { 9305 struct spdk_bdev_desc *desc; 9306 struct media_event_entry *entry; 9307 size_t event_id; 9308 int rc = 0; 9309 9310 assert(bdev->media_events); 9311 9312 spdk_spin_lock(&bdev->internal.spinlock); 9313 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9314 if (desc->write) { 9315 break; 9316 } 9317 } 9318 9319 if (desc == NULL || desc->media_events_buffer == NULL) { 9320 rc = -ENODEV; 9321 goto out; 9322 } 9323 9324 for (event_id = 0; event_id < num_events; ++event_id) { 9325 entry = TAILQ_FIRST(&desc->free_media_events); 9326 if (entry == NULL) { 9327 break; 9328 } 9329 9330 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9331 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9332 entry->event = events[event_id]; 9333 } 9334 9335 rc = event_id; 9336 out: 9337 spdk_spin_unlock(&bdev->internal.spinlock); 9338 return rc; 9339 } 9340 9341 static void 9342 _media_management_notify(void *arg) 9343 { 9344 struct spdk_bdev_desc *desc = arg; 9345 9346 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9347 } 9348 9349 void 9350 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9351 { 9352 struct spdk_bdev_desc *desc; 9353 9354 spdk_spin_lock(&bdev->internal.spinlock); 9355 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9356 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9357 event_notify(desc, _media_management_notify); 9358 } 9359 } 9360 spdk_spin_unlock(&bdev->internal.spinlock); 9361 } 9362 9363 struct locked_lba_range_ctx { 9364 struct lba_range range; 9365 struct lba_range *current_range; 9366 struct lba_range *owner_range; 9367 struct spdk_poller *poller; 9368 lock_range_cb cb_fn; 9369 void *cb_arg; 9370 }; 9371 9372 static void 9373 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9374 { 9375 struct locked_lba_range_ctx *ctx = _ctx; 9376 9377 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9378 free(ctx); 9379 } 9380 9381 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9382 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9383 9384 static void 9385 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9386 { 9387 struct locked_lba_range_ctx *ctx = _ctx; 9388 9389 if (status == -ENOMEM) { 9390 /* One of the channels could not allocate a 
range object. 9391 * So we have to go back and clean up any ranges that were 9392 * allocated successfully before we return error status to 9393 * the caller. We can reuse the unlock function to do that 9394 * clean up. 9395 */ 9396 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9397 bdev_lock_error_cleanup_cb); 9398 return; 9399 } 9400 9401 /* All channels have locked this range and no I/O overlapping the range 9402 * are outstanding! Set the owner_ch for the range object for the 9403 * locking channel, so that this channel will know that it is allowed 9404 * to write to this range. 9405 */ 9406 if (ctx->owner_range != NULL) { 9407 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9408 } 9409 9410 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9411 9412 /* Don't free the ctx here. Its range is in the bdev's global list of 9413 * locked ranges still, and will be removed and freed when this range 9414 * is later unlocked. 9415 */ 9416 } 9417 9418 static int 9419 bdev_lock_lba_range_check_io(void *_i) 9420 { 9421 struct spdk_bdev_channel_iter *i = _i; 9422 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9423 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9424 struct locked_lba_range_ctx *ctx = i->ctx; 9425 struct lba_range *range = ctx->current_range; 9426 struct spdk_bdev_io *bdev_io; 9427 9428 spdk_poller_unregister(&ctx->poller); 9429 9430 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9431 * range. But we need to wait until any outstanding IO overlapping with this range 9432 * are completed. 9433 */ 9434 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9435 if (bdev_io_range_is_locked(bdev_io, range)) { 9436 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9437 return SPDK_POLLER_BUSY; 9438 } 9439 } 9440 9441 spdk_bdev_for_each_channel_continue(i, 0); 9442 return SPDK_POLLER_BUSY; 9443 } 9444 9445 static void 9446 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9447 struct spdk_io_channel *_ch, void *_ctx) 9448 { 9449 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9450 struct locked_lba_range_ctx *ctx = _ctx; 9451 struct lba_range *range; 9452 9453 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9454 if (range->length == ctx->range.length && 9455 range->offset == ctx->range.offset && 9456 range->locked_ctx == ctx->range.locked_ctx) { 9457 /* This range already exists on this channel, so don't add 9458 * it again. This can happen when a new channel is created 9459 * while the for_each_channel operation is in progress. 9460 * Do not check for outstanding I/O in that case, since the 9461 * range was locked before any I/O could be submitted to the 9462 * new channel. 9463 */ 9464 spdk_bdev_for_each_channel_continue(i, 0); 9465 return; 9466 } 9467 } 9468 9469 range = calloc(1, sizeof(*range)); 9470 if (range == NULL) { 9471 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9472 return; 9473 } 9474 9475 range->length = ctx->range.length; 9476 range->offset = ctx->range.offset; 9477 range->locked_ctx = ctx->range.locked_ctx; 9478 ctx->current_range = range; 9479 if (ctx->range.owner_ch == ch) { 9480 /* This is the range object for the channel that will hold 9481 * the lock. Store it in the ctx object so that we can easily 9482 * set its owner_ch after the lock is finally acquired. 
9483 */ 9484 ctx->owner_range = range; 9485 } 9486 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9487 bdev_lock_lba_range_check_io(i); 9488 } 9489 9490 static void 9491 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9492 { 9493 assert(spdk_get_thread() == ctx->range.owner_thread); 9494 assert(ctx->range.owner_ch == NULL || 9495 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 9496 9497 /* We will add a copy of this range to each channel now. */ 9498 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9499 bdev_lock_lba_range_cb); 9500 } 9501 9502 static bool 9503 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9504 { 9505 struct lba_range *r; 9506 9507 TAILQ_FOREACH(r, tailq, tailq) { 9508 if (bdev_lba_range_overlapped(range, r)) { 9509 return true; 9510 } 9511 } 9512 return false; 9513 } 9514 9515 static int 9516 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 9517 uint64_t offset, uint64_t length, 9518 lock_range_cb cb_fn, void *cb_arg) 9519 { 9520 struct locked_lba_range_ctx *ctx; 9521 9522 ctx = calloc(1, sizeof(*ctx)); 9523 if (ctx == NULL) { 9524 return -ENOMEM; 9525 } 9526 9527 ctx->range.offset = offset; 9528 ctx->range.length = length; 9529 ctx->range.owner_thread = spdk_get_thread(); 9530 ctx->range.owner_ch = ch; 9531 ctx->range.locked_ctx = cb_arg; 9532 ctx->range.bdev = bdev; 9533 ctx->cb_fn = cb_fn; 9534 ctx->cb_arg = cb_arg; 9535 9536 spdk_spin_lock(&bdev->internal.spinlock); 9537 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 9538 /* There is an active lock overlapping with this range. 9539 * Put it on the pending list until this range no 9540 * longer overlaps with another. 9541 */ 9542 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 9543 } else { 9544 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 9545 bdev_lock_lba_range_ctx(bdev, ctx); 9546 } 9547 spdk_spin_unlock(&bdev->internal.spinlock); 9548 return 0; 9549 } 9550 9551 static int 9552 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9553 uint64_t offset, uint64_t length, 9554 lock_range_cb cb_fn, void *cb_arg) 9555 { 9556 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9557 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9558 9559 if (cb_arg == NULL) { 9560 SPDK_ERRLOG("cb_arg must not be NULL\n"); 9561 return -EINVAL; 9562 } 9563 9564 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 9565 } 9566 9567 static void 9568 bdev_lock_lba_range_ctx_msg(void *_ctx) 9569 { 9570 struct locked_lba_range_ctx *ctx = _ctx; 9571 9572 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 9573 } 9574 9575 static void 9576 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9577 { 9578 struct locked_lba_range_ctx *ctx = _ctx; 9579 struct locked_lba_range_ctx *pending_ctx; 9580 struct lba_range *range, *tmp; 9581 9582 spdk_spin_lock(&bdev->internal.spinlock); 9583 /* Check if there are any pending locked ranges that overlap with this range 9584 * that was just unlocked. If there are, check that it doesn't overlap with any 9585 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 9586 * the lock process. 
9587 */ 9588 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 9589 if (bdev_lba_range_overlapped(range, &ctx->range) && 9590 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 9591 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 9592 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9593 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 9594 spdk_thread_send_msg(pending_ctx->range.owner_thread, 9595 bdev_lock_lba_range_ctx_msg, pending_ctx); 9596 } 9597 } 9598 spdk_spin_unlock(&bdev->internal.spinlock); 9599 9600 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9601 free(ctx); 9602 } 9603 9604 static void 9605 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9606 struct spdk_io_channel *_ch, void *_ctx) 9607 { 9608 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9609 struct locked_lba_range_ctx *ctx = _ctx; 9610 TAILQ_HEAD(, spdk_bdev_io) io_locked; 9611 struct spdk_bdev_io *bdev_io; 9612 struct lba_range *range; 9613 9614 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9615 if (ctx->range.offset == range->offset && 9616 ctx->range.length == range->length && 9617 ctx->range.locked_ctx == range->locked_ctx) { 9618 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 9619 free(range); 9620 break; 9621 } 9622 } 9623 9624 /* Note: we should almost always be able to assert that the range specified 9625 * was found. But there are some very rare corner cases where a new channel 9626 * gets created simultaneously with a range unlock, where this function 9627 * would execute on that new channel and wouldn't have the range. 9628 * We also use this to clean up range allocations when a later allocation 9629 * fails in the locking path. 9630 * So we can't actually assert() here. 9631 */ 9632 9633 /* Swap the locked I/O into a temporary list, and then try to submit them again. 9634 * We could hyper-optimize this to only resubmit locked I/O that overlap 9635 * with the range that was just unlocked, but this isn't a performance path so 9636 * we go for simplicity here. 9637 */ 9638 TAILQ_INIT(&io_locked); 9639 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 9640 while (!TAILQ_EMPTY(&io_locked)) { 9641 bdev_io = TAILQ_FIRST(&io_locked); 9642 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 9643 bdev_io_submit(bdev_io); 9644 } 9645 9646 spdk_bdev_for_each_channel_continue(i, 0); 9647 } 9648 9649 static int 9650 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 9651 lock_range_cb cb_fn, void *cb_arg) 9652 { 9653 struct locked_lba_range_ctx *ctx; 9654 struct lba_range *range; 9655 9656 spdk_spin_lock(&bdev->internal.spinlock); 9657 /* To start the unlock process, we find the range in the bdev's locked_ranges 9658 * and remove it. This ensures new channels don't inherit the locked range. 9659 * Then we will send a message to each channel to remove the range from its 9660 * per-channel list.
9661 */ 9662 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 9663 if (range->offset == offset && range->length == length && 9664 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 9665 break; 9666 } 9667 } 9668 if (range == NULL) { 9669 assert(false); 9670 spdk_spin_unlock(&bdev->internal.spinlock); 9671 return -EINVAL; 9672 } 9673 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 9674 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9675 spdk_spin_unlock(&bdev->internal.spinlock); 9676 9677 ctx->cb_fn = cb_fn; 9678 ctx->cb_arg = cb_arg; 9679 9680 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9681 bdev_unlock_lba_range_cb); 9682 return 0; 9683 } 9684 9685 static int 9686 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9687 uint64_t offset, uint64_t length, 9688 lock_range_cb cb_fn, void *cb_arg) 9689 { 9690 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9691 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9692 struct lba_range *range; 9693 bool range_found = false; 9694 9695 /* Let's make sure the specified channel actually has a lock on 9696 * the specified range. Note that the range must match exactly. 9697 */ 9698 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9699 if (range->offset == offset && range->length == length && 9700 range->owner_ch == ch && range->locked_ctx == cb_arg) { 9701 range_found = true; 9702 break; 9703 } 9704 } 9705 9706 if (!range_found) { 9707 return -EINVAL; 9708 } 9709 9710 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 9711 } 9712 9713 struct bdev_quiesce_ctx { 9714 spdk_bdev_quiesce_cb cb_fn; 9715 void *cb_arg; 9716 }; 9717 9718 static void 9719 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 9720 { 9721 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9722 9723 if (quiesce_ctx->cb_fn != NULL) { 9724 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9725 } 9726 9727 free(quiesce_ctx); 9728 } 9729 9730 static void 9731 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 9732 { 9733 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9734 struct spdk_bdev_module *module = range->bdev->module; 9735 9736 if (status != 0) { 9737 if (quiesce_ctx->cb_fn != NULL) { 9738 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9739 } 9740 free(quiesce_ctx); 9741 return; 9742 } 9743 9744 spdk_spin_lock(&module->internal.spinlock); 9745 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 9746 spdk_spin_unlock(&module->internal.spinlock); 9747 9748 if (quiesce_ctx->cb_fn != NULL) { 9749 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9750 quiesce_ctx->cb_fn = NULL; 9751 quiesce_ctx->cb_arg = NULL; 9752 } 9753 /* quiesce_ctx will be freed on unquiesce */ 9754 } 9755 9756 static int 9757 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9758 uint64_t offset, uint64_t length, 9759 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 9760 bool unquiesce) 9761 { 9762 struct bdev_quiesce_ctx *quiesce_ctx; 9763 int rc; 9764 9765 if (module != bdev->module) { 9766 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 9767 return -EINVAL; 9768 } 9769 9770 if (!bdev_io_valid_blocks(bdev, offset, length)) { 9771 return -EINVAL; 9772 } 9773 9774 if (unquiesce) { 9775 struct lba_range *range; 9776 9777 /* Make sure the specified range is actually quiesced in the specified module and 9778 * then remove it from the list. Note that the range must match exactly. 
9779 */ 9780 spdk_spin_lock(&module->internal.spinlock); 9781 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 9782 if (range->bdev == bdev && range->offset == offset && range->length == length) { 9783 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 9784 break; 9785 } 9786 } 9787 spdk_spin_unlock(&module->internal.spinlock); 9788 9789 if (range == NULL) { 9790 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 9791 return -EINVAL; 9792 } 9793 9794 quiesce_ctx = range->locked_ctx; 9795 quiesce_ctx->cb_fn = cb_fn; 9796 quiesce_ctx->cb_arg = cb_arg; 9797 9798 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 9799 } else { 9800 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 9801 if (quiesce_ctx == NULL) { 9802 return -ENOMEM; 9803 } 9804 9805 quiesce_ctx->cb_fn = cb_fn; 9806 quiesce_ctx->cb_arg = cb_arg; 9807 9808 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 9809 if (rc != 0) { 9810 free(quiesce_ctx); 9811 } 9812 } 9813 9814 return rc; 9815 } 9816 9817 int 9818 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9819 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9820 { 9821 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 9822 } 9823 9824 int 9825 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9826 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9827 { 9828 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 9829 } 9830 9831 int 9832 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9833 uint64_t offset, uint64_t length, 9834 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9835 { 9836 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 9837 } 9838 9839 int 9840 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9841 uint64_t offset, uint64_t length, 9842 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9843 { 9844 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 9845 } 9846 9847 int 9848 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 9849 int array_size) 9850 { 9851 if (!bdev) { 9852 return -EINVAL; 9853 } 9854 9855 if (bdev->fn_table->get_memory_domains) { 9856 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 9857 } 9858 9859 return 0; 9860 } 9861 9862 struct spdk_bdev_for_each_io_ctx { 9863 void *ctx; 9864 spdk_bdev_io_fn fn; 9865 spdk_bdev_for_each_io_cb cb; 9866 }; 9867 9868 static void 9869 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9870 struct spdk_io_channel *io_ch, void *_ctx) 9871 { 9872 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 9873 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 9874 struct spdk_bdev_io *bdev_io; 9875 int rc = 0; 9876 9877 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 9878 rc = ctx->fn(ctx->ctx, bdev_io); 9879 if (rc != 0) { 9880 break; 9881 } 9882 } 9883 9884 spdk_bdev_for_each_channel_continue(i, rc); 9885 } 9886 9887 static void 9888 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 9889 { 9890 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 9891 9892 ctx->cb(ctx->ctx, status); 9893 9894 free(ctx); 9895 } 9896 9897 void 9898 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 9899 spdk_bdev_for_each_io_cb cb) 9900 { 9901 
struct spdk_bdev_for_each_io_ctx *ctx; 9902 9903 assert(fn != NULL && cb != NULL); 9904 9905 ctx = calloc(1, sizeof(*ctx)); 9906 if (ctx == NULL) { 9907 SPDK_ERRLOG("Failed to allocate context.\n"); 9908 cb(_ctx, -ENOMEM); 9909 return; 9910 } 9911 9912 ctx->ctx = _ctx; 9913 ctx->fn = fn; 9914 ctx->cb = cb; 9915 9916 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 9917 bdev_for_each_io_done); 9918 } 9919 9920 void 9921 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 9922 { 9923 spdk_for_each_channel_continue(iter->i, status); 9924 } 9925 9926 static struct spdk_bdev * 9927 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 9928 { 9929 void *io_device = spdk_io_channel_iter_get_io_device(i); 9930 9931 return __bdev_from_io_dev(io_device); 9932 } 9933 9934 static void 9935 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 9936 { 9937 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 9938 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 9939 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 9940 9941 iter->i = i; 9942 iter->fn(iter, bdev, ch, iter->ctx); 9943 } 9944 9945 static void 9946 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 9947 { 9948 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 9949 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 9950 9951 iter->i = i; 9952 iter->cpl(bdev, iter->ctx, status); 9953 9954 free(iter); 9955 } 9956 9957 void 9958 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 9959 void *ctx, spdk_bdev_for_each_channel_done cpl) 9960 { 9961 struct spdk_bdev_channel_iter *iter; 9962 9963 assert(bdev != NULL && fn != NULL && ctx != NULL); 9964 9965 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 9966 if (iter == NULL) { 9967 SPDK_ERRLOG("Unable to allocate iterator\n"); 9968 assert(false); 9969 return; 9970 } 9971 9972 iter->fn = fn; 9973 iter->cpl = cpl; 9974 iter->ctx = ctx; 9975 9976 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 9977 iter, bdev_each_channel_cpl); 9978 } 9979 9980 static void 9981 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9982 { 9983 struct spdk_bdev_io *parent_io = cb_arg; 9984 9985 spdk_bdev_free_io(bdev_io); 9986 9987 /* Check return status of write */ 9988 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9989 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9990 } 9991 9992 static void 9993 bdev_copy_do_write(void *_bdev_io) 9994 { 9995 struct spdk_bdev_io *bdev_io = _bdev_io; 9996 int rc; 9997 9998 /* Write blocks */ 9999 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10000 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10001 bdev_io->u.bdev.iovs[0].iov_base, 10002 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10003 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10004 10005 if (rc == -ENOMEM) { 10006 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10007 } else if (rc != 0) { 10008 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10009 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10010 } 10011 } 10012 10013 static void 10014 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10015 { 10016 struct spdk_bdev_io *parent_io = cb_arg; 10017 10018 spdk_bdev_free_io(bdev_io); 10019 10020 /* Check return status of read */ 10021 if (!success) { 10022 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10023 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10024 return; 10025 } 10026 10027 /* Do write */ 10028 bdev_copy_do_write(parent_io); 10029 } 10030 10031 static void 10032 bdev_copy_do_read(void *_bdev_io) 10033 { 10034 struct spdk_bdev_io *bdev_io = _bdev_io; 10035 int rc; 10036 10037 /* Read blocks */ 10038 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10039 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10040 bdev_io->u.bdev.iovs[0].iov_base, 10041 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10042 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10043 10044 if (rc == -ENOMEM) { 10045 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10046 } else if (rc != 0) { 10047 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10048 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10049 } 10050 } 10051 10052 static void 10053 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10054 { 10055 if (!success) { 10056 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10057 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10058 return; 10059 } 10060 10061 bdev_copy_do_read(bdev_io); 10062 } 10063 10064 int 10065 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10066 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10067 spdk_bdev_io_completion_cb cb, void *cb_arg) 10068 { 10069 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10070 struct spdk_bdev_io *bdev_io; 10071 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10072 10073 if (!desc->write) { 10074 return -EBADF; 10075 } 10076 10077 if (num_blocks == 0) { 10078 SPDK_ERRLOG("Can't copy 0 blocks\n"); 10079 return -EINVAL; 10080 } 10081 10082 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10083 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10084 SPDK_DEBUGLOG(bdev, 10085 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10086 dst_offset_blocks, src_offset_blocks, num_blocks); 10087 return -EINVAL; 10088 } 10089 10090 bdev_io = bdev_channel_get_io(channel); 10091 if (!bdev_io) { 10092 return -ENOMEM; 10093 } 10094 10095 bdev_io->internal.ch = channel; 10096 
bdev_io->internal.desc = desc; 10097 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10098 10099 bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10100 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10101 bdev_io->u.bdev.num_blocks = num_blocks; 10102 bdev_io->u.bdev.memory_domain = NULL; 10103 bdev_io->u.bdev.memory_domain_ctx = NULL; 10104 bdev_io->u.bdev.iovs = NULL; 10105 bdev_io->u.bdev.iovcnt = 0; 10106 bdev_io->u.bdev.md_buf = NULL; 10107 bdev_io->u.bdev.accel_sequence = NULL; 10108 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10109 10110 if (dst_offset_blocks == src_offset_blocks) { 10111 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 10112 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 10113 10114 return 0; 10115 } 10116 10117 10118 /* If the copy size is large and should be split, use the generic split logic 10119 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10120 * 10121 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10122 * emulate it using regular read and write requests otherwise. 10123 */ 10124 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10125 bdev_io->internal.split) { 10126 bdev_io_submit(bdev_io); 10127 return 0; 10128 } 10129 10130 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10131 10132 return 0; 10133 } 10134 10135 SPDK_LOG_REGISTER_COMPONENT(bdev) 10136 10137 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10138 { 10139 struct spdk_trace_tpoint_opts opts[] = { 10140 { 10141 "BDEV_IO_START", TRACE_BDEV_IO_START, 10142 OWNER_BDEV, OBJECT_BDEV_IO, 1, 10143 { 10144 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10145 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10146 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10147 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10148 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 10149 } 10150 }, 10151 { 10152 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10153 OWNER_BDEV, OBJECT_BDEV_IO, 0, 10154 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 10155 }, 10156 { 10157 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10158 OWNER_BDEV, OBJECT_NONE, 1, 10159 { 10160 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 10161 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 10162 } 10163 }, 10164 { 10165 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10166 OWNER_BDEV, OBJECT_NONE, 0, 10167 { 10168 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 10169 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 10170 } 10171 }, 10172 }; 10173 10174 10175 spdk_trace_register_owner(OWNER_BDEV, 'b'); 10176 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10177 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10178 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10179 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10180 } 10181
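/*
 * Illustrative usage sketch, not part of the bdev library itself: a caller of
 * spdk_bdev_channel_get_histogram() defined above. The call is synchronous
 * (the callback runs inline) and reports -EFAULT when histograms were never
 * enabled on that channel. Names prefixed with "example_" are hypothetical.
 */
static void
example_channel_histogram_cb(void *cb_arg, int status, struct spdk_histogram_data *histogram)
{
	if (status != 0) {
		SPDK_NOTICELOG("no per-channel histogram available (status %d)\n", status);
		return;
	}

	/* 'histogram' points at the channel's live histogram; it may be
	 * inspected here (e.g. with the helpers from spdk/histogram_data.h).
	 */
}

static void
example_dump_channel_histogram(struct spdk_io_channel *ch)
{
	/* Must run on the thread that owns 'ch', like any channel operation. */
	spdk_bdev_channel_get_histogram(ch, example_channel_histogram_cb, NULL);
}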
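/*
 * Illustrative sketch of the media management event flow implemented above,
 * not part of the bdev library itself. The producer is a bdev module whose
 * bdev was registered with media_events set (asserted in
 * spdk_bdev_push_media_events()); the consumer is a descriptor whose event
 * callback drains the queue on SPDK_BDEV_EVENT_MEDIA_MANAGEMENT. The
 * offset/num_blocks fields are assumed to match spdk/bdev.h, and names
 * prefixed with "example_" are hypothetical.
 */
static void
example_report_media_event(struct spdk_bdev *bdev, uint64_t offset, uint64_t num_blocks)
{
	const struct spdk_bdev_media_event event = {
		.offset = offset,
		.num_blocks = num_blocks,
	};

	/* Queue the event on the first writable descriptor's buffer, then wake
	 * up every descriptor that has pending events.
	 */
	if (spdk_bdev_push_media_events(bdev, &event, 1) > 0) {
		spdk_bdev_notify_media_management(bdev);
	}
}

struct example_media_consumer {
	struct spdk_bdev_desc *desc;
};

static void
example_media_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
{
	struct example_media_consumer *consumer = event_ctx;
	struct spdk_bdev_media_event events[8];
	size_t i, num;

	if (type != SPDK_BDEV_EVENT_MEDIA_MANAGEMENT) {
		return;
	}

	num = spdk_bdev_get_media_events(consumer->desc, events, SPDK_COUNTOF(events));
	for (i = 0; i < num; i++) {
		SPDK_NOTICELOG("media event: offset %" PRIu64 ", num_blocks %" PRIu64 "\n",
			       events[i].offset, events[i].num_blocks);
	}
}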
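/*
 * Illustrative sketch of the file-internal LBA range lock contract, not part
 * of the public API. bdev_lock_lba_range() reports through lock_range_cb once
 * every channel holds the range and all overlapping outstanding I/O has
 * drained; the same non-NULL cb_arg later identifies the lock to
 * bdev_unlock_lba_range(). The "example_" names and context struct are
 * hypothetical.
 */
struct example_lock_ctx {
	struct spdk_bdev_desc *desc;
	struct spdk_io_channel *ch;
};

static void
example_range_unlocked(struct lba_range *range, void *ctx, int status)
{
	free(ctx);
}

static void
example_range_locked(struct lba_range *range, void *ctx, int status)
{
	struct example_lock_ctx *lock_ctx = ctx;

	if (status != 0) {
		SPDK_ERRLOG("failed to lock LBA range: %d\n", status);
		free(lock_ctx);
		return;
	}

	/* The locking channel now has exclusive write access to
	 * [range->offset, range->offset + range->length); release it with the
	 * same cb_arg that was used to take the lock.
	 */
	bdev_unlock_lba_range(lock_ctx->desc, lock_ctx->ch, range->offset, range->length,
			      example_range_unlocked, lock_ctx);
}

static int
example_lock_first_block(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
{
	struct example_lock_ctx *lock_ctx;
	int rc;

	lock_ctx = calloc(1, sizeof(*lock_ctx));
	if (lock_ctx == NULL) {
		return -ENOMEM;
	}

	lock_ctx->desc = desc;
	lock_ctx->ch = ch;

	rc = bdev_lock_lba_range(desc, ch, 0, 1, example_range_locked, lock_ctx);
	if (rc != 0) {
		free(lock_ctx);
	}

	return rc;
}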
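/*
 * Illustrative sketch of the quiesce API defined above, not part of the bdev
 * library itself. A bdev module quiesces its own bdev (the module pointer must
 * match bdev->module or -EINVAL is returned), performs maintenance once the
 * callback reports success, and then unquiesces the same range. Names prefixed
 * with "example_" are hypothetical.
 */
static void
example_unquiesce_done(void *cb_arg, int status)
{
	SPDK_NOTICELOG("unquiesce completed with status %d\n", status);
}

static void
example_quiesce_done(void *cb_arg, int status)
{
	struct spdk_bdev *bdev = cb_arg;

	if (status != 0) {
		SPDK_ERRLOG("quiesce failed: %d\n", status);
		return;
	}

	/* No new I/O is submitted to the bdev until it is unquiesced; perform
	 * the maintenance work here, then release the quiesced range.
	 */
	spdk_bdev_unquiesce(bdev, bdev->module, example_unquiesce_done, NULL);
}

static int
example_quiesce_whole_bdev(struct spdk_bdev *bdev)
{
	/* Covers blocks [0, blockcnt); spdk_bdev_quiesce_range() limits the scope. */
	return spdk_bdev_quiesce(bdev, bdev->module, example_quiesce_done, bdev);
}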
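/*
 * Illustrative sketch of spdk_bdev_for_each_bdev_io() defined above, not part
 * of the bdev library itself. The per-I/O callback runs on each channel's
 * thread while the channels are visited one at a time; returning non-zero
 * stops the iteration and that value is reported to the completion callback.
 * Names prefixed with "example_" are hypothetical.
 */
struct example_io_count_ctx {
	uint64_t count;
};

static int
example_count_one_io(void *ctx, struct spdk_bdev_io *bdev_io)
{
	struct example_io_count_ctx *count_ctx = ctx;

	count_ctx->count++;
	return 0;
}

static void
example_count_done(void *ctx, int status)
{
	struct example_io_count_ctx *count_ctx = ctx;

	SPDK_NOTICELOG("found %" PRIu64 " submitted I/Os (status %d)\n",
		       count_ctx->count, status);
	free(count_ctx);
}

static void
example_count_submitted_io(struct spdk_bdev *bdev)
{
	struct example_io_count_ctx *count_ctx;

	count_ctx = calloc(1, sizeof(*count_ctx));
	if (count_ctx == NULL) {
		return;
	}

	spdk_bdev_for_each_bdev_io(bdev, count_ctx, example_count_one_io, example_count_done);
}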
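/*
 * Illustrative sketch of a spdk_bdev_copy_blocks() caller, not part of the
 * bdev library itself. The descriptor must have been opened for write or
 * -EBADF is returned; a copy with identical source and destination completes
 * immediately with success; -ENOMEM means the request should be retried later
 * (for example via spdk_bdev_queue_io_wait()). Names prefixed with "example_"
 * are hypothetical.
 */
static void
example_copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	SPDK_NOTICELOG("copy %s\n", success ? "succeeded" : "failed");
	spdk_bdev_free_io(bdev_io);
}

static int
example_copy_one_block(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
{
	/* Copy one block from LBA 0 to LBA 1. */
	return spdk_bdev_copy_blocks(desc, ch, 1 /* dst */, 0 /* src */, 1 /* num_blocks */,
				     example_copy_done, NULL);
}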