/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2016 Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *   Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD	1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
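 *
 * Illustrative note (not part of the original comment): when a COPY spans many
 * child-sized chunks, the splitting code keeps at most this many children
 * outstanding and issues the next child as each one completes, so the cost of
 * a huge parent I/O stays bounded, roughly:
 *
 *   children_in_flight = spdk_min(remaining_children, SPDK_BDEV_MAX_CHILDREN_COPY_REQS)
 *
 * The UNMAP and WRITE ZEROES limit above is applied the same way.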
63 */ 64 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 65 66 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 67 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 68 #ifdef DEBUG 69 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 70 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 71 #else 72 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 73 #endif 74 75 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 76 const char *detail, struct spdk_bdev *bdev); 77 78 SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "v23.09", 0); 79 80 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 81 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 82 }; 83 84 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 85 86 RB_HEAD(bdev_name_tree, spdk_bdev_name); 87 88 static int 89 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 90 { 91 return strcmp(name1->name, name2->name); 92 } 93 94 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 95 96 struct spdk_bdev_mgr { 97 struct spdk_mempool *bdev_io_pool; 98 99 void *zero_buffer; 100 101 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 102 103 struct spdk_bdev_list bdevs; 104 struct bdev_name_tree bdev_names; 105 106 bool init_complete; 107 bool module_init_complete; 108 109 struct spdk_spinlock spinlock; 110 111 #ifdef SPDK_CONFIG_VTUNE 112 __itt_domain *domain; 113 #endif 114 }; 115 116 static struct spdk_bdev_mgr g_bdev_mgr = { 117 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 118 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 119 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 120 .init_complete = false, 121 .module_init_complete = false, 122 }; 123 124 static void 125 __attribute__((constructor)) 126 _bdev_init(void) 127 { 128 spdk_spin_init(&g_bdev_mgr.spinlock); 129 } 130 131 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status); 132 133 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 134 135 struct lba_range { 136 struct spdk_bdev *bdev; 137 uint64_t offset; 138 uint64_t length; 139 void *locked_ctx; 140 struct spdk_thread *owner_thread; 141 struct spdk_bdev_channel *owner_ch; 142 TAILQ_ENTRY(lba_range) tailq; 143 TAILQ_ENTRY(lba_range) tailq_module; 144 }; 145 146 static struct spdk_bdev_opts g_bdev_opts = { 147 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 148 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 149 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 150 }; 151 152 static spdk_bdev_init_cb g_init_cb_fn = NULL; 153 static void *g_init_cb_arg = NULL; 154 155 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 156 static void *g_fini_cb_arg = NULL; 157 static struct spdk_thread *g_fini_thread = NULL; 158 159 struct spdk_bdev_qos_limit { 160 /** IOs or bytes allowed per second (i.e., 1s). */ 161 uint64_t limit; 162 163 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 164 * For remaining bytes, allowed to run negative if an I/O is submitted when 165 * some bytes are remaining, but the I/O is bigger than that amount. The 166 * excess will be deducted from the next timeslice. 167 */ 168 int64_t remaining_this_timeslice; 169 170 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 171 uint32_t min_per_timeslice; 172 173 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). 
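	 *
	 * Hedged illustration of how this is sized (the actual math lives
	 * elsewhere in this file): with a limit of 10000 IO/s and a 1000 us
	 * timeslice,
	 *
	 *   max_per_timeslice ~= 10000 * 1000 / 1000000 = 10 IOs per timeslice,
	 *
	 * and the result is clamped up to min_per_timeslice so very small limits
	 * still make progress every slice.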
*/ 174 uint32_t max_per_timeslice; 175 176 /** Function to check whether to queue the IO. */ 177 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 178 179 /** Function to update for the submitted IO. */ 180 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 181 }; 182 183 struct spdk_bdev_qos { 184 /** Types of structure of rate limits. */ 185 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 186 187 /** The channel that all I/O are funneled through. */ 188 struct spdk_bdev_channel *ch; 189 190 /** The thread on which the poller is running. */ 191 struct spdk_thread *thread; 192 193 /** Queue of I/O waiting to be issued. */ 194 bdev_io_tailq_t queued; 195 196 /** Size of a timeslice in tsc ticks. */ 197 uint64_t timeslice_size; 198 199 /** Timestamp of start of last timeslice. */ 200 uint64_t last_timeslice; 201 202 /** Poller that processes queued I/O commands each time slice. */ 203 struct spdk_poller *poller; 204 }; 205 206 struct spdk_bdev_mgmt_channel { 207 /* 208 * Each thread keeps a cache of bdev_io - this allows 209 * bdev threads which are *not* DPDK threads to still 210 * benefit from a per-thread bdev_io cache. Without 211 * this, non-DPDK threads fetching from the mempool 212 * incur a cmpxchg on get and put. 213 */ 214 bdev_io_stailq_t per_thread_cache; 215 uint32_t per_thread_cache_count; 216 uint32_t bdev_io_cache_size; 217 218 struct spdk_iobuf_channel iobuf; 219 220 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 221 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 222 }; 223 224 /* 225 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 226 * will queue here their IO that awaits retry. It makes it possible to retry sending 227 * IO to one bdev after IO from other bdev completes. 228 */ 229 struct spdk_bdev_shared_resource { 230 /* The bdev management channel */ 231 struct spdk_bdev_mgmt_channel *mgmt_ch; 232 233 /* 234 * Count of I/O submitted to bdev module and waiting for completion. 235 * Incremented before submit_request() is called on an spdk_bdev_io. 236 */ 237 uint64_t io_outstanding; 238 239 /* 240 * Queue of IO awaiting retry because of a previous NOMEM status returned 241 * on this channel. 242 */ 243 bdev_io_tailq_t nomem_io; 244 245 /* 246 * Threshold which io_outstanding must drop to before retrying nomem_io. 247 */ 248 uint64_t nomem_threshold; 249 250 /* I/O channel allocated by a bdev module */ 251 struct spdk_io_channel *shared_ch; 252 253 /* Refcount of bdev channels using this resource */ 254 uint32_t ref; 255 256 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 257 }; 258 259 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 260 #define BDEV_CH_QOS_ENABLED (1 << 1) 261 262 struct spdk_bdev_channel { 263 struct spdk_bdev *bdev; 264 265 /* The channel for the underlying device */ 266 struct spdk_io_channel *channel; 267 268 /* Accel channel */ 269 struct spdk_io_channel *accel_channel; 270 271 /* Per io_device per thread data */ 272 struct spdk_bdev_shared_resource *shared_resource; 273 274 struct spdk_bdev_io_stat *stat; 275 276 /* 277 * Count of I/O submitted to the underlying dev module through this channel 278 * and waiting for completion. 279 */ 280 uint64_t io_outstanding; 281 282 /* 283 * List of all submitted I/Os including I/O that are generated via splitting. 284 */ 285 bdev_io_tailq_t io_submitted; 286 287 /* 288 * List of spdk_bdev_io that are currently queued because they write to a locked 289 * LBA range. 
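	 *
	 * Hedged sketch of the flow: while a range is held via
	 * bdev_lock_lba_range() (declared later in this file), a write that
	 * overlaps it is parked on this list instead of being submitted, and it
	 * is resubmitted once bdev_unlock_lba_range() releases the range.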
290 */ 291 bdev_io_tailq_t io_locked; 292 293 /* List of I/Os with accel sequence being currently executed */ 294 bdev_io_tailq_t io_accel_exec; 295 296 /* List of I/Os doing memory domain pull/push */ 297 bdev_io_tailq_t io_memory_domain; 298 299 uint32_t flags; 300 301 struct spdk_histogram_data *histogram; 302 303 #ifdef SPDK_CONFIG_VTUNE 304 uint64_t start_tsc; 305 uint64_t interval_tsc; 306 __itt_string_handle *handle; 307 struct spdk_bdev_io_stat *prev_stat; 308 #endif 309 310 bdev_io_tailq_t queued_resets; 311 312 lba_range_tailq_t locked_ranges; 313 }; 314 315 struct media_event_entry { 316 struct spdk_bdev_media_event event; 317 TAILQ_ENTRY(media_event_entry) tailq; 318 }; 319 320 #define MEDIA_EVENT_POOL_SIZE 64 321 322 struct spdk_bdev_desc { 323 struct spdk_bdev *bdev; 324 struct spdk_thread *thread; 325 struct { 326 spdk_bdev_event_cb_t event_fn; 327 void *ctx; 328 } callback; 329 bool closed; 330 bool write; 331 bool memory_domains_supported; 332 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 333 struct spdk_spinlock spinlock; 334 uint32_t refs; 335 TAILQ_HEAD(, media_event_entry) pending_media_events; 336 TAILQ_HEAD(, media_event_entry) free_media_events; 337 struct media_event_entry *media_events_buffer; 338 TAILQ_ENTRY(spdk_bdev_desc) link; 339 340 uint64_t timeout_in_sec; 341 spdk_bdev_io_timeout_cb cb_fn; 342 void *cb_arg; 343 struct spdk_poller *io_timeout_poller; 344 struct spdk_bdev_module_claim *claim; 345 }; 346 347 struct spdk_bdev_iostat_ctx { 348 struct spdk_bdev_io_stat *stat; 349 spdk_bdev_get_device_stat_cb cb; 350 void *cb_arg; 351 }; 352 353 struct set_qos_limit_ctx { 354 void (*cb_fn)(void *cb_arg, int status); 355 void *cb_arg; 356 struct spdk_bdev *bdev; 357 }; 358 359 struct spdk_bdev_channel_iter { 360 spdk_bdev_for_each_channel_msg fn; 361 spdk_bdev_for_each_channel_done cpl; 362 struct spdk_io_channel_iter *i; 363 void *ctx; 364 }; 365 366 struct spdk_bdev_io_error_stat { 367 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 368 }; 369 370 enum bdev_io_retry_state { 371 BDEV_IO_RETRY_STATE_INVALID, 372 BDEV_IO_RETRY_STATE_PULL, 373 BDEV_IO_RETRY_STATE_PULL_MD, 374 BDEV_IO_RETRY_STATE_SUBMIT, 375 BDEV_IO_RETRY_STATE_PUSH, 376 BDEV_IO_RETRY_STATE_PUSH_MD, 377 }; 378 379 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 380 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 381 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 382 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 383 384 static inline void bdev_io_complete(void *ctx); 385 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 386 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io); 387 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io); 388 389 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 390 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io); 391 392 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 393 struct spdk_io_channel *ch, void *_ctx); 394 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 395 396 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 397 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 398 uint64_t num_blocks, 399 struct spdk_memory_domain *domain, void *domain_ctx, 400 struct 
		spdk_accel_sequence *seq,
		spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

#define bdev_get_ext_io_opt(opts, field, defval) \
	(((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \
	  sizeof((opts)->field) <= (opts)->size) ? (opts)->field : (defval))

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);

	/* Do not remove this statement: always update it when adding a new field,
	 * and do not forget to add the SET_FIELD statement for the added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
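	 *
	 * Worked example (illustrative numbers): with bdev_io_cache_size = 256 and
	 * three SPDK threads, the check below requires
	 *
	 *   bdev_io_pool_size >= 256 * (3 + 1) = 1024
	 *
	 * so every per-thread cache, plus the transient extra mgmt_ch, can be
	 * fully populated from the pool.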
480 */ 481 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 482 if (opts->bdev_io_pool_size < min_pool_size) { 483 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 484 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 485 spdk_thread_get_count()); 486 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 487 return -1; 488 } 489 490 #define SET_FIELD(field) \ 491 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 492 g_bdev_opts.field = opts->field; \ 493 } \ 494 495 SET_FIELD(bdev_io_pool_size); 496 SET_FIELD(bdev_io_cache_size); 497 SET_FIELD(bdev_auto_examine); 498 499 g_bdev_opts.opts_size = opts->opts_size; 500 501 #undef SET_FIELD 502 503 return 0; 504 } 505 506 static struct spdk_bdev * 507 bdev_get_by_name(const char *bdev_name) 508 { 509 struct spdk_bdev_name find; 510 struct spdk_bdev_name *res; 511 512 find.name = (char *)bdev_name; 513 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 514 if (res != NULL) { 515 return res->bdev; 516 } 517 518 return NULL; 519 } 520 521 struct spdk_bdev * 522 spdk_bdev_get_by_name(const char *bdev_name) 523 { 524 struct spdk_bdev *bdev; 525 526 spdk_spin_lock(&g_bdev_mgr.spinlock); 527 bdev = bdev_get_by_name(bdev_name); 528 spdk_spin_unlock(&g_bdev_mgr.spinlock); 529 530 return bdev; 531 } 532 533 struct bdev_io_status_string { 534 enum spdk_bdev_io_status status; 535 const char *str; 536 }; 537 538 static const struct bdev_io_status_string bdev_io_status_strings[] = { 539 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 540 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 541 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 542 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 543 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 544 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 545 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 546 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 547 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 548 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 549 }; 550 551 static const char * 552 bdev_io_status_get_string(enum spdk_bdev_io_status status) 553 { 554 uint32_t i; 555 556 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 557 if (bdev_io_status_strings[i].status == status) { 558 return bdev_io_status_strings[i].str; 559 } 560 } 561 562 return "reserved"; 563 } 564 565 struct spdk_bdev_wait_for_examine_ctx { 566 struct spdk_poller *poller; 567 spdk_bdev_wait_for_examine_cb cb_fn; 568 void *cb_arg; 569 }; 570 571 static bool bdev_module_all_actions_completed(void); 572 573 static int 574 bdev_wait_for_examine_cb(void *arg) 575 { 576 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 577 578 if (!bdev_module_all_actions_completed()) { 579 return SPDK_POLLER_IDLE; 580 } 581 582 spdk_poller_unregister(&ctx->poller); 583 ctx->cb_fn(ctx->cb_arg); 584 free(ctx); 585 586 return SPDK_POLLER_BUSY; 587 } 588 589 int 590 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 591 { 592 struct spdk_bdev_wait_for_examine_ctx *ctx; 593 594 ctx = calloc(1, sizeof(*ctx)); 595 if (ctx == NULL) { 596 return -ENOMEM; 597 } 598 ctx->cb_fn = cb_fn; 599 ctx->cb_arg = cb_arg; 600 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 601 602 return 0; 603 } 604 605 struct spdk_bdev_examine_item { 606 char *name; 607 TAILQ_ENTRY(spdk_bdev_examine_item) link; 608 }; 609 610 TAILQ_HEAD(spdk_bdev_examine_allowlist, 
spdk_bdev_examine_item); 611 612 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 613 g_bdev_examine_allowlist); 614 615 static inline bool 616 bdev_examine_allowlist_check(const char *name) 617 { 618 struct spdk_bdev_examine_item *item; 619 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 620 if (strcmp(name, item->name) == 0) { 621 return true; 622 } 623 } 624 return false; 625 } 626 627 static inline void 628 bdev_examine_allowlist_free(void) 629 { 630 struct spdk_bdev_examine_item *item; 631 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 632 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 633 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 634 free(item->name); 635 free(item); 636 } 637 } 638 639 static inline bool 640 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 641 { 642 struct spdk_bdev_alias *tmp; 643 if (bdev_examine_allowlist_check(bdev->name)) { 644 return true; 645 } 646 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 647 if (bdev_examine_allowlist_check(tmp->alias.name)) { 648 return true; 649 } 650 } 651 return false; 652 } 653 654 static inline bool 655 bdev_ok_to_examine(struct spdk_bdev *bdev) 656 { 657 if (g_bdev_opts.bdev_auto_examine) { 658 return true; 659 } else { 660 return bdev_in_examine_allowlist(bdev); 661 } 662 } 663 664 static void 665 bdev_examine(struct spdk_bdev *bdev) 666 { 667 struct spdk_bdev_module *module; 668 struct spdk_bdev_module_claim *claim, *tmpclaim; 669 uint32_t action; 670 671 if (!bdev_ok_to_examine(bdev)) { 672 return; 673 } 674 675 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 676 if (module->examine_config) { 677 spdk_spin_lock(&module->internal.spinlock); 678 action = module->internal.action_in_progress; 679 module->internal.action_in_progress++; 680 spdk_spin_unlock(&module->internal.spinlock); 681 module->examine_config(bdev); 682 if (action != module->internal.action_in_progress) { 683 SPDK_ERRLOG("examine_config for module %s did not call " 684 "spdk_bdev_module_examine_done()\n", module->name); 685 } 686 } 687 } 688 689 spdk_spin_lock(&bdev->internal.spinlock); 690 691 switch (bdev->internal.claim_type) { 692 case SPDK_BDEV_CLAIM_NONE: 693 /* Examine by all bdev modules */ 694 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 695 if (module->examine_disk) { 696 spdk_spin_lock(&module->internal.spinlock); 697 module->internal.action_in_progress++; 698 spdk_spin_unlock(&module->internal.spinlock); 699 spdk_spin_unlock(&bdev->internal.spinlock); 700 module->examine_disk(bdev); 701 spdk_spin_lock(&bdev->internal.spinlock); 702 } 703 } 704 break; 705 case SPDK_BDEV_CLAIM_EXCL_WRITE: 706 /* Examine by the one bdev module with a v1 claim */ 707 module = bdev->internal.claim.v1.module; 708 if (module->examine_disk) { 709 spdk_spin_lock(&module->internal.spinlock); 710 module->internal.action_in_progress++; 711 spdk_spin_unlock(&module->internal.spinlock); 712 spdk_spin_unlock(&bdev->internal.spinlock); 713 module->examine_disk(bdev); 714 return; 715 } 716 break; 717 default: 718 /* Examine by all bdev modules with a v2 claim */ 719 assert(claim_type_is_v2(bdev->internal.claim_type)); 720 /* 721 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 722 * list, perhaps accessing freed memory. Without protection, this could happen 723 * while the lock is dropped during the examine callback. 
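	 *
	 * Concretely (hedged restatement): examine_in_progress, incremented just
	 * below, defers the TAILQ_REMOVE() and free() of any claim released inside
	 * examine_disk() until this iteration finishes:
	 *
	 *   examine_in_progress++        released claims stay linked for now
	 *   ... iterate claims, drop the bdev spinlock, call examine_disk() ...
	 *   examine_in_progress--        claims whose desc == NULL are reaped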
724 */ 725 bdev->internal.examine_in_progress++; 726 727 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 728 module = claim->module; 729 730 if (module == NULL) { 731 /* This is a vestigial claim, held by examine_count */ 732 continue; 733 } 734 735 if (module->examine_disk == NULL) { 736 continue; 737 } 738 739 spdk_spin_lock(&module->internal.spinlock); 740 module->internal.action_in_progress++; 741 spdk_spin_unlock(&module->internal.spinlock); 742 743 /* Call examine_disk without holding internal.spinlock. */ 744 spdk_spin_unlock(&bdev->internal.spinlock); 745 module->examine_disk(bdev); 746 spdk_spin_lock(&bdev->internal.spinlock); 747 } 748 749 assert(bdev->internal.examine_in_progress > 0); 750 bdev->internal.examine_in_progress--; 751 if (bdev->internal.examine_in_progress == 0) { 752 /* Remove any claims that were released during examine_disk */ 753 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 754 if (claim->desc != NULL) { 755 continue; 756 } 757 758 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 759 free(claim); 760 } 761 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 762 claim_reset(bdev); 763 } 764 } 765 } 766 767 spdk_spin_unlock(&bdev->internal.spinlock); 768 } 769 770 int 771 spdk_bdev_examine(const char *name) 772 { 773 struct spdk_bdev *bdev; 774 struct spdk_bdev_examine_item *item; 775 struct spdk_thread *thread = spdk_get_thread(); 776 777 if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) { 778 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 779 thread ? spdk_thread_get_name(thread) : "null"); 780 return -EINVAL; 781 } 782 783 if (g_bdev_opts.bdev_auto_examine) { 784 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 785 return -EINVAL; 786 } 787 788 if (bdev_examine_allowlist_check(name)) { 789 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 790 return -EEXIST; 791 } 792 793 item = calloc(1, sizeof(*item)); 794 if (!item) { 795 return -ENOMEM; 796 } 797 item->name = strdup(name); 798 if (!item->name) { 799 free(item); 800 return -ENOMEM; 801 } 802 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 803 804 bdev = spdk_bdev_get_by_name(name); 805 if (bdev) { 806 bdev_examine(bdev); 807 } 808 return 0; 809 } 810 811 static inline void 812 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 813 { 814 struct spdk_bdev_examine_item *item; 815 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 816 spdk_json_write_object_begin(w); 817 spdk_json_write_named_string(w, "method", "bdev_examine"); 818 spdk_json_write_named_object_begin(w, "params"); 819 spdk_json_write_named_string(w, "name", item->name); 820 spdk_json_write_object_end(w); 821 spdk_json_write_object_end(w); 822 } 823 } 824 825 struct spdk_bdev * 826 spdk_bdev_first(void) 827 { 828 struct spdk_bdev *bdev; 829 830 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 831 if (bdev) { 832 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 833 } 834 835 return bdev; 836 } 837 838 struct spdk_bdev * 839 spdk_bdev_next(struct spdk_bdev *prev) 840 { 841 struct spdk_bdev *bdev; 842 843 bdev = TAILQ_NEXT(prev, internal.link); 844 if (bdev) { 845 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 846 } 847 848 return bdev; 849 } 850 851 static struct spdk_bdev * 852 _bdev_next_leaf(struct spdk_bdev *bdev) 853 { 854 while (bdev != NULL) { 855 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 856 return bdev; 857 } else { 858 bdev = 
TAILQ_NEXT(bdev, internal.link); 859 } 860 } 861 862 return bdev; 863 } 864 865 struct spdk_bdev * 866 spdk_bdev_first_leaf(void) 867 { 868 struct spdk_bdev *bdev; 869 870 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 871 872 if (bdev) { 873 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 874 } 875 876 return bdev; 877 } 878 879 struct spdk_bdev * 880 spdk_bdev_next_leaf(struct spdk_bdev *prev) 881 { 882 struct spdk_bdev *bdev; 883 884 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 885 886 if (bdev) { 887 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 888 } 889 890 return bdev; 891 } 892 893 static inline bool 894 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 895 { 896 return bdev_io->internal.memory_domain; 897 } 898 899 static inline bool 900 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 901 { 902 return bdev_io->internal.has_accel_sequence; 903 } 904 905 static inline void 906 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource, 907 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 908 { 909 /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io. 910 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth 911 * channels we will instead wait for half to complete. 912 */ 913 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 914 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 915 916 assert(state != BDEV_IO_RETRY_STATE_INVALID); 917 bdev_io->internal.retry_state = state; 918 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 919 } 920 921 static inline void 922 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource, 923 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 924 { 925 /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while 926 * the queue isn't empty, so we don't need to update the nomem_threshold here */ 927 assert(!TAILQ_EMPTY(&shared_resource->nomem_io)); 928 929 assert(state != BDEV_IO_RETRY_STATE_INVALID); 930 bdev_io->internal.retry_state = state; 931 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 932 } 933 934 void 935 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 936 { 937 struct iovec *iovs; 938 939 if (bdev_io->u.bdev.iovs == NULL) { 940 bdev_io->u.bdev.iovs = &bdev_io->iov; 941 bdev_io->u.bdev.iovcnt = 1; 942 } 943 944 iovs = bdev_io->u.bdev.iovs; 945 946 assert(iovs != NULL); 947 assert(bdev_io->u.bdev.iovcnt >= 1); 948 949 iovs[0].iov_base = buf; 950 iovs[0].iov_len = len; 951 } 952 953 void 954 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 955 { 956 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 957 bdev_io->u.bdev.md_buf = md_buf; 958 } 959 960 static bool 961 _is_buf_allocated(const struct iovec *iovs) 962 { 963 if (iovs == NULL) { 964 return false; 965 } 966 967 return iovs[0].iov_base != NULL; 968 } 969 970 static bool 971 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 972 { 973 int i; 974 uintptr_t iov_base; 975 976 if (spdk_likely(alignment == 1)) { 977 return true; 978 } 979 980 for (i = 0; i < iovcnt; i++) { 981 iov_base = (uintptr_t)iovs[i].iov_base; 982 if ((iov_base & (alignment - 1)) != 0) { 983 return false; 984 } 985 } 986 987 return true; 988 } 989 990 static inline bool 991 
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 992 { 993 if (!bdev_io->internal.accel_sequence) { 994 return false; 995 } 996 997 /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if 998 * bdev module didn't support accel sequences */ 999 return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split; 1000 } 1001 1002 static inline void 1003 bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch, 1004 struct spdk_bdev_shared_resource *shared_resource) 1005 { 1006 bdev_ch->io_outstanding++; 1007 shared_resource->io_outstanding++; 1008 } 1009 1010 static inline void 1011 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1012 struct spdk_bdev_shared_resource *shared_resource) 1013 { 1014 assert(bdev_ch->io_outstanding > 0); 1015 assert(shared_resource->io_outstanding > 0); 1016 bdev_ch->io_outstanding--; 1017 shared_resource->io_outstanding--; 1018 } 1019 1020 static void 1021 bdev_io_submit_sequence_cb(void *ctx, int status) 1022 { 1023 struct spdk_bdev_io *bdev_io = ctx; 1024 1025 bdev_io->u.bdev.accel_sequence = NULL; 1026 bdev_io->internal.accel_sequence = NULL; 1027 1028 if (spdk_unlikely(status != 0)) { 1029 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 1030 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1031 bdev_io_complete_unsubmitted(bdev_io); 1032 return; 1033 } 1034 1035 bdev_io_submit(bdev_io); 1036 } 1037 1038 static void 1039 bdev_io_exec_sequence_cb(void *ctx, int status) 1040 { 1041 struct spdk_bdev_io *bdev_io = ctx; 1042 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1043 1044 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1045 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1046 1047 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1048 bdev_ch_retry_io(ch); 1049 } 1050 1051 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1052 } 1053 1054 static void 1055 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status)) 1056 { 1057 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1058 1059 assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1060 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1061 1062 /* Since the operations are appended during submission, they're in the opposite order than 1063 * how we want to execute them for reads (i.e. we need to execute the most recently added 1064 * operation first), so reverse the sequence before executing it. 
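	 *
	 * Tiny example: if submission appended op1 and then op2, a read must run
	 * them as op2 -> op1 (the most recently appended operation consumes the
	 * just-read data first), which is what spdk_accel_sequence_reverse()
	 * below arranges.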
1065 */ 1066 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1067 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1068 } 1069 1070 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1071 bdev_io_increment_outstanding(ch, ch->shared_resource); 1072 bdev_io->internal.data_transfer_cpl = cb_fn; 1073 1074 spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, 1075 bdev_io_exec_sequence_cb, bdev_io); 1076 } 1077 1078 static void 1079 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1080 { 1081 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1082 void *buf; 1083 1084 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1085 buf = bdev_io->internal.buf; 1086 bdev_io->internal.buf = NULL; 1087 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1088 bdev_io->internal.get_aux_buf_cb = NULL; 1089 } else { 1090 assert(bdev_io->internal.get_buf_cb != NULL); 1091 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1092 bdev_io->internal.get_buf_cb = NULL; 1093 } 1094 } 1095 1096 static void 1097 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1098 { 1099 struct spdk_bdev_io *bdev_io = ctx; 1100 1101 if (rc) { 1102 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1103 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1104 } 1105 bdev_io_get_buf_complete(bdev_io, !rc); 1106 } 1107 1108 static void 1109 bdev_io_pull_md_buf_done(void *ctx, int status) 1110 { 1111 struct spdk_bdev_io *bdev_io = ctx; 1112 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1113 1114 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1115 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1116 1117 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1118 bdev_ch_retry_io(ch); 1119 } 1120 1121 assert(bdev_io->internal.data_transfer_cpl); 1122 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1123 } 1124 1125 static void 1126 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io) 1127 { 1128 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1129 int rc = 0; 1130 1131 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1132 if (bdev_io_use_memory_domain(bdev_io)) { 1133 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1134 bdev_io_increment_outstanding(ch, ch->shared_resource); 1135 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1136 bdev_io->internal.memory_domain_ctx, 1137 &bdev_io->internal.orig_md_iov, 1, 1138 &bdev_io->internal.bounce_md_iov, 1, 1139 bdev_io_pull_md_buf_done, bdev_io); 1140 if (rc == 0) { 1141 /* Continue to submit IO in completion callback */ 1142 return; 1143 } 1144 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1145 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1146 if (rc != -ENOMEM) { 1147 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1148 spdk_memory_domain_get_dma_device_id( 1149 bdev_io->internal.memory_domain), rc); 1150 } 1151 } else { 1152 memcpy(bdev_io->internal.bounce_md_iov.iov_base, 1153 bdev_io->internal.orig_md_iov.iov_base, 1154 bdev_io->internal.orig_md_iov.iov_len); 1155 } 1156 } 1157 1158 if (spdk_unlikely(rc == -ENOMEM)) { 1159 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD); 1160 } else { 1161 assert(bdev_io->internal.data_transfer_cpl); 1162 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1163 } 1164 } 1165 1166 static void 1167 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1168 { 1169 /* save 
original md_buf */ 1170 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1171 bdev_io->internal.orig_md_iov.iov_len = len; 1172 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 1173 bdev_io->internal.bounce_md_iov.iov_len = len; 1174 /* set bounce md_buf */ 1175 bdev_io->u.bdev.md_buf = md_buf; 1176 1177 bdev_io_pull_md_buf(bdev_io); 1178 } 1179 1180 static void 1181 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1182 { 1183 struct spdk_bdev *bdev = bdev_io->bdev; 1184 uint64_t md_len; 1185 void *buf; 1186 1187 if (spdk_bdev_is_md_separate(bdev)) { 1188 assert(!bdev_io_use_accel_sequence(bdev_io)); 1189 1190 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1191 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1192 1193 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1194 1195 if (bdev_io->u.bdev.md_buf != NULL) { 1196 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1197 return; 1198 } else { 1199 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1200 } 1201 } 1202 1203 bdev_io_get_buf_complete(bdev_io, true); 1204 } 1205 1206 static inline void 1207 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc) 1208 { 1209 if (rc) { 1210 SPDK_ERRLOG("Failed to get data buffer\n"); 1211 assert(bdev_io->internal.data_transfer_cpl); 1212 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1213 return; 1214 } 1215 1216 _bdev_io_set_md_buf(bdev_io); 1217 } 1218 1219 static void 1220 bdev_io_pull_data_done_and_track(void *ctx, int status) 1221 { 1222 struct spdk_bdev_io *bdev_io = ctx; 1223 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1224 1225 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1226 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1227 1228 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1229 bdev_ch_retry_io(ch); 1230 } 1231 1232 bdev_io_pull_data_done(bdev_io, status); 1233 } 1234 1235 static void 1236 bdev_io_pull_data(struct spdk_bdev_io *bdev_io) 1237 { 1238 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1239 int rc = 0; 1240 1241 /* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a 1242 * sequence, append a copy operation making accel change the src/dst buffers of the previous 1243 * operation */ 1244 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) || 1245 (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) { 1246 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1247 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1248 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1249 NULL, NULL, 1250 bdev_io->internal.orig_iovs, 1251 bdev_io->internal.orig_iovcnt, 1252 bdev_io->internal.memory_domain, 1253 bdev_io->internal.memory_domain_ctx, 1254 0, NULL, NULL); 1255 } else { 1256 /* We need to reverse the src/dst for reads */ 1257 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1258 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1259 bdev_io->internal.orig_iovs, 1260 bdev_io->internal.orig_iovcnt, 1261 bdev_io->internal.memory_domain, 1262 bdev_io->internal.memory_domain_ctx, 1263 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1264 NULL, NULL, 0, NULL, NULL); 1265 } 1266 1267 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 1268 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1269 bdev_io->internal.accel_sequence); 1270 } 1271 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1272 /* if this is write path, 
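		 * Both branches below perform that copy; the only difference
		 * (hedged summary) is the mechanism: spdk_memory_domain_pull_data()
		 * does it asynchronously when the payload lives in a memory domain,
		 * while spdk_copy_iovs_to_buf() is a plain synchronous copy into
		 * the single bounce iovec.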
copy data from original buffer to bounce buffer */ 1273 if (bdev_io_use_memory_domain(bdev_io)) { 1274 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1275 bdev_io_increment_outstanding(ch, ch->shared_resource); 1276 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1277 bdev_io->internal.memory_domain_ctx, 1278 bdev_io->internal.orig_iovs, 1279 (uint32_t) bdev_io->internal.orig_iovcnt, 1280 bdev_io->u.bdev.iovs, 1, 1281 bdev_io_pull_data_done_and_track, 1282 bdev_io); 1283 if (rc == 0) { 1284 /* Continue to submit IO in completion callback */ 1285 return; 1286 } 1287 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1288 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1289 if (rc != -ENOMEM) { 1290 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1291 spdk_memory_domain_get_dma_device_id( 1292 bdev_io->internal.memory_domain)); 1293 } 1294 } else { 1295 assert(bdev_io->u.bdev.iovcnt == 1); 1296 spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base, 1297 bdev_io->u.bdev.iovs[0].iov_len, 1298 bdev_io->internal.orig_iovs, 1299 bdev_io->internal.orig_iovcnt); 1300 } 1301 } 1302 1303 if (spdk_unlikely(rc == -ENOMEM)) { 1304 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1305 } else { 1306 bdev_io_pull_data_done(bdev_io, rc); 1307 } 1308 } 1309 1310 static void 1311 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1312 bdev_copy_bounce_buffer_cpl cpl_cb) 1313 { 1314 struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource; 1315 1316 bdev_io->internal.data_transfer_cpl = cpl_cb; 1317 /* save original iovec */ 1318 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 1319 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1320 /* set bounce iov */ 1321 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 1322 bdev_io->u.bdev.iovcnt = 1; 1323 /* set bounce buffer for this operation */ 1324 bdev_io->u.bdev.iovs[0].iov_base = buf; 1325 bdev_io->u.bdev.iovs[0].iov_len = len; 1326 1327 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1328 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1329 } else { 1330 bdev_io_pull_data(bdev_io); 1331 } 1332 } 1333 1334 static void 1335 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1336 { 1337 struct spdk_bdev *bdev = bdev_io->bdev; 1338 bool buf_allocated; 1339 uint64_t alignment; 1340 void *aligned_buf; 1341 1342 bdev_io->internal.buf = buf; 1343 1344 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1345 bdev_io_get_buf_complete(bdev_io, true); 1346 return; 1347 } 1348 1349 alignment = spdk_bdev_get_buf_align(bdev); 1350 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1351 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1352 1353 if (buf_allocated) { 1354 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1355 /* Continue in completion callback */ 1356 return; 1357 } else { 1358 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1359 } 1360 1361 _bdev_io_set_md_buf(bdev_io); 1362 } 1363 1364 static inline uint64_t 1365 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1366 { 1367 struct spdk_bdev *bdev = bdev_io->bdev; 1368 uint64_t md_len, alignment; 1369 1370 md_len = spdk_bdev_is_md_separate(bdev) ? 
bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1371 1372 /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */ 1373 alignment = spdk_bdev_get_buf_align(bdev) - 1; 1374 1375 return len + alignment + md_len; 1376 } 1377 1378 static void 1379 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1380 { 1381 struct spdk_bdev_mgmt_channel *ch; 1382 1383 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1384 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1385 } 1386 1387 static void 1388 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1389 { 1390 assert(bdev_io->internal.buf != NULL); 1391 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1392 bdev_io->internal.buf = NULL; 1393 } 1394 1395 void 1396 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1397 { 1398 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1399 1400 assert(buf != NULL); 1401 _bdev_io_put_buf(bdev_io, buf, len); 1402 } 1403 1404 static inline void 1405 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, 1406 struct spdk_bdev_io *bdev_io) 1407 { 1408 /* After a request is submitted to a bdev module, the ownership of an accel sequence 1409 * associated with that bdev_io is transferred to the bdev module. So, clear the internal 1410 * sequence pointer to make sure we won't touch it anymore. */ 1411 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || 1412 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { 1413 assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1414 bdev_io->internal.accel_sequence = NULL; 1415 } 1416 1417 bdev->fn_table->submit_request(ioch, bdev_io); 1418 } 1419 1420 static inline void 1421 bdev_ch_resubmit_io(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 1422 { 1423 struct spdk_bdev *bdev = bdev_ch->bdev; 1424 1425 bdev_io_increment_outstanding(bdev_io->internal.ch, bdev_ch->shared_resource); 1426 bdev_io->internal.error.nvme.cdw0 = 0; 1427 bdev_io->num_retries++; 1428 bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1429 } 1430 1431 static void 1432 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1433 { 1434 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1435 struct spdk_bdev_io *bdev_io; 1436 1437 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1438 /* 1439 * Allow some more I/O to complete before retrying the nomem_io queue. 1440 * Some drivers (such as nvme) cannot immediately take a new I/O in 1441 * the context of a completion, because the resources for the I/O are 1442 * not released until control returns to the bdev poller. Also, we 1443 * may require several small I/O to complete before a larger I/O 1444 * (that requires splitting) can be submitted. 
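	 *
	 * The threshold itself is set in bdev_queue_nomem_io_head() above; as a
	 * worked example, with 100 I/Os outstanding and NOMEM_THRESHOLD_COUNT = 8,
	 *
	 *   nomem_threshold = spdk_max(100 / 2, 100 - 8) = 92,
	 *
	 * i.e. retries begin only after at least 8 outstanding I/Os (or half of
	 * them, for shallow queues) have completed.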
1445 */ 1446 return; 1447 } 1448 1449 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1450 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1451 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1452 1453 switch (bdev_io->internal.retry_state) { 1454 case BDEV_IO_RETRY_STATE_SUBMIT: 1455 bdev_ch_resubmit_io(bdev_ch, bdev_io); 1456 break; 1457 case BDEV_IO_RETRY_STATE_PULL: 1458 bdev_io_pull_data(bdev_io); 1459 break; 1460 case BDEV_IO_RETRY_STATE_PULL_MD: 1461 bdev_io_pull_md_buf(bdev_io); 1462 break; 1463 case BDEV_IO_RETRY_STATE_PUSH: 1464 bdev_io_push_bounce_data(bdev_io); 1465 break; 1466 case BDEV_IO_RETRY_STATE_PUSH_MD: 1467 bdev_io_push_bounce_md_buf(bdev_io); 1468 break; 1469 default: 1470 assert(0 && "invalid retry state"); 1471 break; 1472 } 1473 1474 if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) { 1475 /* This IO completed again with NOMEM status, so break the loop and 1476 * don't try anymore. Note that a bdev_io that fails with NOMEM 1477 * always gets requeued at the front of the list, to maintain 1478 * ordering. 1479 */ 1480 break; 1481 } 1482 } 1483 } 1484 1485 static inline bool 1486 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 1487 { 1488 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1489 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1490 1491 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1492 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1493 bdev_queue_nomem_io_head(shared_resource, bdev_io, state); 1494 1495 /* If bdev module completed an I/O that has an accel sequence with NOMEM status, the 1496 * ownership of that sequence is transferred back to the bdev layer, so we need to 1497 * restore internal.accel_sequence to make sure that the sequence is handled 1498 * correctly in case the I/O is later aborted. */ 1499 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1500 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1501 assert(bdev_io->internal.accel_sequence == NULL); 1502 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1503 } 1504 1505 return true; 1506 } 1507 1508 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1509 bdev_ch_retry_io(bdev_ch); 1510 } 1511 1512 return false; 1513 } 1514 1515 static void 1516 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1517 { 1518 struct spdk_bdev_io *bdev_io = ctx; 1519 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1520 1521 if (rc) { 1522 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1523 } 1524 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1525 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 
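	 * A likely side benefit (hedged reading): returning the buffer to the
	 * iobuf cache before the user completion callback runs lets any I/O
	 * waiting in spdk_iobuf_get() be satisfied that much sooner.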
1526 */ 1527 bdev_io_put_buf(bdev_io); 1528 1529 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1530 bdev_ch_retry_io(ch); 1531 } 1532 1533 /* Continue with IO completion flow */ 1534 bdev_io_complete(bdev_io); 1535 } 1536 1537 static void 1538 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1539 { 1540 struct spdk_bdev_io *bdev_io = ctx; 1541 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1542 1543 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1544 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1545 1546 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1547 bdev_ch_retry_io(ch); 1548 } 1549 1550 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1551 } 1552 1553 static inline void 1554 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1555 { 1556 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1557 int rc = 0; 1558 1559 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1560 /* do the same for metadata buffer */ 1561 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1562 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1563 1564 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1565 if (bdev_io_use_memory_domain(bdev_io)) { 1566 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1567 bdev_io_increment_outstanding(ch, ch->shared_resource); 1568 /* If memory domain is used then we need to call async push function */ 1569 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1570 bdev_io->internal.memory_domain_ctx, 1571 &bdev_io->internal.orig_md_iov, 1572 (uint32_t)bdev_io->internal.orig_iovcnt, 1573 &bdev_io->internal.bounce_md_iov, 1, 1574 bdev_io_push_bounce_md_buf_done, 1575 bdev_io); 1576 if (rc == 0) { 1577 /* Continue IO completion in async callback */ 1578 return; 1579 } 1580 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1581 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1582 if (rc != -ENOMEM) { 1583 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1584 spdk_memory_domain_get_dma_device_id( 1585 bdev_io->internal.memory_domain)); 1586 } 1587 } else { 1588 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1589 bdev_io->internal.orig_md_iov.iov_len); 1590 } 1591 } 1592 } 1593 1594 if (spdk_unlikely(rc == -ENOMEM)) { 1595 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1596 } else { 1597 assert(bdev_io->internal.data_transfer_cpl); 1598 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1599 } 1600 } 1601 1602 static inline void 1603 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc) 1604 { 1605 assert(bdev_io->internal.data_transfer_cpl); 1606 if (rc) { 1607 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1608 return; 1609 } 1610 1611 /* set original buffer for this io */ 1612 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1613 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1614 /* disable bouncing buffer for this io */ 1615 bdev_io->internal.orig_iovcnt = 0; 1616 bdev_io->internal.orig_iovs = NULL; 1617 1618 bdev_io_push_bounce_md_buf(bdev_io); 1619 } 1620 1621 static void 1622 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1623 { 1624 struct spdk_bdev_io *bdev_io = ctx; 1625 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1626 1627 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1628 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1629 1630 if 
(spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1631 bdev_ch_retry_io(ch); 1632 } 1633 1634 bdev_io_push_bounce_data_done(bdev_io, status); 1635 } 1636 1637 static inline void 1638 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1639 { 1640 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1641 int rc = 0; 1642 1643 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1644 assert(!bdev_io_use_accel_sequence(bdev_io)); 1645 1646 /* if this is read path, copy data from bounce buffer to original buffer */ 1647 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1648 if (bdev_io_use_memory_domain(bdev_io)) { 1649 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1650 bdev_io_increment_outstanding(ch, ch->shared_resource); 1651 /* If memory domain is used then we need to call async push function */ 1652 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1653 bdev_io->internal.memory_domain_ctx, 1654 bdev_io->internal.orig_iovs, 1655 (uint32_t)bdev_io->internal.orig_iovcnt, 1656 &bdev_io->internal.bounce_iov, 1, 1657 bdev_io_push_bounce_data_done_and_track, 1658 bdev_io); 1659 if (rc == 0) { 1660 /* Continue IO completion in async callback */ 1661 return; 1662 } 1663 1664 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1665 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1666 if (rc != -ENOMEM) { 1667 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1668 spdk_memory_domain_get_dma_device_id( 1669 bdev_io->internal.memory_domain)); 1670 } 1671 } else { 1672 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1673 bdev_io->internal.orig_iovcnt, 1674 bdev_io->internal.bounce_iov.iov_base, 1675 bdev_io->internal.bounce_iov.iov_len); 1676 } 1677 } 1678 1679 if (spdk_unlikely(rc == -ENOMEM)) { 1680 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1681 } else { 1682 bdev_io_push_bounce_data_done(bdev_io, rc); 1683 } 1684 } 1685 1686 static inline void 1687 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1688 { 1689 bdev_io->internal.data_transfer_cpl = cpl_cb; 1690 bdev_io_push_bounce_data(bdev_io); 1691 } 1692 1693 static void 1694 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1695 { 1696 struct spdk_bdev_io *bdev_io; 1697 1698 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1699 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1700 } 1701 1702 static void 1703 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1704 { 1705 struct spdk_bdev_mgmt_channel *mgmt_ch; 1706 uint64_t max_len; 1707 void *buf; 1708 1709 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1710 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1711 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1712 1713 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1714 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1715 bdev_io_get_buf_complete(bdev_io, false); 1716 return; 1717 } 1718 1719 bdev_io->internal.buf_len = len; 1720 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1721 bdev_io_get_iobuf_cb); 1722 if (buf != NULL) { 1723 _bdev_io_set_buf(bdev_io, buf, len); 1724 } 1725 } 1726 1727 void 1728 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1729 { 1730 struct spdk_bdev *bdev = bdev_io->bdev; 1731 uint64_t alignment; 1732 1733 assert(cb != NULL); 1734 bdev_io->internal.get_buf_cb 
= cb; 1735 1736 alignment = spdk_bdev_get_buf_align(bdev); 1737 1738 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1739 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1740 /* Buffer already present and aligned */ 1741 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1742 return; 1743 } 1744 1745 bdev_io_get_buf(bdev_io, len); 1746 } 1747 1748 static void 1749 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1750 bool success) 1751 { 1752 if (!success) { 1753 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1754 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1755 bdev_io_complete_unsubmitted(bdev_io); 1756 return; 1757 } 1758 1759 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1760 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1761 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1762 return; 1763 } 1764 /* For reads we'll execute the sequence after the data is read, so, for now, only 1765 * clear out accel_sequence pointer and submit the IO */ 1766 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1767 bdev_io->u.bdev.accel_sequence = NULL; 1768 } 1769 1770 bdev_io_submit(bdev_io); 1771 } 1772 1773 static void 1774 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1775 uint64_t len) 1776 { 1777 assert(cb != NULL); 1778 bdev_io->internal.get_buf_cb = cb; 1779 1780 bdev_io_get_buf(bdev_io, len); 1781 } 1782 1783 void 1784 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1785 { 1786 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1787 1788 assert(cb != NULL); 1789 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1790 bdev_io->internal.get_aux_buf_cb = cb; 1791 bdev_io_get_buf(bdev_io, len); 1792 } 1793 1794 static int 1795 bdev_module_get_max_ctx_size(void) 1796 { 1797 struct spdk_bdev_module *bdev_module; 1798 int max_bdev_module_size = 0; 1799 1800 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1801 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1802 max_bdev_module_size = bdev_module->get_ctx_size(); 1803 } 1804 } 1805 1806 return max_bdev_module_size; 1807 } 1808 1809 static void 1810 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1811 { 1812 int i; 1813 struct spdk_bdev_qos *qos = bdev->internal.qos; 1814 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1815 1816 if (!qos) { 1817 return; 1818 } 1819 1820 spdk_bdev_get_qos_rate_limits(bdev, limits); 1821 1822 spdk_json_write_object_begin(w); 1823 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1824 1825 spdk_json_write_named_object_begin(w, "params"); 1826 spdk_json_write_named_string(w, "name", bdev->name); 1827 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1828 if (limits[i] > 0) { 1829 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1830 } 1831 } 1832 spdk_json_write_object_end(w); 1833 1834 spdk_json_write_object_end(w); 1835 } 1836 1837 void 1838 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1839 { 1840 struct spdk_bdev_module *bdev_module; 1841 struct spdk_bdev *bdev; 1842 1843 assert(w != NULL); 1844 1845 spdk_json_write_array_begin(w); 1846 1847 spdk_json_write_object_begin(w); 1848 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1849 spdk_json_write_named_object_begin(w, "params"); 1850 spdk_json_write_named_uint32(w, 
"bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1851 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1852 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1853 spdk_json_write_object_end(w); 1854 spdk_json_write_object_end(w); 1855 1856 bdev_examine_allowlist_config_json(w); 1857 1858 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1859 if (bdev_module->config_json) { 1860 bdev_module->config_json(w); 1861 } 1862 } 1863 1864 spdk_spin_lock(&g_bdev_mgr.spinlock); 1865 1866 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1867 if (bdev->fn_table->write_config_json) { 1868 bdev->fn_table->write_config_json(bdev, w); 1869 } 1870 1871 bdev_qos_config_json(bdev, w); 1872 } 1873 1874 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1875 1876 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1877 spdk_json_write_object_begin(w); 1878 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1879 spdk_json_write_object_end(w); 1880 1881 spdk_json_write_array_end(w); 1882 } 1883 1884 static void 1885 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1886 { 1887 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1888 struct spdk_bdev_io *bdev_io; 1889 1890 spdk_iobuf_channel_fini(&ch->iobuf); 1891 1892 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1893 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1894 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1895 ch->per_thread_cache_count--; 1896 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1897 } 1898 1899 assert(ch->per_thread_cache_count == 0); 1900 } 1901 1902 static int 1903 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1904 { 1905 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1906 struct spdk_bdev_io *bdev_io; 1907 uint32_t i; 1908 int rc; 1909 1910 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE); 1911 if (rc != 0) { 1912 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1913 return -1; 1914 } 1915 1916 STAILQ_INIT(&ch->per_thread_cache); 1917 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1918 1919 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1920 ch->per_thread_cache_count = 0; 1921 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1922 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1923 if (bdev_io == NULL) { 1924 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1925 assert(false); 1926 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1927 return -1; 1928 } 1929 ch->per_thread_cache_count++; 1930 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1931 } 1932 1933 TAILQ_INIT(&ch->shared_resources); 1934 TAILQ_INIT(&ch->io_wait_queue); 1935 1936 return 0; 1937 } 1938 1939 static void 1940 bdev_init_complete(int rc) 1941 { 1942 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1943 void *cb_arg = g_init_cb_arg; 1944 struct spdk_bdev_module *m; 1945 1946 g_bdev_mgr.init_complete = true; 1947 g_init_cb_fn = NULL; 1948 g_init_cb_arg = NULL; 1949 1950 /* 1951 * For modules that need to know when subsystem init is complete, 1952 * inform them now. 
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static bool
bdev_module_all_actions_completed(void)
{
	struct spdk_bdev_module *m;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return false;
		}
	}
	return true;
}

static void
bdev_module_action_complete(void)
{
	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	if (!bdev_module_all_actions_completed()) {
		return;
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
	 */
	bdev_init_complete(0);
}

static void
bdev_module_action_done(struct spdk_bdev_module *module)
{
	spdk_spin_lock(&module->internal.spinlock);
	assert(module->internal.action_in_progress > 0);
	module->internal.action_in_progress--;
	spdk_spin_unlock(&module->internal.spinlock);
	bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
	assert(module->async_init);
	bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	bdev_module_action_done(module);
}

/** The last initialized bdev module */
static struct spdk_bdev_module *g_resume_bdev_module = NULL;

static void
bdev_init_failed(void *cb_arg)
{
	struct spdk_bdev_module *module = cb_arg;

	spdk_spin_lock(&module->internal.spinlock);
	assert(module->internal.action_in_progress > 0);
	module->internal.action_in_progress--;
	spdk_spin_unlock(&module->internal.spinlock);
	bdev_init_complete(-1);
}

static int
bdev_modules_init(void)
{
	struct spdk_bdev_module *module;
	int rc = 0;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		g_resume_bdev_module = module;
		if (module->async_init) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress = 1;
			spdk_spin_unlock(&module->internal.spinlock);
		}
		rc = module->module_init();
		if (rc != 0) {
			/* Bump action_in_progress to prevent other modules from completing modules_init.
			 * Send a message to defer application shutdown until resources are cleaned up. */
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress = 1;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
			return rc;
		}
	}

	g_resume_bdev_module = NULL;
	return 0;
}

void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
{
	int rc = 0;
	char mempool_name[32];

	assert(cb_fn !=
NULL); 2081 2082 g_init_cb_fn = cb_fn; 2083 g_init_cb_arg = cb_arg; 2084 2085 spdk_notify_type_register("bdev_register"); 2086 spdk_notify_type_register("bdev_unregister"); 2087 2088 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2089 2090 rc = spdk_iobuf_register_module("bdev"); 2091 if (rc != 0) { 2092 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2093 bdev_init_complete(-1); 2094 return; 2095 } 2096 2097 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2098 g_bdev_opts.bdev_io_pool_size, 2099 sizeof(struct spdk_bdev_io) + 2100 bdev_module_get_max_ctx_size(), 2101 0, 2102 SPDK_ENV_SOCKET_ID_ANY); 2103 2104 if (g_bdev_mgr.bdev_io_pool == NULL) { 2105 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2106 bdev_init_complete(-1); 2107 return; 2108 } 2109 2110 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2111 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2112 if (!g_bdev_mgr.zero_buffer) { 2113 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2114 bdev_init_complete(-1); 2115 return; 2116 } 2117 2118 #ifdef SPDK_CONFIG_VTUNE 2119 SPDK_LOG_DEPRECATED(vtune_support); 2120 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2121 #endif 2122 2123 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2124 bdev_mgmt_channel_destroy, 2125 sizeof(struct spdk_bdev_mgmt_channel), 2126 "bdev_mgr"); 2127 2128 rc = bdev_modules_init(); 2129 g_bdev_mgr.module_init_complete = true; 2130 if (rc != 0) { 2131 SPDK_ERRLOG("bdev modules init failed\n"); 2132 return; 2133 } 2134 2135 bdev_module_action_complete(); 2136 } 2137 2138 static void 2139 bdev_mgr_unregister_cb(void *io_device) 2140 { 2141 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2142 2143 if (g_bdev_mgr.bdev_io_pool) { 2144 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2145 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2146 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2147 g_bdev_opts.bdev_io_pool_size); 2148 } 2149 2150 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2151 } 2152 2153 spdk_free(g_bdev_mgr.zero_buffer); 2154 2155 bdev_examine_allowlist_free(); 2156 2157 cb_fn(g_fini_cb_arg); 2158 g_fini_cb_fn = NULL; 2159 g_fini_cb_arg = NULL; 2160 g_bdev_mgr.init_complete = false; 2161 g_bdev_mgr.module_init_complete = false; 2162 } 2163 2164 static void 2165 bdev_module_fini_iter(void *arg) 2166 { 2167 struct spdk_bdev_module *bdev_module; 2168 2169 /* FIXME: Handling initialization failures is broken now, 2170 * so we won't even try cleaning up after successfully 2171 * initialized modules. if module_init_complete is false, 2172 * just call spdk_bdev_mgr_unregister_cb 2173 */ 2174 if (!g_bdev_mgr.module_init_complete) { 2175 bdev_mgr_unregister_cb(NULL); 2176 return; 2177 } 2178 2179 /* Start iterating from the last touched module */ 2180 if (!g_resume_bdev_module) { 2181 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2182 } else { 2183 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2184 internal.tailq); 2185 } 2186 2187 while (bdev_module) { 2188 if (bdev_module->async_fini) { 2189 /* Save our place so we can resume later. We must 2190 * save the variable here, before calling module_fini() 2191 * below, because in some cases the module may immediately 2192 * call spdk_bdev_module_fini_done() and re-enter 2193 * this function to continue iterating. 
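		 * A hedged sketch of the async_fini pattern this iteration supports
		 * (hypothetical names): the module defers its teardown and calls
		 * spdk_bdev_module_fini_done() from a callback, which re-enters this
		 * iterator on the fini thread.
		 *
		 *	static void example_teardown_done(void *ctx) { spdk_bdev_module_fini_done(); }
		 *	static void example_module_fini(void) { example_start_teardown(example_teardown_done, NULL); }
		 *	static struct spdk_bdev_module example_if = {
		 *		.module_fini = example_module_fini,
		 *		.async_fini = true,
		 *	};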
*/ 2194 g_resume_bdev_module = bdev_module; 2195 } 2196 2197 if (bdev_module->module_fini) { 2198 bdev_module->module_fini(); 2199 } 2200 2201 if (bdev_module->async_fini) { 2202 return; 2203 } 2204 2205 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2206 internal.tailq); 2207 } 2208 2209 g_resume_bdev_module = NULL; 2210 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2211 } 2212 2213 void 2214 spdk_bdev_module_fini_done(void) 2215 { 2216 if (spdk_get_thread() != g_fini_thread) { 2217 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2218 } else { 2219 bdev_module_fini_iter(NULL); 2220 } 2221 } 2222 2223 static void 2224 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2225 { 2226 struct spdk_bdev *bdev = cb_arg; 2227 2228 if (bdeverrno && bdev) { 2229 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2230 bdev->name); 2231 2232 /* 2233 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2234 * bdev; try to continue by manually removing this bdev from the list and continue 2235 * with the next bdev in the list. 2236 */ 2237 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2238 } 2239 2240 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2241 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2242 /* 2243 * Bdev module finish need to be deferred as we might be in the middle of some context 2244 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2245 * after returning. 2246 */ 2247 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2248 return; 2249 } 2250 2251 /* 2252 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2253 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2254 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2255 * base bdevs. 2256 * 2257 * Also, walk the list in the reverse order. 2258 */ 2259 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2260 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2261 spdk_spin_lock(&bdev->internal.spinlock); 2262 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2263 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2264 spdk_spin_unlock(&bdev->internal.spinlock); 2265 continue; 2266 } 2267 spdk_spin_unlock(&bdev->internal.spinlock); 2268 2269 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2270 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2271 return; 2272 } 2273 2274 /* 2275 * If any bdev fails to unclaim underlying bdev properly, we may face the 2276 * case of bdev list consisting of claimed bdevs only (if claims are managed 2277 * correctly, this would mean there's a loop in the claims graph which is 2278 * clearly impossible). Warn and unregister last bdev on the list then. 
2279 */ 2280 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2281 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2282 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2283 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2284 return; 2285 } 2286 } 2287 2288 static void 2289 bdev_module_fini_start_iter(void *arg) 2290 { 2291 struct spdk_bdev_module *bdev_module; 2292 2293 if (!g_resume_bdev_module) { 2294 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2295 } else { 2296 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2297 } 2298 2299 while (bdev_module) { 2300 if (bdev_module->async_fini_start) { 2301 /* Save our place so we can resume later. We must 2302 * save the variable here, before calling fini_start() 2303 * below, because in some cases the module may immediately 2304 * call spdk_bdev_module_fini_start_done() and re-enter 2305 * this function to continue iterating. */ 2306 g_resume_bdev_module = bdev_module; 2307 } 2308 2309 if (bdev_module->fini_start) { 2310 bdev_module->fini_start(); 2311 } 2312 2313 if (bdev_module->async_fini_start) { 2314 return; 2315 } 2316 2317 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2318 } 2319 2320 g_resume_bdev_module = NULL; 2321 2322 bdev_finish_unregister_bdevs_iter(NULL, 0); 2323 } 2324 2325 void 2326 spdk_bdev_module_fini_start_done(void) 2327 { 2328 if (spdk_get_thread() != g_fini_thread) { 2329 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2330 } else { 2331 bdev_module_fini_start_iter(NULL); 2332 } 2333 } 2334 2335 static void 2336 bdev_finish_wait_for_examine_done(void *cb_arg) 2337 { 2338 bdev_module_fini_start_iter(NULL); 2339 } 2340 2341 void 2342 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2343 { 2344 int rc; 2345 2346 assert(cb_fn != NULL); 2347 2348 g_fini_thread = spdk_get_thread(); 2349 2350 g_fini_cb_fn = cb_fn; 2351 g_fini_cb_arg = cb_arg; 2352 2353 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2354 if (rc != 0) { 2355 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2356 bdev_finish_wait_for_examine_done(NULL); 2357 } 2358 } 2359 2360 struct spdk_bdev_io * 2361 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2362 { 2363 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2364 struct spdk_bdev_io *bdev_io; 2365 2366 if (ch->per_thread_cache_count > 0) { 2367 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2368 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2369 ch->per_thread_cache_count--; 2370 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2371 /* 2372 * Don't try to look for bdev_ios in the global pool if there are 2373 * waiters on bdev_ios - we don't want this caller to jump the line. 
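		 * Callers that do hit an exhausted pool are expected to use the
		 * wait-queue API rather than spin. A minimal sketch (hypothetical
		 * callback names):
		 *
		 *	struct spdk_bdev_io_wait_entry entry = {
		 *		.bdev = bdev,
		 *		.cb_fn = retry_submit_cb,
		 *		.cb_arg = retry_ctx,
		 *	};
		 *	if (spdk_bdev_read_blocks(desc, io_ch, buf, offset, nblocks, cb, ctx) == -ENOMEM) {
		 *		spdk_bdev_queue_io_wait(bdev, io_ch, &entry);
		 *	}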
2374 */ 2375 bdev_io = NULL; 2376 } else { 2377 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2378 } 2379 2380 return bdev_io; 2381 } 2382 2383 void 2384 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2385 { 2386 struct spdk_bdev_mgmt_channel *ch; 2387 2388 assert(bdev_io != NULL); 2389 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2390 2391 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2392 2393 if (bdev_io->internal.buf != NULL) { 2394 bdev_io_put_buf(bdev_io); 2395 } 2396 2397 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2398 ch->per_thread_cache_count++; 2399 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2400 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2401 struct spdk_bdev_io_wait_entry *entry; 2402 2403 entry = TAILQ_FIRST(&ch->io_wait_queue); 2404 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2405 entry->cb_fn(entry->cb_arg); 2406 } 2407 } else { 2408 /* We should never have a full cache with entries on the io wait queue. */ 2409 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2410 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2411 } 2412 } 2413 2414 static bool 2415 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2416 { 2417 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2418 2419 switch (limit) { 2420 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2421 return true; 2422 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2423 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2424 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2425 return false; 2426 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2427 default: 2428 return false; 2429 } 2430 } 2431 2432 static bool 2433 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2434 { 2435 switch (bdev_io->type) { 2436 case SPDK_BDEV_IO_TYPE_NVME_IO: 2437 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2438 case SPDK_BDEV_IO_TYPE_READ: 2439 case SPDK_BDEV_IO_TYPE_WRITE: 2440 return true; 2441 case SPDK_BDEV_IO_TYPE_ZCOPY: 2442 if (bdev_io->u.bdev.zcopy.start) { 2443 return true; 2444 } else { 2445 return false; 2446 } 2447 default: 2448 return false; 2449 } 2450 } 2451 2452 static bool 2453 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2454 { 2455 switch (bdev_io->type) { 2456 case SPDK_BDEV_IO_TYPE_NVME_IO: 2457 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2458 /* Bit 1 (0x2) set for read operation */ 2459 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2460 return true; 2461 } else { 2462 return false; 2463 } 2464 case SPDK_BDEV_IO_TYPE_READ: 2465 return true; 2466 case SPDK_BDEV_IO_TYPE_ZCOPY: 2467 /* Populate to read from disk */ 2468 if (bdev_io->u.bdev.zcopy.populate) { 2469 return true; 2470 } else { 2471 return false; 2472 } 2473 default: 2474 return false; 2475 } 2476 } 2477 2478 static uint64_t 2479 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2480 { 2481 struct spdk_bdev *bdev = bdev_io->bdev; 2482 2483 switch (bdev_io->type) { 2484 case SPDK_BDEV_IO_TYPE_NVME_IO: 2485 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2486 return bdev_io->u.nvme_passthru.nbytes; 2487 case SPDK_BDEV_IO_TYPE_READ: 2488 case SPDK_BDEV_IO_TYPE_WRITE: 2489 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2490 case SPDK_BDEV_IO_TYPE_ZCOPY: 2491 /* Track the data in the start phase only */ 2492 if (bdev_io->u.bdev.zcopy.start) { 2493 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2494 } else { 2495 return 0; 2496 } 2497 default: 2498 return 0; 2499 } 2500 } 2501 2502 static bool 2503 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 
2504 { 2505 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2506 return true; 2507 } else { 2508 return false; 2509 } 2510 } 2511 2512 static bool 2513 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2514 { 2515 if (bdev_is_read_io(io) == false) { 2516 return false; 2517 } 2518 2519 return bdev_qos_rw_queue_io(limit, io); 2520 } 2521 2522 static bool 2523 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2524 { 2525 if (bdev_is_read_io(io) == true) { 2526 return false; 2527 } 2528 2529 return bdev_qos_rw_queue_io(limit, io); 2530 } 2531 2532 static void 2533 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2534 { 2535 limit->remaining_this_timeslice--; 2536 } 2537 2538 static void 2539 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2540 { 2541 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2542 } 2543 2544 static void 2545 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2546 { 2547 if (bdev_is_read_io(io) == false) { 2548 return; 2549 } 2550 2551 return bdev_qos_rw_bps_update_quota(limit, io); 2552 } 2553 2554 static void 2555 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2556 { 2557 if (bdev_is_read_io(io) == true) { 2558 return; 2559 } 2560 2561 return bdev_qos_rw_bps_update_quota(limit, io); 2562 } 2563 2564 static void 2565 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2566 { 2567 int i; 2568 2569 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2570 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2571 qos->rate_limits[i].queue_io = NULL; 2572 qos->rate_limits[i].update_quota = NULL; 2573 continue; 2574 } 2575 2576 switch (i) { 2577 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2578 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2579 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2580 break; 2581 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2582 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2583 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2584 break; 2585 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2586 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2587 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2588 break; 2589 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2590 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2591 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2592 break; 2593 default: 2594 break; 2595 } 2596 } 2597 } 2598 2599 static void 2600 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2601 struct spdk_bdev_io *bdev_io, 2602 enum spdk_bdev_io_status status) 2603 { 2604 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2605 2606 bdev_io->internal.in_submit_request = true; 2607 bdev_ch->io_outstanding++; 2608 shared_resource->io_outstanding++; 2609 spdk_bdev_io_complete(bdev_io, status); 2610 bdev_io->internal.in_submit_request = false; 2611 } 2612 2613 static inline void 2614 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2615 { 2616 struct spdk_bdev *bdev = bdev_io->bdev; 2617 struct spdk_io_channel *ch = bdev_ch->channel; 2618 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2619 2620 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2621 struct spdk_bdev_mgmt_channel *mgmt_channel = 
shared_resource->mgmt_ch; 2622 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2623 2624 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2625 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2626 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2627 SPDK_BDEV_IO_STATUS_SUCCESS); 2628 return; 2629 } 2630 } 2631 2632 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2633 bdev_io->bdev->split_on_write_unit && 2634 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2635 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2636 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2637 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2638 return; 2639 } 2640 2641 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2642 bdev_ch->io_outstanding++; 2643 shared_resource->io_outstanding++; 2644 bdev_io->internal.in_submit_request = true; 2645 bdev_submit_request(bdev, ch, bdev_io); 2646 bdev_io->internal.in_submit_request = false; 2647 } else { 2648 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2649 } 2650 } 2651 2652 static bool 2653 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2654 { 2655 int i; 2656 2657 if (bdev_qos_io_to_limit(bdev_io) == true) { 2658 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2659 if (!qos->rate_limits[i].queue_io) { 2660 continue; 2661 } 2662 2663 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2664 bdev_io) == true) { 2665 return true; 2666 } 2667 } 2668 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2669 if (!qos->rate_limits[i].update_quota) { 2670 continue; 2671 } 2672 2673 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2674 } 2675 } 2676 2677 return false; 2678 } 2679 2680 static inline void 2681 _bdev_io_do_submit(void *ctx) 2682 { 2683 struct spdk_bdev_io *bdev_io = ctx; 2684 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2685 2686 bdev_io_do_submit(ch, bdev_io); 2687 } 2688 2689 static int 2690 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2691 { 2692 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2693 int submitted_ios = 0; 2694 2695 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2696 if (!bdev_qos_queue_io(qos, bdev_io)) { 2697 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2698 2699 if (bdev_io->internal.io_submit_ch) { 2700 /* Send back the IO to the original thread for the actual processing. 
*/ 2701 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2702 bdev_io->internal.io_submit_ch = NULL; 2703 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2704 _bdev_io_do_submit, bdev_io); 2705 } else { 2706 bdev_io_do_submit(ch, bdev_io); 2707 } 2708 2709 submitted_ios++; 2710 } 2711 } 2712 2713 return submitted_ios; 2714 } 2715 2716 static void 2717 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2718 { 2719 int rc; 2720 2721 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2722 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2723 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2724 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2725 &bdev_io->internal.waitq_entry); 2726 if (rc != 0) { 2727 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2728 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2729 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2730 } 2731 } 2732 2733 static bool 2734 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2735 { 2736 uint32_t io_boundary; 2737 struct spdk_bdev *bdev = bdev_io->bdev; 2738 uint32_t max_size = bdev->max_segment_size; 2739 int max_segs = bdev->max_num_segments; 2740 2741 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2742 io_boundary = bdev->write_unit_size; 2743 } else if (bdev->split_on_optimal_io_boundary) { 2744 io_boundary = bdev->optimal_io_boundary; 2745 } else { 2746 io_boundary = 0; 2747 } 2748 2749 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2750 return false; 2751 } 2752 2753 if (io_boundary) { 2754 uint64_t start_stripe, end_stripe; 2755 2756 start_stripe = bdev_io->u.bdev.offset_blocks; 2757 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2758 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
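		 * Illustrative numbers: with io_boundary = 128 blocks (a power of two),
		 * an I/O at offset_blocks = 100 spanning 60 blocks gives
		 * start_stripe = 100 >> 7 = 0 and end_stripe = 159 >> 7 = 1, so the
		 * request crosses a boundary and must be split; a non-power-of-two
		 * boundary falls back to the division path.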
*/ 2759 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2760 start_stripe >>= spdk_u32log2(io_boundary); 2761 end_stripe >>= spdk_u32log2(io_boundary); 2762 } else { 2763 start_stripe /= io_boundary; 2764 end_stripe /= io_boundary; 2765 } 2766 2767 if (start_stripe != end_stripe) { 2768 return true; 2769 } 2770 } 2771 2772 if (max_segs) { 2773 if (bdev_io->u.bdev.iovcnt > max_segs) { 2774 return true; 2775 } 2776 } 2777 2778 if (max_size) { 2779 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2780 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2781 return true; 2782 } 2783 } 2784 } 2785 2786 return false; 2787 } 2788 2789 static bool 2790 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2791 { 2792 uint32_t num_unmap_segments; 2793 2794 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2795 return false; 2796 } 2797 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2798 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2799 return true; 2800 } 2801 2802 return false; 2803 } 2804 2805 static bool 2806 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2807 { 2808 if (!bdev_io->bdev->max_write_zeroes) { 2809 return false; 2810 } 2811 2812 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2813 return true; 2814 } 2815 2816 return false; 2817 } 2818 2819 static bool 2820 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2821 { 2822 if (bdev_io->bdev->max_copy != 0 && 2823 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2824 return true; 2825 } 2826 2827 return false; 2828 } 2829 2830 static bool 2831 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2832 { 2833 switch (bdev_io->type) { 2834 case SPDK_BDEV_IO_TYPE_READ: 2835 case SPDK_BDEV_IO_TYPE_WRITE: 2836 return bdev_rw_should_split(bdev_io); 2837 case SPDK_BDEV_IO_TYPE_UNMAP: 2838 return bdev_unmap_should_split(bdev_io); 2839 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2840 return bdev_write_zeroes_should_split(bdev_io); 2841 case SPDK_BDEV_IO_TYPE_COPY: 2842 return bdev_copy_should_split(bdev_io); 2843 default: 2844 return false; 2845 } 2846 } 2847 2848 static uint32_t 2849 _to_next_boundary(uint64_t offset, uint32_t boundary) 2850 { 2851 return (boundary - (offset % boundary)); 2852 } 2853 2854 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2855 2856 static void _bdev_rw_split(void *_bdev_io); 2857 2858 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2859 2860 static void 2861 _bdev_unmap_split(void *_bdev_io) 2862 { 2863 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2864 } 2865 2866 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2867 2868 static void 2869 _bdev_write_zeroes_split(void *_bdev_io) 2870 { 2871 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2872 } 2873 2874 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2875 2876 static void 2877 _bdev_copy_split(void *_bdev_io) 2878 { 2879 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2880 } 2881 2882 static int 2883 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2884 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2885 { 2886 int rc; 2887 uint64_t current_offset, current_remaining, current_src_offset; 2888 spdk_bdev_io_wait_cb io_wait_fn; 2889 2890 current_offset = *offset; 2891 current_remaining = *remaining; 2892 2893 bdev_io->u.bdev.split_outstanding++; 2894 2895 io_wait_fn = 
_bdev_rw_split; 2896 switch (bdev_io->type) { 2897 case SPDK_BDEV_IO_TYPE_READ: 2898 assert(bdev_io->u.bdev.accel_sequence == NULL); 2899 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2900 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2901 iov, iovcnt, md_buf, current_offset, 2902 num_blocks, bdev_io->internal.memory_domain, 2903 bdev_io->internal.memory_domain_ctx, NULL, 2904 bdev_io_split_done, bdev_io); 2905 break; 2906 case SPDK_BDEV_IO_TYPE_WRITE: 2907 assert(bdev_io->u.bdev.accel_sequence == NULL); 2908 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2909 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2910 iov, iovcnt, md_buf, current_offset, 2911 num_blocks, bdev_io->internal.memory_domain, 2912 bdev_io->internal.memory_domain_ctx, NULL, 2913 bdev_io_split_done, bdev_io); 2914 break; 2915 case SPDK_BDEV_IO_TYPE_UNMAP: 2916 io_wait_fn = _bdev_unmap_split; 2917 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2918 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2919 current_offset, num_blocks, 2920 bdev_io_split_done, bdev_io); 2921 break; 2922 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2923 io_wait_fn = _bdev_write_zeroes_split; 2924 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2925 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2926 current_offset, num_blocks, 2927 bdev_io_split_done, bdev_io); 2928 break; 2929 case SPDK_BDEV_IO_TYPE_COPY: 2930 io_wait_fn = _bdev_copy_split; 2931 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2932 (current_offset - bdev_io->u.bdev.offset_blocks); 2933 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2934 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2935 current_offset, current_src_offset, num_blocks, 2936 bdev_io_split_done, bdev_io); 2937 break; 2938 default: 2939 assert(false); 2940 rc = -EINVAL; 2941 break; 2942 } 2943 2944 if (rc == 0) { 2945 current_offset += num_blocks; 2946 current_remaining -= num_blocks; 2947 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2948 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2949 *offset = current_offset; 2950 *remaining = current_remaining; 2951 } else { 2952 bdev_io->u.bdev.split_outstanding--; 2953 if (rc == -ENOMEM) { 2954 if (bdev_io->u.bdev.split_outstanding == 0) { 2955 /* No I/O is outstanding. Hence we should wait here. */ 2956 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2957 } 2958 } else { 2959 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2960 if (bdev_io->u.bdev.split_outstanding == 0) { 2961 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2962 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2963 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2964 } 2965 } 2966 } 2967 2968 return rc; 2969 } 2970 2971 static void 2972 _bdev_rw_split(void *_bdev_io) 2973 { 2974 struct iovec *parent_iov, *iov; 2975 struct spdk_bdev_io *bdev_io = _bdev_io; 2976 struct spdk_bdev *bdev = bdev_io->bdev; 2977 uint64_t parent_offset, current_offset, remaining; 2978 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2979 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2980 uint32_t iovcnt, iov_len, child_iovsize; 2981 uint32_t blocklen = bdev->blocklen; 2982 uint32_t io_boundary; 2983 uint32_t max_segment_size = bdev->max_segment_size; 2984 uint32_t max_child_iovcnt = bdev->max_num_segments; 2985 void *md_buf = NULL; 2986 int rc; 2987 2988 max_segment_size = max_segment_size ? 
max_segment_size : UINT32_MAX; 2989 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2990 SPDK_BDEV_IO_NUM_CHILD_IOV; 2991 2992 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2993 io_boundary = bdev->write_unit_size; 2994 } else if (bdev->split_on_optimal_io_boundary) { 2995 io_boundary = bdev->optimal_io_boundary; 2996 } else { 2997 io_boundary = UINT32_MAX; 2998 } 2999 3000 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3001 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 3002 parent_offset = bdev_io->u.bdev.offset_blocks; 3003 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3004 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3005 3006 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3007 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3008 if (parent_iov_offset < parent_iov->iov_len) { 3009 break; 3010 } 3011 parent_iov_offset -= parent_iov->iov_len; 3012 } 3013 3014 child_iovcnt = 0; 3015 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3016 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3017 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3018 to_next_boundary = spdk_min(remaining, to_next_boundary); 3019 to_next_boundary_bytes = to_next_boundary * blocklen; 3020 3021 iov = &bdev_io->child_iov[child_iovcnt]; 3022 iovcnt = 0; 3023 3024 if (bdev_io->u.bdev.md_buf) { 3025 md_buf = (char *)bdev_io->u.bdev.md_buf + 3026 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3027 } 3028 3029 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3030 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3031 iovcnt < child_iovsize) { 3032 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3033 iov_len = parent_iov->iov_len - parent_iov_offset; 3034 3035 iov_len = spdk_min(iov_len, max_segment_size); 3036 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3037 to_next_boundary_bytes -= iov_len; 3038 3039 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3040 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3041 3042 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3043 parent_iov_offset += iov_len; 3044 } else { 3045 parent_iovpos++; 3046 parent_iov_offset = 0; 3047 } 3048 child_iovcnt++; 3049 iovcnt++; 3050 } 3051 3052 if (to_next_boundary_bytes > 0) { 3053 /* We had to stop this child I/O early because we ran out of 3054 * child_iov space or were limited by max_num_segments. 3055 * Ensure the iovs to be aligned with block size and 3056 * then adjust to_next_boundary before starting the 3057 * child I/O. 
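			 * Worked example (illustrative numbers): with blocklen = 512 and
			 * to_next_boundary_bytes = 700 left unpacked, to_last_block_bytes
			 * becomes 700 % 512 = 188, is then flipped to 512 - 188 = 324, and
			 * 324 bytes are trimmed from the tail of the child iovs so the child
			 * ends on a block boundary; to_next_boundary_bytes grows to 1024,
			 * shrinking the child by 1024 / 512 = 2 blocks below.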
			 */
			assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
			       iovcnt == child_iovsize);
			to_last_block_bytes = to_next_boundary_bytes % blocklen;
			if (to_last_block_bytes != 0) {
				uint32_t child_iovpos = child_iovcnt - 1;
				/* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV,
				 * so the loop will naturally end
				 */

				to_last_block_bytes = blocklen - to_last_block_bytes;
				to_next_boundary_bytes += to_last_block_bytes;
				while (to_last_block_bytes > 0 && iovcnt > 0) {
					iov_len = spdk_min(to_last_block_bytes,
							   bdev_io->child_iov[child_iovpos].iov_len);
					bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
					if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
						child_iovpos--;
						if (--iovcnt == 0) {
							/* If the child IO is less than a block size just return.
							 * If the first child IO of any split round is less than
							 * a block size, fail the parent IO.
							 */
							if (bdev_io->u.bdev.split_outstanding == 0) {
								SPDK_ERRLOG("The first child io was less than a block size\n");
								bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
								spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
								TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
								bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
							}

							return;
						}
					}

					to_last_block_bytes -= iov_len;

					if (parent_iov_offset == 0) {
						parent_iovpos--;
						parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
					}
					parent_iov_offset -= iov_len;
				}

				assert(to_last_block_bytes == 0);
			}
			to_next_boundary -= to_next_boundary_bytes / blocklen;
		}

		rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
					  &current_offset, &remaining);
		if (spdk_unlikely(rc)) {
			return;
		}
	}
}

static void
bdev_unmap_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
	uint32_t num_children_reqs = 0;
	int rc;

	offset = bdev_io->u.bdev.split_current_offset_blocks;
	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
	max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;

	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
		unmap_blocks = spdk_min(remaining, max_unmap_blocks);

		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
					  &offset, &remaining);
		if (spdk_likely(rc == 0)) {
			num_children_reqs++;
		} else {
			return;
		}
	}
}

static void
bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset, write_zeroes_blocks, remaining;
	uint32_t num_children_reqs = 0;
	int rc;

	offset = bdev_io->u.bdev.split_current_offset_blocks;
	remaining = bdev_io->u.bdev.split_remaining_num_blocks;

	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
		write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);

		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
					  &offset, &remaining);
		if (spdk_likely(rc == 0)) {
			num_children_reqs++;
		} else {
			return;
		}
	}
}

static void
bdev_copy_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset,
copy_blocks, remaining; 3166 uint32_t num_children_reqs = 0; 3167 int rc; 3168 3169 offset = bdev_io->u.bdev.split_current_offset_blocks; 3170 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3171 3172 assert(bdev_io->bdev->max_copy != 0); 3173 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3174 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3175 3176 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3177 &offset, &remaining); 3178 if (spdk_likely(rc == 0)) { 3179 num_children_reqs++; 3180 } else { 3181 return; 3182 } 3183 } 3184 } 3185 3186 static void 3187 parent_bdev_io_complete(void *ctx, int rc) 3188 { 3189 struct spdk_bdev_io *parent_io = ctx; 3190 3191 if (rc) { 3192 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3193 } 3194 3195 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3196 parent_io->internal.caller_ctx); 3197 } 3198 3199 static void 3200 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3201 { 3202 struct spdk_bdev_io *bdev_io = ctx; 3203 3204 /* u.bdev.accel_sequence should have already been cleared at this point */ 3205 assert(bdev_io->u.bdev.accel_sequence == NULL); 3206 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3207 bdev_io->internal.accel_sequence = NULL; 3208 3209 if (spdk_unlikely(status != 0)) { 3210 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3211 } 3212 3213 parent_bdev_io_complete(bdev_io, status); 3214 } 3215 3216 static void 3217 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3218 { 3219 struct spdk_bdev_io *parent_io = cb_arg; 3220 3221 spdk_bdev_free_io(bdev_io); 3222 3223 if (!success) { 3224 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3225 /* If any child I/O failed, stop further splitting process. */ 3226 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3227 parent_io->u.bdev.split_remaining_num_blocks = 0; 3228 } 3229 parent_io->u.bdev.split_outstanding--; 3230 if (parent_io->u.bdev.split_outstanding != 0) { 3231 return; 3232 } 3233 3234 /* 3235 * Parent I/O finishes when all blocks are consumed. 3236 */ 3237 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3238 assert(parent_io->internal.cb != bdev_io_split_done); 3239 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3240 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3241 3242 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3243 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3244 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3245 return; 3246 } else if (parent_io->internal.orig_iovcnt != 0 && 3247 !bdev_io_use_accel_sequence(bdev_io)) { 3248 /* bdev IO will be completed in the callback */ 3249 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3250 return; 3251 } 3252 } 3253 3254 parent_bdev_io_complete(parent_io, 0); 3255 return; 3256 } 3257 3258 /* 3259 * Continue with the splitting process. This function will complete the parent I/O if the 3260 * splitting is done. 
3261 */ 3262 switch (parent_io->type) { 3263 case SPDK_BDEV_IO_TYPE_READ: 3264 case SPDK_BDEV_IO_TYPE_WRITE: 3265 _bdev_rw_split(parent_io); 3266 break; 3267 case SPDK_BDEV_IO_TYPE_UNMAP: 3268 bdev_unmap_split(parent_io); 3269 break; 3270 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3271 bdev_write_zeroes_split(parent_io); 3272 break; 3273 case SPDK_BDEV_IO_TYPE_COPY: 3274 bdev_copy_split(parent_io); 3275 break; 3276 default: 3277 assert(false); 3278 break; 3279 } 3280 } 3281 3282 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3283 bool success); 3284 3285 static void 3286 bdev_io_split(struct spdk_bdev_io *bdev_io) 3287 { 3288 assert(bdev_io_should_split(bdev_io)); 3289 3290 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3291 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3292 bdev_io->u.bdev.split_outstanding = 0; 3293 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3294 3295 switch (bdev_io->type) { 3296 case SPDK_BDEV_IO_TYPE_READ: 3297 case SPDK_BDEV_IO_TYPE_WRITE: 3298 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3299 _bdev_rw_split(bdev_io); 3300 } else { 3301 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3302 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3303 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3304 } 3305 break; 3306 case SPDK_BDEV_IO_TYPE_UNMAP: 3307 bdev_unmap_split(bdev_io); 3308 break; 3309 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3310 bdev_write_zeroes_split(bdev_io); 3311 break; 3312 case SPDK_BDEV_IO_TYPE_COPY: 3313 bdev_copy_split(bdev_io); 3314 break; 3315 default: 3316 assert(false); 3317 break; 3318 } 3319 } 3320 3321 static void 3322 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3323 { 3324 if (!success) { 3325 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3326 return; 3327 } 3328 3329 _bdev_rw_split(bdev_io); 3330 } 3331 3332 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3333 * be inlined, at least on some compilers. 
3334 */ 3335 static inline void 3336 _bdev_io_submit(void *ctx) 3337 { 3338 struct spdk_bdev_io *bdev_io = ctx; 3339 struct spdk_bdev *bdev = bdev_io->bdev; 3340 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3341 3342 if (spdk_likely(bdev_ch->flags == 0)) { 3343 bdev_io_do_submit(bdev_ch, bdev_io); 3344 return; 3345 } 3346 3347 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3348 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3349 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3350 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3351 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 3352 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3353 } else { 3354 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 3355 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3356 } 3357 } else { 3358 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3359 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3360 } 3361 } 3362 3363 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3364 3365 bool 3366 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3367 { 3368 if (range1->length == 0 || range2->length == 0) { 3369 return false; 3370 } 3371 3372 if (range1->offset + range1->length <= range2->offset) { 3373 return false; 3374 } 3375 3376 if (range2->offset + range2->length <= range1->offset) { 3377 return false; 3378 } 3379 3380 return true; 3381 } 3382 3383 static bool 3384 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3385 { 3386 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3387 struct lba_range r; 3388 3389 switch (bdev_io->type) { 3390 case SPDK_BDEV_IO_TYPE_NVME_IO: 3391 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3392 /* Don't try to decode the NVMe command - just assume worst-case and that 3393 * it overlaps a locked range. 3394 */ 3395 return true; 3396 case SPDK_BDEV_IO_TYPE_WRITE: 3397 case SPDK_BDEV_IO_TYPE_UNMAP: 3398 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3399 case SPDK_BDEV_IO_TYPE_ZCOPY: 3400 case SPDK_BDEV_IO_TYPE_COPY: 3401 r.offset = bdev_io->u.bdev.offset_blocks; 3402 r.length = bdev_io->u.bdev.num_blocks; 3403 if (!bdev_lba_range_overlapped(range, &r)) { 3404 /* This I/O doesn't overlap the specified LBA range. */ 3405 return false; 3406 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3407 /* This I/O overlaps, but the I/O is on the same channel that locked this 3408 * range, and the caller_ctx is the same as the locked_ctx. This means 3409 * that this I/O is associated with the lock, and is allowed to execute. 
3410 */ 3411 return false; 3412 } else { 3413 return true; 3414 } 3415 default: 3416 return false; 3417 } 3418 } 3419 3420 void 3421 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3422 { 3423 struct spdk_bdev *bdev = bdev_io->bdev; 3424 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3425 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3426 3427 assert(thread != NULL); 3428 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3429 3430 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3431 struct lba_range *range; 3432 3433 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3434 if (bdev_io_range_is_locked(bdev_io, range)) { 3435 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3436 return; 3437 } 3438 } 3439 } 3440 3441 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3442 3443 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3444 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3445 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3446 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3447 spdk_bdev_get_name(bdev)); 3448 3449 if (bdev_io->internal.split) { 3450 bdev_io_split(bdev_io); 3451 return; 3452 } 3453 3454 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3455 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3456 _bdev_io_submit(bdev_io); 3457 } else { 3458 bdev_io->internal.io_submit_ch = ch; 3459 bdev_io->internal.ch = bdev->internal.qos->ch; 3460 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3461 } 3462 } else { 3463 _bdev_io_submit(bdev_io); 3464 } 3465 } 3466 3467 static inline void 3468 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3469 { 3470 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3471 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3472 * For write operation we need to pull buffers from memory domain before submitting IO. 3473 * Once read operation completes, we need to use memory_domain push functionality to 3474 * update data in original memory domain IO buffer 3475 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3476 bdev_io->u.bdev.memory_domain = NULL; 3477 bdev_io->u.bdev.memory_domain_ctx = NULL; 3478 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3479 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3480 } 3481 3482 static inline void 3483 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3484 { 3485 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3486 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3487 3488 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3489 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3490 bdev_io_complete_unsubmitted(bdev_io); 3491 return; 3492 } 3493 3494 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3495 * support them, but we need to execute an accel sequence and the data buffer is from accel 3496 * memory domain (to avoid doing a push/pull from that domain). 
3497 */ 3498 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3499 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3500 _bdev_io_ext_use_bounce_buffer(bdev_io); 3501 return; 3502 } 3503 3504 if (needs_exec) { 3505 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3506 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3507 return; 3508 } 3509 /* For reads we'll execute the sequence after the data is read, so, for now, only 3510 * clear out accel_sequence pointer and submit the IO */ 3511 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3512 bdev_io->u.bdev.accel_sequence = NULL; 3513 } 3514 3515 bdev_io_submit(bdev_io); 3516 } 3517 3518 static void 3519 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3520 { 3521 struct spdk_bdev *bdev = bdev_io->bdev; 3522 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3523 struct spdk_io_channel *ch = bdev_ch->channel; 3524 3525 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3526 3527 bdev_io->internal.in_submit_request = true; 3528 bdev_submit_request(bdev, ch, bdev_io); 3529 bdev_io->internal.in_submit_request = false; 3530 } 3531 3532 void 3533 bdev_io_init(struct spdk_bdev_io *bdev_io, 3534 struct spdk_bdev *bdev, void *cb_arg, 3535 spdk_bdev_io_completion_cb cb) 3536 { 3537 bdev_io->bdev = bdev; 3538 bdev_io->internal.caller_ctx = cb_arg; 3539 bdev_io->internal.cb = cb; 3540 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3541 bdev_io->internal.in_submit_request = false; 3542 bdev_io->internal.buf = NULL; 3543 bdev_io->internal.io_submit_ch = NULL; 3544 bdev_io->internal.orig_iovs = NULL; 3545 bdev_io->internal.orig_iovcnt = 0; 3546 bdev_io->internal.orig_md_iov.iov_base = NULL; 3547 bdev_io->internal.error.nvme.cdw0 = 0; 3548 bdev_io->num_retries = 0; 3549 bdev_io->internal.get_buf_cb = NULL; 3550 bdev_io->internal.get_aux_buf_cb = NULL; 3551 bdev_io->internal.memory_domain = NULL; 3552 bdev_io->internal.memory_domain_ctx = NULL; 3553 bdev_io->internal.data_transfer_cpl = NULL; 3554 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3555 bdev_io->internal.accel_sequence = NULL; 3556 bdev_io->internal.has_accel_sequence = false; 3557 } 3558 3559 static bool 3560 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3561 { 3562 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3563 } 3564 3565 bool 3566 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3567 { 3568 bool supported; 3569 3570 supported = bdev_io_type_supported(bdev, io_type); 3571 3572 if (!supported) { 3573 switch (io_type) { 3574 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3575 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ 3576 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3577 break; 3578 default: 3579 break; 3580 } 3581 } 3582 3583 return supported; 3584 } 3585 3586 uint64_t 3587 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3588 { 3589 return bdev_io->internal.submit_tsc; 3590 } 3591 3592 int 3593 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3594 { 3595 if (bdev->fn_table->dump_info_json) { 3596 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3597 } 3598 3599 return 0; 3600 } 3601 3602 static void 3603 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3604 { 3605 uint32_t max_per_timeslice = 0; 3606 int i; 3607 3608 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3609 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3610 qos->rate_limits[i].max_per_timeslice = 0; 3611 continue; 3612 } 3613 3614 max_per_timeslice = qos->rate_limits[i].limit * 3615 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3616 3617 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3618 qos->rate_limits[i].min_per_timeslice); 3619 3620 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3621 } 3622 3623 bdev_qos_set_ops(qos); 3624 } 3625 3626 static int 3627 bdev_channel_poll_qos(void *arg) 3628 { 3629 struct spdk_bdev_qos *qos = arg; 3630 uint64_t now = spdk_get_ticks(); 3631 int i; 3632 3633 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3634 /* We received our callback earlier than expected - return 3635 * immediately and wait to do accounting until at least one 3636 * timeslice has actually expired. This should never happen 3637 * with a well-behaved timer implementation. 3638 */ 3639 return SPDK_POLLER_IDLE; 3640 } 3641 3642 /* Reset for next round of rate limiting */ 3643 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3644 /* We may have allowed the IOs or bytes to slightly overrun in the last 3645 * timeslice. remaining_this_timeslice is signed, so if it's negative 3646 * here, we'll account for the overrun so that the next timeslice will 3647 * be appropriately reduced. 
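		 * Illustrative numbers: a 4 MiB/s rw_mbytes_per_sec limit with the 1 ms
		 * timeslice yields a max_per_timeslice of roughly 4194 bytes; if a
		 * 64 KiB write slipped through at the end of a timeslice,
		 * remaining_this_timeslice sits at roughly -60 KiB here and the refill
		 * loop below has to pay that debt back before new I/O is admitted.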
3648 */ 3649 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3650 qos->rate_limits[i].remaining_this_timeslice = 0; 3651 } 3652 } 3653 3654 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3655 qos->last_timeslice += qos->timeslice_size; 3656 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3657 qos->rate_limits[i].remaining_this_timeslice += 3658 qos->rate_limits[i].max_per_timeslice; 3659 } 3660 } 3661 3662 return bdev_qos_io_submit(qos->ch, qos); 3663 } 3664 3665 static void 3666 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3667 { 3668 struct spdk_bdev_shared_resource *shared_resource; 3669 struct lba_range *range; 3670 3671 bdev_free_io_stat(ch->stat); 3672 #ifdef SPDK_CONFIG_VTUNE 3673 bdev_free_io_stat(ch->prev_stat); 3674 #endif 3675 3676 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3677 range = TAILQ_FIRST(&ch->locked_ranges); 3678 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3679 free(range); 3680 } 3681 3682 spdk_put_io_channel(ch->channel); 3683 spdk_put_io_channel(ch->accel_channel); 3684 3685 shared_resource = ch->shared_resource; 3686 3687 assert(TAILQ_EMPTY(&ch->io_locked)); 3688 assert(TAILQ_EMPTY(&ch->io_submitted)); 3689 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3690 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3691 assert(ch->io_outstanding == 0); 3692 assert(shared_resource->ref > 0); 3693 shared_resource->ref--; 3694 if (shared_resource->ref == 0) { 3695 assert(shared_resource->io_outstanding == 0); 3696 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3697 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3698 free(shared_resource); 3699 } 3700 } 3701 3702 static void 3703 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3704 { 3705 struct spdk_bdev_qos *qos = bdev->internal.qos; 3706 int i; 3707 3708 assert(spdk_spin_held(&bdev->internal.spinlock)); 3709 3710 /* Rate limiting on this bdev enabled */ 3711 if (qos) { 3712 if (qos->ch == NULL) { 3713 struct spdk_io_channel *io_ch; 3714 3715 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3716 bdev->name, spdk_get_thread()); 3717 3718 /* No qos channel has been selected, so set one up */ 3719 3720 /* Take another reference to ch */ 3721 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3722 assert(io_ch != NULL); 3723 qos->ch = ch; 3724 3725 qos->thread = spdk_io_channel_get_thread(io_ch); 3726 3727 TAILQ_INIT(&qos->queued); 3728 3729 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3730 if (bdev_qos_is_iops_rate_limit(i) == true) { 3731 qos->rate_limits[i].min_per_timeslice = 3732 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3733 } else { 3734 qos->rate_limits[i].min_per_timeslice = 3735 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3736 } 3737 3738 if (qos->rate_limits[i].limit == 0) { 3739 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3740 } 3741 } 3742 bdev_qos_update_max_quota_per_timeslice(qos); 3743 qos->timeslice_size = 3744 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3745 qos->last_timeslice = spdk_get_ticks(); 3746 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3747 qos, 3748 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3749 } 3750 3751 ch->flags |= BDEV_CH_QOS_ENABLED; 3752 } 3753 } 3754 3755 struct poll_timeout_ctx { 3756 struct spdk_bdev_desc *desc; 3757 uint64_t timeout_in_sec; 3758 spdk_bdev_io_timeout_cb cb_fn; 3759 void *cb_arg; 3760 }; 3761 3762 static void 3763 bdev_desc_free(struct spdk_bdev_desc 
*desc) 3764 { 3765 spdk_spin_destroy(&desc->spinlock); 3766 free(desc->media_events_buffer); 3767 free(desc); 3768 } 3769 3770 static void 3771 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3772 { 3773 struct poll_timeout_ctx *ctx = _ctx; 3774 struct spdk_bdev_desc *desc = ctx->desc; 3775 3776 free(ctx); 3777 3778 spdk_spin_lock(&desc->spinlock); 3779 desc->refs--; 3780 if (desc->closed == true && desc->refs == 0) { 3781 spdk_spin_unlock(&desc->spinlock); 3782 bdev_desc_free(desc); 3783 return; 3784 } 3785 spdk_spin_unlock(&desc->spinlock); 3786 } 3787 3788 static void 3789 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3790 struct spdk_io_channel *io_ch, void *_ctx) 3791 { 3792 struct poll_timeout_ctx *ctx = _ctx; 3793 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3794 struct spdk_bdev_desc *desc = ctx->desc; 3795 struct spdk_bdev_io *bdev_io; 3796 uint64_t now; 3797 3798 spdk_spin_lock(&desc->spinlock); 3799 if (desc->closed == true) { 3800 spdk_spin_unlock(&desc->spinlock); 3801 spdk_bdev_for_each_channel_continue(i, -1); 3802 return; 3803 } 3804 spdk_spin_unlock(&desc->spinlock); 3805 3806 now = spdk_get_ticks(); 3807 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3808 /* Exclude any I/O that are generated via splitting. */ 3809 if (bdev_io->internal.cb == bdev_io_split_done) { 3810 continue; 3811 } 3812 3813 /* Once we find an I/O that has not timed out, we can immediately 3814 * exit the loop. 3815 */ 3816 if (now < (bdev_io->internal.submit_tsc + 3817 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3818 goto end; 3819 } 3820 3821 if (bdev_io->internal.desc == desc) { 3822 ctx->cb_fn(ctx->cb_arg, bdev_io); 3823 } 3824 } 3825 3826 end: 3827 spdk_bdev_for_each_channel_continue(i, 0); 3828 } 3829 3830 static int 3831 bdev_poll_timeout_io(void *arg) 3832 { 3833 struct spdk_bdev_desc *desc = arg; 3834 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3835 struct poll_timeout_ctx *ctx; 3836 3837 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3838 if (!ctx) { 3839 SPDK_ERRLOG("failed to allocate memory\n"); 3840 return SPDK_POLLER_BUSY; 3841 } 3842 ctx->desc = desc; 3843 ctx->cb_arg = desc->cb_arg; 3844 ctx->cb_fn = desc->cb_fn; 3845 ctx->timeout_in_sec = desc->timeout_in_sec; 3846 3847 /* Take a ref on the descriptor in case it gets closed while we are checking 3848 * all of the channels. 
3849 */ 3850 spdk_spin_lock(&desc->spinlock); 3851 desc->refs++; 3852 spdk_spin_unlock(&desc->spinlock); 3853 3854 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3855 bdev_channel_poll_timeout_io_done); 3856 3857 return SPDK_POLLER_BUSY; 3858 } 3859 3860 int 3861 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3862 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3863 { 3864 assert(desc->thread == spdk_get_thread()); 3865 3866 spdk_poller_unregister(&desc->io_timeout_poller); 3867 3868 if (timeout_in_sec) { 3869 assert(cb_fn != NULL); 3870 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3871 desc, 3872 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3873 1000); 3874 if (desc->io_timeout_poller == NULL) { 3875 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3876 return -1; 3877 } 3878 } 3879 3880 desc->cb_fn = cb_fn; 3881 desc->cb_arg = cb_arg; 3882 desc->timeout_in_sec = timeout_in_sec; 3883 3884 return 0; 3885 } 3886 3887 static int 3888 bdev_channel_create(void *io_device, void *ctx_buf) 3889 { 3890 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3891 struct spdk_bdev_channel *ch = ctx_buf; 3892 struct spdk_io_channel *mgmt_io_ch; 3893 struct spdk_bdev_mgmt_channel *mgmt_ch; 3894 struct spdk_bdev_shared_resource *shared_resource; 3895 struct lba_range *range; 3896 3897 ch->bdev = bdev; 3898 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3899 if (!ch->channel) { 3900 return -1; 3901 } 3902 3903 ch->accel_channel = spdk_accel_get_io_channel(); 3904 if (!ch->accel_channel) { 3905 spdk_put_io_channel(ch->channel); 3906 return -1; 3907 } 3908 3909 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3910 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3911 3912 assert(ch->histogram == NULL); 3913 if (bdev->internal.histogram_enabled) { 3914 ch->histogram = spdk_histogram_data_alloc(); 3915 if (ch->histogram == NULL) { 3916 SPDK_ERRLOG("Could not allocate histogram\n"); 3917 } 3918 } 3919 3920 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3921 if (!mgmt_io_ch) { 3922 spdk_put_io_channel(ch->channel); 3923 spdk_put_io_channel(ch->accel_channel); 3924 return -1; 3925 } 3926 3927 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3928 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3929 if (shared_resource->shared_ch == ch->channel) { 3930 spdk_put_io_channel(mgmt_io_ch); 3931 shared_resource->ref++; 3932 break; 3933 } 3934 } 3935 3936 if (shared_resource == NULL) { 3937 shared_resource = calloc(1, sizeof(*shared_resource)); 3938 if (shared_resource == NULL) { 3939 spdk_put_io_channel(ch->channel); 3940 spdk_put_io_channel(ch->accel_channel); 3941 spdk_put_io_channel(mgmt_io_ch); 3942 return -1; 3943 } 3944 3945 shared_resource->mgmt_ch = mgmt_ch; 3946 shared_resource->io_outstanding = 0; 3947 TAILQ_INIT(&shared_resource->nomem_io); 3948 shared_resource->nomem_threshold = 0; 3949 shared_resource->shared_ch = ch->channel; 3950 shared_resource->ref = 1; 3951 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3952 } 3953 3954 ch->io_outstanding = 0; 3955 TAILQ_INIT(&ch->queued_resets); 3956 TAILQ_INIT(&ch->locked_ranges); 3957 ch->flags = 0; 3958 ch->shared_resource = shared_resource; 3959 3960 TAILQ_INIT(&ch->io_submitted); 3961 TAILQ_INIT(&ch->io_locked); 3962 TAILQ_INIT(&ch->io_accel_exec); 3963 TAILQ_INIT(&ch->io_memory_domain); 3964 3965 ch->stat = bdev_alloc_io_stat(false); 3966 if (ch->stat == NULL) { 3967 
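/* Per-channel stat allocation failed: bdev_channel_destroy_resource() below releases the
 * I/O and accel channels taken above and drops the shared_resource reference. */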
bdev_channel_destroy_resource(ch); 3968 return -1; 3969 } 3970 3971 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3972 3973 #ifdef SPDK_CONFIG_VTUNE 3974 { 3975 char *name; 3976 __itt_init_ittlib(NULL, 0); 3977 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3978 if (!name) { 3979 bdev_channel_destroy_resource(ch); 3980 return -1; 3981 } 3982 ch->handle = __itt_string_handle_create(name); 3983 free(name); 3984 ch->start_tsc = spdk_get_ticks(); 3985 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3986 ch->prev_stat = bdev_alloc_io_stat(false); 3987 if (ch->prev_stat == NULL) { 3988 bdev_channel_destroy_resource(ch); 3989 return -1; 3990 } 3991 } 3992 #endif 3993 3994 spdk_spin_lock(&bdev->internal.spinlock); 3995 bdev_enable_qos(bdev, ch); 3996 3997 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3998 struct lba_range *new_range; 3999 4000 new_range = calloc(1, sizeof(*new_range)); 4001 if (new_range == NULL) { 4002 spdk_spin_unlock(&bdev->internal.spinlock); 4003 bdev_channel_destroy_resource(ch); 4004 return -1; 4005 } 4006 new_range->length = range->length; 4007 new_range->offset = range->offset; 4008 new_range->locked_ctx = range->locked_ctx; 4009 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4010 } 4011 4012 spdk_spin_unlock(&bdev->internal.spinlock); 4013 4014 return 0; 4015 } 4016 4017 static int 4018 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4019 void *cb_ctx) 4020 { 4021 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4022 struct spdk_bdev_io *bdev_io; 4023 uint64_t buf_len; 4024 4025 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4026 if (bdev_io->internal.ch == bdev_ch) { 4027 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4028 spdk_iobuf_entry_abort(ch, entry, buf_len); 4029 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4030 } 4031 4032 return 0; 4033 } 4034 4035 /* 4036 * Abort I/O that are waiting on a data buffer. 4037 */ 4038 static void 4039 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4040 { 4041 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4042 bdev_abort_all_buf_io_cb, ch); 4043 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4044 bdev_abort_all_buf_io_cb, ch); 4045 } 4046 4047 /* 4048 * Abort I/O that are queued waiting for submission. These types of I/O are 4049 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4050 */ 4051 static void 4052 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4053 { 4054 struct spdk_bdev_io *bdev_io, *tmp; 4055 4056 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4057 if (bdev_io->internal.ch == ch) { 4058 TAILQ_REMOVE(queue, bdev_io, internal.link); 4059 /* 4060 * spdk_bdev_io_complete() assumes that the completed I/O had 4061 * been submitted to the bdev module. Since in this case it 4062 * hadn't, bump io_outstanding to account for the decrement 4063 * that spdk_bdev_io_complete() will do. 
4064 */ 4065 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4066 ch->io_outstanding++; 4067 ch->shared_resource->io_outstanding++; 4068 } 4069 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4070 } 4071 } 4072 } 4073 4074 static bool 4075 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4076 { 4077 struct spdk_bdev_io *bdev_io; 4078 4079 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4080 if (bdev_io == bio_to_abort) { 4081 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4082 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4083 return true; 4084 } 4085 } 4086 4087 return false; 4088 } 4089 4090 static int 4091 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4092 { 4093 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4094 uint64_t buf_len; 4095 4096 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4097 if (bdev_io == bio_to_abort) { 4098 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4099 spdk_iobuf_entry_abort(ch, entry, buf_len); 4100 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4101 return 1; 4102 } 4103 4104 return 0; 4105 } 4106 4107 static bool 4108 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4109 { 4110 int rc; 4111 4112 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4113 bdev_abort_buf_io_cb, bio_to_abort); 4114 if (rc == 1) { 4115 return true; 4116 } 4117 4118 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4119 bdev_abort_buf_io_cb, bio_to_abort); 4120 return rc == 1; 4121 } 4122 4123 static void 4124 bdev_qos_channel_destroy(void *cb_arg) 4125 { 4126 struct spdk_bdev_qos *qos = cb_arg; 4127 4128 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4129 spdk_poller_unregister(&qos->poller); 4130 4131 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4132 4133 free(qos); 4134 } 4135 4136 static int 4137 bdev_qos_destroy(struct spdk_bdev *bdev) 4138 { 4139 int i; 4140 4141 /* 4142 * Cleanly shutting down the QoS poller is tricky, because 4143 * during the asynchronous operation the user could open 4144 * a new descriptor and create a new channel, spawning 4145 * a new QoS poller. 4146 * 4147 * The strategy is to create a new QoS structure here and swap it 4148 * in. The shutdown path then continues to refer to the old one 4149 * until it completes and then releases it. 4150 */ 4151 struct spdk_bdev_qos *new_qos, *old_qos; 4152 4153 old_qos = bdev->internal.qos; 4154 4155 new_qos = calloc(1, sizeof(*new_qos)); 4156 if (!new_qos) { 4157 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4158 return -ENOMEM; 4159 } 4160 4161 /* Copy the old QoS data into the newly allocated structure */ 4162 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4163 4164 /* Zero out the key parts of the QoS structure */ 4165 new_qos->ch = NULL; 4166 new_qos->thread = NULL; 4167 new_qos->poller = NULL; 4168 TAILQ_INIT(&new_qos->queued); 4169 /* 4170 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4171 * It will be used later for the new QoS structure. 
4172 */ 4173 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4174 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4175 new_qos->rate_limits[i].min_per_timeslice = 0; 4176 new_qos->rate_limits[i].max_per_timeslice = 0; 4177 } 4178 4179 bdev->internal.qos = new_qos; 4180 4181 if (old_qos->thread == NULL) { 4182 free(old_qos); 4183 } else { 4184 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4185 } 4186 4187 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4188 * been destroyed yet. The destruction path will end up waiting for the final 4189 * channel to be put before it releases resources. */ 4190 4191 return 0; 4192 } 4193 4194 void 4195 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4196 { 4197 total->bytes_read += add->bytes_read; 4198 total->num_read_ops += add->num_read_ops; 4199 total->bytes_written += add->bytes_written; 4200 total->num_write_ops += add->num_write_ops; 4201 total->bytes_unmapped += add->bytes_unmapped; 4202 total->num_unmap_ops += add->num_unmap_ops; 4203 total->bytes_copied += add->bytes_copied; 4204 total->num_copy_ops += add->num_copy_ops; 4205 total->read_latency_ticks += add->read_latency_ticks; 4206 total->write_latency_ticks += add->write_latency_ticks; 4207 total->unmap_latency_ticks += add->unmap_latency_ticks; 4208 total->copy_latency_ticks += add->copy_latency_ticks; 4209 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4210 total->max_read_latency_ticks = add->max_read_latency_ticks; 4211 } 4212 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4213 total->min_read_latency_ticks = add->min_read_latency_ticks; 4214 } 4215 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4216 total->max_write_latency_ticks = add->max_write_latency_ticks; 4217 } 4218 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4219 total->min_write_latency_ticks = add->min_write_latency_ticks; 4220 } 4221 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4222 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4223 } 4224 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4225 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4226 } 4227 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4228 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4229 } 4230 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4231 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4232 } 4233 } 4234 4235 static void 4236 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4237 { 4238 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4239 4240 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4241 memcpy(to_stat->io_error, from_stat->io_error, 4242 sizeof(struct spdk_bdev_io_error_stat)); 4243 } 4244 } 4245 4246 void 4247 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4248 { 4249 stat->max_read_latency_ticks = 0; 4250 stat->min_read_latency_ticks = UINT64_MAX; 4251 stat->max_write_latency_ticks = 0; 4252 stat->min_write_latency_ticks = UINT64_MAX; 4253 stat->max_unmap_latency_ticks = 0; 4254 stat->min_unmap_latency_ticks = UINT64_MAX; 4255 stat->max_copy_latency_ticks = 0; 4256 stat->min_copy_latency_ticks = UINT64_MAX; 4257 4258 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4259 return; 4260 } 
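/* Only SPDK_BDEV_RESET_STAT_ALL reaches this point: the max/min latency fields above are
 * reset for every mode, while the cumulative byte/op/latency counters below are cleared
 * only on a full reset. */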
4261 4262 stat->bytes_read = 0; 4263 stat->num_read_ops = 0; 4264 stat->bytes_written = 0; 4265 stat->num_write_ops = 0; 4266 stat->bytes_unmapped = 0; 4267 stat->num_unmap_ops = 0; 4268 stat->bytes_copied = 0; 4269 stat->num_copy_ops = 0; 4270 stat->read_latency_ticks = 0; 4271 stat->write_latency_ticks = 0; 4272 stat->unmap_latency_ticks = 0; 4273 stat->copy_latency_ticks = 0; 4274 4275 if (stat->io_error != NULL) { 4276 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4277 } 4278 } 4279 4280 struct spdk_bdev_io_stat * 4281 bdev_alloc_io_stat(bool io_error_stat) 4282 { 4283 struct spdk_bdev_io_stat *stat; 4284 4285 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4286 if (stat == NULL) { 4287 return NULL; 4288 } 4289 4290 if (io_error_stat) { 4291 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4292 if (stat->io_error == NULL) { 4293 free(stat); 4294 return NULL; 4295 } 4296 } else { 4297 stat->io_error = NULL; 4298 } 4299 4300 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4301 4302 return stat; 4303 } 4304 4305 void 4306 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4307 { 4308 if (stat != NULL) { 4309 free(stat->io_error); 4310 free(stat); 4311 } 4312 } 4313 4314 void 4315 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4316 { 4317 int i; 4318 4319 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4320 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4321 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4322 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4323 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4324 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4325 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4326 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4327 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4328 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4329 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4330 stat->min_read_latency_ticks != UINT64_MAX ? 4331 stat->min_read_latency_ticks : 0); 4332 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4333 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4334 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4335 stat->min_write_latency_ticks != UINT64_MAX ? 4336 stat->min_write_latency_ticks : 0); 4337 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4338 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4339 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4340 stat->min_unmap_latency_ticks != UINT64_MAX ? 4341 stat->min_unmap_latency_ticks : 0); 4342 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4343 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4344 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4345 stat->min_copy_latency_ticks != UINT64_MAX ? 
4346 stat->min_copy_latency_ticks : 0); 4347 4348 if (stat->io_error != NULL) { 4349 spdk_json_write_named_object_begin(w, "io_error"); 4350 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4351 if (stat->io_error->error_status[i] != 0) { 4352 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4353 stat->io_error->error_status[i]); 4354 } 4355 } 4356 spdk_json_write_object_end(w); 4357 } 4358 } 4359 4360 static void 4361 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4362 { 4363 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4364 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4365 4366 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4367 bdev_abort_all_buf_io(mgmt_ch, ch); 4368 } 4369 4370 static void 4371 bdev_channel_destroy(void *io_device, void *ctx_buf) 4372 { 4373 struct spdk_bdev_channel *ch = ctx_buf; 4374 4375 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4376 spdk_get_thread()); 4377 4378 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 4379 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4380 4381 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4382 spdk_spin_lock(&ch->bdev->internal.spinlock); 4383 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4384 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4385 4386 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4387 4388 bdev_channel_abort_queued_ios(ch); 4389 4390 if (ch->histogram) { 4391 spdk_histogram_data_free(ch->histogram); 4392 } 4393 4394 bdev_channel_destroy_resource(ch); 4395 } 4396 4397 /* 4398 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4399 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
4400 */ 4401 static int 4402 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4403 { 4404 struct spdk_bdev_name *tmp; 4405 4406 bdev_name->name = strdup(name); 4407 if (bdev_name->name == NULL) { 4408 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4409 return -ENOMEM; 4410 } 4411 4412 bdev_name->bdev = bdev; 4413 4414 spdk_spin_lock(&g_bdev_mgr.spinlock); 4415 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4416 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4417 4418 if (tmp != NULL) { 4419 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4420 free(bdev_name->name); 4421 return -EEXIST; 4422 } 4423 4424 return 0; 4425 } 4426 4427 static void 4428 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4429 { 4430 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4431 free(bdev_name->name); 4432 } 4433 4434 static void 4435 bdev_name_del(struct spdk_bdev_name *bdev_name) 4436 { 4437 spdk_spin_lock(&g_bdev_mgr.spinlock); 4438 bdev_name_del_unsafe(bdev_name); 4439 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4440 } 4441 4442 int 4443 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4444 { 4445 struct spdk_bdev_alias *tmp; 4446 int ret; 4447 4448 if (alias == NULL) { 4449 SPDK_ERRLOG("Empty alias passed\n"); 4450 return -EINVAL; 4451 } 4452 4453 tmp = calloc(1, sizeof(*tmp)); 4454 if (tmp == NULL) { 4455 SPDK_ERRLOG("Unable to allocate alias\n"); 4456 return -ENOMEM; 4457 } 4458 4459 ret = bdev_name_add(&tmp->alias, bdev, alias); 4460 if (ret != 0) { 4461 free(tmp); 4462 return ret; 4463 } 4464 4465 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4466 4467 return 0; 4468 } 4469 4470 static int 4471 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4472 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4473 { 4474 struct spdk_bdev_alias *tmp; 4475 4476 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4477 if (strcmp(alias, tmp->alias.name) == 0) { 4478 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4479 alias_del_fn(&tmp->alias); 4480 free(tmp); 4481 return 0; 4482 } 4483 } 4484 4485 return -ENOENT; 4486 } 4487 4488 int 4489 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4490 { 4491 int rc; 4492 4493 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4494 if (rc == -ENOENT) { 4495 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4496 } 4497 4498 return rc; 4499 } 4500 4501 void 4502 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4503 { 4504 struct spdk_bdev_alias *p, *tmp; 4505 4506 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4507 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4508 bdev_name_del(&p->alias); 4509 free(p); 4510 } 4511 } 4512 4513 struct spdk_io_channel * 4514 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4515 { 4516 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4517 } 4518 4519 void * 4520 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4521 { 4522 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4523 void *ctx = NULL; 4524 4525 if (bdev->fn_table->get_module_ctx) { 4526 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4527 } 4528 4529 return ctx; 4530 } 4531 4532 const char * 4533 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4534 { 4535 return bdev->module->name; 4536 } 4537 4538 const char * 4539 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4540 { 4541 return bdev->name; 4542 } 4543 4544 const char * 4545 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4546 { 4547 return bdev->product_name; 4548 } 4549 4550 
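/* Illustrative sketch, not part of the original source: callers typically combine the simple
 * property getters in this area when sizing and aligning I/O buffers. Assuming the caller
 * already holds an open descriptor named "desc", it might do something like:
 *
 *     struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
 *
 *     printf("bdev %s: %" PRIu64 " blocks of %u bytes, buffer alignment %zu\n",
 *            spdk_bdev_get_name(bdev), spdk_bdev_get_num_blocks(bdev),
 *            spdk_bdev_get_block_size(bdev), spdk_bdev_get_buf_align(bdev));
 */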
const struct spdk_bdev_aliases_list * 4551 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4552 { 4553 return &bdev->aliases; 4554 } 4555 4556 uint32_t 4557 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4558 { 4559 return bdev->blocklen; 4560 } 4561 4562 uint32_t 4563 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4564 { 4565 return bdev->write_unit_size; 4566 } 4567 4568 uint64_t 4569 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4570 { 4571 return bdev->blockcnt; 4572 } 4573 4574 const char * 4575 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4576 { 4577 return qos_rpc_type[type]; 4578 } 4579 4580 void 4581 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4582 { 4583 int i; 4584 4585 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4586 4587 spdk_spin_lock(&bdev->internal.spinlock); 4588 if (bdev->internal.qos) { 4589 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4590 if (bdev->internal.qos->rate_limits[i].limit != 4591 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4592 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4593 if (bdev_qos_is_iops_rate_limit(i) == false) { 4594 /* Change from Byte to Megabyte which is user visible. */ 4595 limits[i] = limits[i] / 1024 / 1024; 4596 } 4597 } 4598 } 4599 } 4600 spdk_spin_unlock(&bdev->internal.spinlock); 4601 } 4602 4603 size_t 4604 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4605 { 4606 return 1 << bdev->required_alignment; 4607 } 4608 4609 uint32_t 4610 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4611 { 4612 return bdev->optimal_io_boundary; 4613 } 4614 4615 bool 4616 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4617 { 4618 return bdev->write_cache; 4619 } 4620 4621 const struct spdk_uuid * 4622 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4623 { 4624 return &bdev->uuid; 4625 } 4626 4627 uint16_t 4628 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4629 { 4630 return bdev->acwu; 4631 } 4632 4633 uint32_t 4634 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4635 { 4636 return bdev->md_len; 4637 } 4638 4639 bool 4640 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4641 { 4642 return (bdev->md_len != 0) && bdev->md_interleave; 4643 } 4644 4645 bool 4646 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4647 { 4648 return (bdev->md_len != 0) && !bdev->md_interleave; 4649 } 4650 4651 bool 4652 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4653 { 4654 return bdev->zoned; 4655 } 4656 4657 uint32_t 4658 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4659 { 4660 if (spdk_bdev_is_md_interleaved(bdev)) { 4661 return bdev->blocklen - bdev->md_len; 4662 } else { 4663 return bdev->blocklen; 4664 } 4665 } 4666 4667 uint32_t 4668 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4669 { 4670 return bdev->phys_blocklen; 4671 } 4672 4673 static uint32_t 4674 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4675 { 4676 if (!spdk_bdev_is_md_interleaved(bdev)) { 4677 return bdev->blocklen + bdev->md_len; 4678 } else { 4679 return bdev->blocklen; 4680 } 4681 } 4682 4683 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4684 typedef enum spdk_dif_type spdk_dif_type_t; 4685 4686 spdk_dif_type_t 4687 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4688 { 4689 if (bdev->md_len != 0) { 4690 return bdev->dif_type; 4691 } else { 4692 return SPDK_DIF_DISABLE; 4693 } 4694 } 4695 4696 bool 4697 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4698 { 4699 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4700 return bdev->dif_is_head_of_md; 4701 } else { 4702 return false; 4703 } 4704 } 4705 4706 bool 4707 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4708 enum spdk_dif_check_type check_type) 4709 { 4710 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4711 return false; 4712 } 4713 4714 switch (check_type) { 4715 case SPDK_DIF_CHECK_TYPE_REFTAG: 4716 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4717 case SPDK_DIF_CHECK_TYPE_APPTAG: 4718 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4719 case SPDK_DIF_CHECK_TYPE_GUARD: 4720 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4721 default: 4722 return false; 4723 } 4724 } 4725 4726 static uint32_t 4727 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4728 { 4729 uint64_t aligned_length, max_write_blocks; 4730 4731 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4732 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4733 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4734 4735 return max_write_blocks; 4736 } 4737 4738 uint32_t 4739 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4740 { 4741 return bdev->max_copy; 4742 } 4743 4744 uint64_t 4745 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4746 { 4747 return bdev->internal.measured_queue_depth; 4748 } 4749 4750 uint64_t 4751 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4752 { 4753 return bdev->internal.period; 4754 } 4755 4756 uint64_t 4757 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4758 { 4759 return bdev->internal.weighted_io_time; 4760 } 4761 4762 uint64_t 4763 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4764 { 4765 return bdev->internal.io_time; 4766 } 4767 4768 static void bdev_update_qd_sampling_period(void *ctx); 4769 4770 static void 4771 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4772 { 4773 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4774 4775 if (bdev->internal.measured_queue_depth) { 4776 bdev->internal.io_time += bdev->internal.period; 4777 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4778 } 4779 4780 bdev->internal.qd_poll_in_progress = false; 4781 4782 bdev_update_qd_sampling_period(bdev); 4783 } 4784 4785 static void 4786 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4787 struct spdk_io_channel *io_ch, void *_ctx) 4788 { 4789 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4790 4791 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4792 spdk_bdev_for_each_channel_continue(i, 0); 4793 } 4794 4795 static int 4796 bdev_calculate_measured_queue_depth(void *ctx) 4797 { 4798 struct spdk_bdev *bdev = ctx; 4799 4800 bdev->internal.qd_poll_in_progress = true; 4801 bdev->internal.temporary_queue_depth = 0; 4802 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4803 return SPDK_POLLER_BUSY; 4804 } 4805 4806 static void 4807 bdev_update_qd_sampling_period(void *ctx) 4808 { 4809 
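/* Invoked on the thread of the internal QD sampling descriptor: applies a pending change to
 * the queue-depth sampling period once no measurement is in flight, re-registering or tearing
 * down the poller as needed. */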
struct spdk_bdev *bdev = ctx; 4810 4811 if (bdev->internal.period == bdev->internal.new_period) { 4812 return; 4813 } 4814 4815 if (bdev->internal.qd_poll_in_progress) { 4816 return; 4817 } 4818 4819 bdev->internal.period = bdev->internal.new_period; 4820 4821 spdk_poller_unregister(&bdev->internal.qd_poller); 4822 if (bdev->internal.period != 0) { 4823 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4824 bdev, bdev->internal.period); 4825 } else { 4826 spdk_bdev_close(bdev->internal.qd_desc); 4827 bdev->internal.qd_desc = NULL; 4828 } 4829 } 4830 4831 static void 4832 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4833 { 4834 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4835 } 4836 4837 void 4838 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4839 { 4840 int rc; 4841 4842 if (bdev->internal.new_period == period) { 4843 return; 4844 } 4845 4846 bdev->internal.new_period = period; 4847 4848 if (bdev->internal.qd_desc != NULL) { 4849 assert(bdev->internal.period != 0); 4850 4851 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4852 bdev_update_qd_sampling_period, bdev); 4853 return; 4854 } 4855 4856 assert(bdev->internal.period == 0); 4857 4858 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4859 NULL, &bdev->internal.qd_desc); 4860 if (rc != 0) { 4861 return; 4862 } 4863 4864 bdev->internal.period = period; 4865 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4866 bdev, period); 4867 } 4868 4869 struct bdev_get_current_qd_ctx { 4870 uint64_t current_qd; 4871 spdk_bdev_get_current_qd_cb cb_fn; 4872 void *cb_arg; 4873 }; 4874 4875 static void 4876 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4877 { 4878 struct bdev_get_current_qd_ctx *ctx = _ctx; 4879 4880 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4881 4882 free(ctx); 4883 } 4884 4885 static void 4886 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4887 struct spdk_io_channel *io_ch, void *_ctx) 4888 { 4889 struct bdev_get_current_qd_ctx *ctx = _ctx; 4890 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4891 4892 ctx->current_qd += bdev_ch->io_outstanding; 4893 4894 spdk_bdev_for_each_channel_continue(i, 0); 4895 } 4896 4897 void 4898 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4899 void *cb_arg) 4900 { 4901 struct bdev_get_current_qd_ctx *ctx; 4902 4903 assert(cb_fn != NULL); 4904 4905 ctx = calloc(1, sizeof(*ctx)); 4906 if (ctx == NULL) { 4907 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4908 return; 4909 } 4910 4911 ctx->cb_fn = cb_fn; 4912 ctx->cb_arg = cb_arg; 4913 4914 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4915 } 4916 4917 static void 4918 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4919 { 4920 assert(desc->thread == spdk_get_thread()); 4921 4922 spdk_spin_lock(&desc->spinlock); 4923 desc->refs--; 4924 if (!desc->closed) { 4925 spdk_spin_unlock(&desc->spinlock); 4926 desc->callback.event_fn(type, 4927 desc->bdev, 4928 desc->callback.ctx); 4929 return; 4930 } else if (desc->refs == 0) { 4931 /* This descriptor was closed after this event_notify message was sent. 4932 * spdk_bdev_close() could not free the descriptor since this message was 4933 * in flight, so we free it now using bdev_desc_free(). 
4934 */ 4935 spdk_spin_unlock(&desc->spinlock); 4936 bdev_desc_free(desc); 4937 return; 4938 } 4939 spdk_spin_unlock(&desc->spinlock); 4940 } 4941 4942 static void 4943 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 4944 { 4945 spdk_spin_lock(&desc->spinlock); 4946 desc->refs++; 4947 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 4948 spdk_spin_unlock(&desc->spinlock); 4949 } 4950 4951 static void 4952 _resize_notify(void *ctx) 4953 { 4954 struct spdk_bdev_desc *desc = ctx; 4955 4956 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 4957 } 4958 4959 int 4960 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4961 { 4962 struct spdk_bdev_desc *desc; 4963 int ret; 4964 4965 if (size == bdev->blockcnt) { 4966 return 0; 4967 } 4968 4969 spdk_spin_lock(&bdev->internal.spinlock); 4970 4971 /* bdev has open descriptors */ 4972 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4973 bdev->blockcnt > size) { 4974 ret = -EBUSY; 4975 } else { 4976 bdev->blockcnt = size; 4977 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4978 event_notify(desc, _resize_notify); 4979 } 4980 ret = 0; 4981 } 4982 4983 spdk_spin_unlock(&bdev->internal.spinlock); 4984 4985 return ret; 4986 } 4987 4988 /* 4989 * Convert I/O offset and length from bytes to blocks. 4990 * 4991 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4992 */ 4993 static uint64_t 4994 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4995 uint64_t num_bytes, uint64_t *num_blocks) 4996 { 4997 uint32_t block_size = bdev->blocklen; 4998 uint8_t shift_cnt; 4999 5000 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 5001 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5002 shift_cnt = spdk_u32log2(block_size); 5003 *offset_blocks = offset_bytes >> shift_cnt; 5004 *num_blocks = num_bytes >> shift_cnt; 5005 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5006 (num_bytes - (*num_blocks << shift_cnt)); 5007 } else { 5008 *offset_blocks = offset_bytes / block_size; 5009 *num_blocks = num_bytes / block_size; 5010 return (offset_bytes % block_size) | (num_bytes % block_size); 5011 } 5012 } 5013 5014 static bool 5015 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5016 { 5017 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5018 * has been an overflow and hence the offset has been wrapped around */ 5019 if (offset_blocks + num_blocks < offset_blocks) { 5020 return false; 5021 } 5022 5023 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5024 if (offset_blocks + num_blocks > bdev->blockcnt) { 5025 return false; 5026 } 5027 5028 return true; 5029 } 5030 5031 static void 5032 bdev_seek_complete_cb(void *ctx) 5033 { 5034 struct spdk_bdev_io *bdev_io = ctx; 5035 5036 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5037 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5038 } 5039 5040 static int 5041 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5042 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5043 spdk_bdev_io_completion_cb cb, void *cb_arg) 5044 { 5045 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5046 struct spdk_bdev_io *bdev_io; 5047 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5048 5049 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == 
SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5050 5051 /* Check if offset_blocks is valid looking at the validity of one block */ 5052 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5053 return -EINVAL; 5054 } 5055 5056 bdev_io = bdev_channel_get_io(channel); 5057 if (!bdev_io) { 5058 return -ENOMEM; 5059 } 5060 5061 bdev_io->internal.ch = channel; 5062 bdev_io->internal.desc = desc; 5063 bdev_io->type = io_type; 5064 bdev_io->u.bdev.offset_blocks = offset_blocks; 5065 bdev_io->u.bdev.memory_domain = NULL; 5066 bdev_io->u.bdev.memory_domain_ctx = NULL; 5067 bdev_io->u.bdev.accel_sequence = NULL; 5068 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5069 5070 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5071 /* In case bdev doesn't support seek to next data/hole offset, 5072 * it is assumed that only data and no holes are present */ 5073 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5074 bdev_io->u.bdev.seek.offset = offset_blocks; 5075 } else { 5076 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5077 } 5078 5079 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5080 return 0; 5081 } 5082 5083 bdev_io_submit(bdev_io); 5084 return 0; 5085 } 5086 5087 int 5088 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5089 uint64_t offset_blocks, 5090 spdk_bdev_io_completion_cb cb, void *cb_arg) 5091 { 5092 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5093 } 5094 5095 int 5096 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5097 uint64_t offset_blocks, 5098 spdk_bdev_io_completion_cb cb, void *cb_arg) 5099 { 5100 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5101 } 5102 5103 uint64_t 5104 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 5105 { 5106 return bdev_io->u.bdev.seek.offset; 5107 } 5108 5109 static int 5110 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5111 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5112 spdk_bdev_io_completion_cb cb, void *cb_arg) 5113 { 5114 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5115 struct spdk_bdev_io *bdev_io; 5116 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5117 5118 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5119 return -EINVAL; 5120 } 5121 5122 bdev_io = bdev_channel_get_io(channel); 5123 if (!bdev_io) { 5124 return -ENOMEM; 5125 } 5126 5127 bdev_io->internal.ch = channel; 5128 bdev_io->internal.desc = desc; 5129 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5130 bdev_io->u.bdev.iovs = &bdev_io->iov; 5131 bdev_io->u.bdev.iovs[0].iov_base = buf; 5132 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5133 bdev_io->u.bdev.iovcnt = 1; 5134 bdev_io->u.bdev.md_buf = md_buf; 5135 bdev_io->u.bdev.num_blocks = num_blocks; 5136 bdev_io->u.bdev.offset_blocks = offset_blocks; 5137 bdev_io->u.bdev.memory_domain = NULL; 5138 bdev_io->u.bdev.memory_domain_ctx = NULL; 5139 bdev_io->u.bdev.accel_sequence = NULL; 5140 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5141 5142 bdev_io_submit(bdev_io); 5143 return 0; 5144 } 5145 5146 int 5147 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5148 void *buf, uint64_t offset, uint64_t nbytes, 5149 spdk_bdev_io_completion_cb cb, void *cb_arg) 5150 { 5151 uint64_t offset_blocks, num_blocks; 5152 5153 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5154 nbytes, &num_blocks) != 0) { 5155 return -EINVAL; 5156 } 5157 5158 
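/* Byte offset and length were validated as block-aligned above; hand off to the
 * block-based variant. */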
return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5159 } 5160 5161 int 5162 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5163 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5164 spdk_bdev_io_completion_cb cb, void *cb_arg) 5165 { 5166 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5167 } 5168 5169 int 5170 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5171 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5172 spdk_bdev_io_completion_cb cb, void *cb_arg) 5173 { 5174 struct iovec iov = { 5175 .iov_base = buf, 5176 }; 5177 5178 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5179 return -EINVAL; 5180 } 5181 5182 if (md_buf && !_is_buf_allocated(&iov)) { 5183 return -EINVAL; 5184 } 5185 5186 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5187 cb, cb_arg); 5188 } 5189 5190 int 5191 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5192 struct iovec *iov, int iovcnt, 5193 uint64_t offset, uint64_t nbytes, 5194 spdk_bdev_io_completion_cb cb, void *cb_arg) 5195 { 5196 uint64_t offset_blocks, num_blocks; 5197 5198 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5199 nbytes, &num_blocks) != 0) { 5200 return -EINVAL; 5201 } 5202 5203 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5204 } 5205 5206 static int 5207 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5208 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5209 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5210 struct spdk_accel_sequence *seq, 5211 spdk_bdev_io_completion_cb cb, void *cb_arg) 5212 { 5213 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5214 struct spdk_bdev_io *bdev_io; 5215 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5216 5217 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5218 return -EINVAL; 5219 } 5220 5221 bdev_io = bdev_channel_get_io(channel); 5222 if (!bdev_io) { 5223 return -ENOMEM; 5224 } 5225 5226 bdev_io->internal.ch = channel; 5227 bdev_io->internal.desc = desc; 5228 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5229 bdev_io->u.bdev.iovs = iov; 5230 bdev_io->u.bdev.iovcnt = iovcnt; 5231 bdev_io->u.bdev.md_buf = md_buf; 5232 bdev_io->u.bdev.num_blocks = num_blocks; 5233 bdev_io->u.bdev.offset_blocks = offset_blocks; 5234 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5235 bdev_io->internal.memory_domain = domain; 5236 bdev_io->internal.memory_domain_ctx = domain_ctx; 5237 bdev_io->internal.accel_sequence = seq; 5238 bdev_io->internal.has_accel_sequence = seq != NULL; 5239 bdev_io->u.bdev.memory_domain = domain; 5240 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5241 bdev_io->u.bdev.accel_sequence = seq; 5242 5243 _bdev_io_submit_ext(desc, bdev_io); 5244 5245 return 0; 5246 } 5247 5248 int 5249 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5250 struct iovec *iov, int iovcnt, 5251 uint64_t offset_blocks, uint64_t num_blocks, 5252 spdk_bdev_io_completion_cb cb, void *cb_arg) 5253 { 5254 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5255 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5256 } 5257 5258 int 5259 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5260 struct 
iovec *iov, int iovcnt, void *md_buf, 5261 uint64_t offset_blocks, uint64_t num_blocks, 5262 spdk_bdev_io_completion_cb cb, void *cb_arg) 5263 { 5264 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5265 return -EINVAL; 5266 } 5267 5268 if (md_buf && !_is_buf_allocated(iov)) { 5269 return -EINVAL; 5270 } 5271 5272 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5273 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5274 } 5275 5276 static inline bool 5277 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5278 { 5279 /* 5280 * We check if opts size is at least of size when we first introduced 5281 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5282 * are not checked internal. 5283 */ 5284 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5285 sizeof(opts->metadata) && 5286 opts->size <= sizeof(*opts) && 5287 /* When memory domain is used, the user must provide data buffers */ 5288 (!opts->memory_domain || (iov && iov[0].iov_base)); 5289 } 5290 5291 int 5292 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5293 struct iovec *iov, int iovcnt, 5294 uint64_t offset_blocks, uint64_t num_blocks, 5295 spdk_bdev_io_completion_cb cb, void *cb_arg, 5296 struct spdk_bdev_ext_io_opts *opts) 5297 { 5298 void *md = NULL; 5299 5300 if (opts) { 5301 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5302 return -EINVAL; 5303 } 5304 md = opts->metadata; 5305 } 5306 5307 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5308 return -EINVAL; 5309 } 5310 5311 if (md && !_is_buf_allocated(iov)) { 5312 return -EINVAL; 5313 } 5314 5315 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5316 num_blocks, 5317 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5318 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5319 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5320 cb, cb_arg); 5321 } 5322 5323 static int 5324 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5325 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5326 spdk_bdev_io_completion_cb cb, void *cb_arg) 5327 { 5328 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5329 struct spdk_bdev_io *bdev_io; 5330 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5331 5332 if (!desc->write) { 5333 return -EBADF; 5334 } 5335 5336 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5337 return -EINVAL; 5338 } 5339 5340 bdev_io = bdev_channel_get_io(channel); 5341 if (!bdev_io) { 5342 return -ENOMEM; 5343 } 5344 5345 bdev_io->internal.ch = channel; 5346 bdev_io->internal.desc = desc; 5347 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5348 bdev_io->u.bdev.iovs = &bdev_io->iov; 5349 bdev_io->u.bdev.iovs[0].iov_base = buf; 5350 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5351 bdev_io->u.bdev.iovcnt = 1; 5352 bdev_io->u.bdev.md_buf = md_buf; 5353 bdev_io->u.bdev.num_blocks = num_blocks; 5354 bdev_io->u.bdev.offset_blocks = offset_blocks; 5355 bdev_io->u.bdev.memory_domain = NULL; 5356 bdev_io->u.bdev.memory_domain_ctx = NULL; 5357 bdev_io->u.bdev.accel_sequence = NULL; 5358 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5359 5360 bdev_io_submit(bdev_io); 5361 return 0; 5362 } 5363 5364 int 5365 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5366 void *buf, uint64_t offset, uint64_t nbytes, 5367 spdk_bdev_io_completion_cb cb, void *cb_arg) 5368 { 5369 uint64_t 
offset_blocks, num_blocks; 5370 5371 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5372 nbytes, &num_blocks) != 0) { 5373 return -EINVAL; 5374 } 5375 5376 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5377 } 5378 5379 int 5380 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5381 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5382 spdk_bdev_io_completion_cb cb, void *cb_arg) 5383 { 5384 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5385 cb, cb_arg); 5386 } 5387 5388 int 5389 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5390 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5391 spdk_bdev_io_completion_cb cb, void *cb_arg) 5392 { 5393 struct iovec iov = { 5394 .iov_base = buf, 5395 }; 5396 5397 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5398 return -EINVAL; 5399 } 5400 5401 if (md_buf && !_is_buf_allocated(&iov)) { 5402 return -EINVAL; 5403 } 5404 5405 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5406 cb, cb_arg); 5407 } 5408 5409 static int 5410 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5411 struct iovec *iov, int iovcnt, void *md_buf, 5412 uint64_t offset_blocks, uint64_t num_blocks, 5413 struct spdk_memory_domain *domain, void *domain_ctx, 5414 struct spdk_accel_sequence *seq, 5415 spdk_bdev_io_completion_cb cb, void *cb_arg) 5416 { 5417 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5418 struct spdk_bdev_io *bdev_io; 5419 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5420 5421 if (!desc->write) { 5422 return -EBADF; 5423 } 5424 5425 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5426 return -EINVAL; 5427 } 5428 5429 bdev_io = bdev_channel_get_io(channel); 5430 if (!bdev_io) { 5431 return -ENOMEM; 5432 } 5433 5434 bdev_io->internal.ch = channel; 5435 bdev_io->internal.desc = desc; 5436 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5437 bdev_io->u.bdev.iovs = iov; 5438 bdev_io->u.bdev.iovcnt = iovcnt; 5439 bdev_io->u.bdev.md_buf = md_buf; 5440 bdev_io->u.bdev.num_blocks = num_blocks; 5441 bdev_io->u.bdev.offset_blocks = offset_blocks; 5442 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5443 bdev_io->internal.memory_domain = domain; 5444 bdev_io->internal.memory_domain_ctx = domain_ctx; 5445 bdev_io->internal.accel_sequence = seq; 5446 bdev_io->internal.has_accel_sequence = seq != NULL; 5447 bdev_io->u.bdev.memory_domain = domain; 5448 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5449 bdev_io->u.bdev.accel_sequence = seq; 5450 5451 _bdev_io_submit_ext(desc, bdev_io); 5452 5453 return 0; 5454 } 5455 5456 int 5457 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5458 struct iovec *iov, int iovcnt, 5459 uint64_t offset, uint64_t len, 5460 spdk_bdev_io_completion_cb cb, void *cb_arg) 5461 { 5462 uint64_t offset_blocks, num_blocks; 5463 5464 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5465 len, &num_blocks) != 0) { 5466 return -EINVAL; 5467 } 5468 5469 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5470 } 5471 5472 int 5473 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5474 struct iovec *iov, int iovcnt, 5475 uint64_t offset_blocks, uint64_t num_blocks, 5476 spdk_bdev_io_completion_cb cb, void *cb_arg) 5477 { 
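/* Plain vectored write: no separate metadata buffer, memory domain, or accel sequence. */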
5478 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5479 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5480 } 5481 5482 int 5483 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5484 struct iovec *iov, int iovcnt, void *md_buf, 5485 uint64_t offset_blocks, uint64_t num_blocks, 5486 spdk_bdev_io_completion_cb cb, void *cb_arg) 5487 { 5488 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5489 return -EINVAL; 5490 } 5491 5492 if (md_buf && !_is_buf_allocated(iov)) { 5493 return -EINVAL; 5494 } 5495 5496 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5497 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5498 } 5499 5500 int 5501 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5502 struct iovec *iov, int iovcnt, 5503 uint64_t offset_blocks, uint64_t num_blocks, 5504 spdk_bdev_io_completion_cb cb, void *cb_arg, 5505 struct spdk_bdev_ext_io_opts *opts) 5506 { 5507 void *md = NULL; 5508 5509 if (opts) { 5510 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5511 return -EINVAL; 5512 } 5513 md = opts->metadata; 5514 } 5515 5516 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5517 return -EINVAL; 5518 } 5519 5520 if (md && !_is_buf_allocated(iov)) { 5521 return -EINVAL; 5522 } 5523 5524 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5525 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5526 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5527 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5528 cb, cb_arg); 5529 } 5530 5531 static void 5532 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5533 { 5534 struct spdk_bdev_io *parent_io = cb_arg; 5535 struct spdk_bdev *bdev = parent_io->bdev; 5536 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5537 int i, rc = 0; 5538 5539 if (!success) { 5540 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5541 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5542 spdk_bdev_free_io(bdev_io); 5543 return; 5544 } 5545 5546 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5547 rc = memcmp(read_buf, 5548 parent_io->u.bdev.iovs[i].iov_base, 5549 parent_io->u.bdev.iovs[i].iov_len); 5550 if (rc) { 5551 break; 5552 } 5553 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5554 } 5555 5556 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5557 rc = memcmp(bdev_io->u.bdev.md_buf, 5558 parent_io->u.bdev.md_buf, 5559 spdk_bdev_get_md_size(bdev)); 5560 } 5561 5562 spdk_bdev_free_io(bdev_io); 5563 5564 if (rc == 0) { 5565 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5566 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5567 } else { 5568 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5569 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5570 } 5571 } 5572 5573 static void 5574 bdev_compare_do_read(void *_bdev_io) 5575 { 5576 struct spdk_bdev_io *bdev_io = _bdev_io; 5577 int rc; 5578 5579 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5580 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5581 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5582 bdev_compare_do_read_done, bdev_io); 5583 5584 if (rc == -ENOMEM) { 5585 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5586 } else if (rc != 0) { 5587 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5588 
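/* The read used to emulate the compare could not even be submitted; complete the
 * compare request as failed. */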
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5589 } 5590 } 5591 5592 static int 5593 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5594 struct iovec *iov, int iovcnt, void *md_buf, 5595 uint64_t offset_blocks, uint64_t num_blocks, 5596 spdk_bdev_io_completion_cb cb, void *cb_arg) 5597 { 5598 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5599 struct spdk_bdev_io *bdev_io; 5600 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5601 5602 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5603 return -EINVAL; 5604 } 5605 5606 bdev_io = bdev_channel_get_io(channel); 5607 if (!bdev_io) { 5608 return -ENOMEM; 5609 } 5610 5611 bdev_io->internal.ch = channel; 5612 bdev_io->internal.desc = desc; 5613 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5614 bdev_io->u.bdev.iovs = iov; 5615 bdev_io->u.bdev.iovcnt = iovcnt; 5616 bdev_io->u.bdev.md_buf = md_buf; 5617 bdev_io->u.bdev.num_blocks = num_blocks; 5618 bdev_io->u.bdev.offset_blocks = offset_blocks; 5619 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5620 bdev_io->u.bdev.memory_domain = NULL; 5621 bdev_io->u.bdev.memory_domain_ctx = NULL; 5622 bdev_io->u.bdev.accel_sequence = NULL; 5623 5624 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5625 bdev_io_submit(bdev_io); 5626 return 0; 5627 } 5628 5629 bdev_compare_do_read(bdev_io); 5630 5631 return 0; 5632 } 5633 5634 int 5635 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5636 struct iovec *iov, int iovcnt, 5637 uint64_t offset_blocks, uint64_t num_blocks, 5638 spdk_bdev_io_completion_cb cb, void *cb_arg) 5639 { 5640 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5641 num_blocks, cb, cb_arg); 5642 } 5643 5644 int 5645 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5646 struct iovec *iov, int iovcnt, void *md_buf, 5647 uint64_t offset_blocks, uint64_t num_blocks, 5648 spdk_bdev_io_completion_cb cb, void *cb_arg) 5649 { 5650 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5651 return -EINVAL; 5652 } 5653 5654 if (md_buf && !_is_buf_allocated(iov)) { 5655 return -EINVAL; 5656 } 5657 5658 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5659 num_blocks, cb, cb_arg); 5660 } 5661 5662 static int 5663 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5664 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5665 spdk_bdev_io_completion_cb cb, void *cb_arg) 5666 { 5667 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5668 struct spdk_bdev_io *bdev_io; 5669 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5670 5671 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5672 return -EINVAL; 5673 } 5674 5675 bdev_io = bdev_channel_get_io(channel); 5676 if (!bdev_io) { 5677 return -ENOMEM; 5678 } 5679 5680 bdev_io->internal.ch = channel; 5681 bdev_io->internal.desc = desc; 5682 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5683 bdev_io->u.bdev.iovs = &bdev_io->iov; 5684 bdev_io->u.bdev.iovs[0].iov_base = buf; 5685 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5686 bdev_io->u.bdev.iovcnt = 1; 5687 bdev_io->u.bdev.md_buf = md_buf; 5688 bdev_io->u.bdev.num_blocks = num_blocks; 5689 bdev_io->u.bdev.offset_blocks = offset_blocks; 5690 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5691 bdev_io->u.bdev.memory_domain = NULL; 5692 bdev_io->u.bdev.memory_domain_ctx = 
NULL; 5693 bdev_io->u.bdev.accel_sequence = NULL; 5694 5695 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5696 bdev_io_submit(bdev_io); 5697 return 0; 5698 } 5699 5700 bdev_compare_do_read(bdev_io); 5701 5702 return 0; 5703 } 5704 5705 int 5706 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5707 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5708 spdk_bdev_io_completion_cb cb, void *cb_arg) 5709 { 5710 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5711 cb, cb_arg); 5712 } 5713 5714 int 5715 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5716 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5717 spdk_bdev_io_completion_cb cb, void *cb_arg) 5718 { 5719 struct iovec iov = { 5720 .iov_base = buf, 5721 }; 5722 5723 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5724 return -EINVAL; 5725 } 5726 5727 if (md_buf && !_is_buf_allocated(&iov)) { 5728 return -EINVAL; 5729 } 5730 5731 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5732 cb, cb_arg); 5733 } 5734 5735 static void 5736 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 5737 { 5738 struct spdk_bdev_io *bdev_io = ctx; 5739 5740 if (unlock_status) { 5741 SPDK_ERRLOG("LBA range unlock failed\n"); 5742 } 5743 5744 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5745 false, bdev_io->internal.caller_ctx); 5746 } 5747 5748 static void 5749 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5750 { 5751 bdev_io->internal.status = status; 5752 5753 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5754 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5755 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5756 } 5757 5758 static void 5759 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5760 { 5761 struct spdk_bdev_io *parent_io = cb_arg; 5762 5763 if (!success) { 5764 SPDK_ERRLOG("Compare and write operation failed\n"); 5765 } 5766 5767 spdk_bdev_free_io(bdev_io); 5768 5769 bdev_comparev_and_writev_blocks_unlock(parent_io, 5770 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5771 } 5772 5773 static void 5774 bdev_compare_and_write_do_write(void *_bdev_io) 5775 { 5776 struct spdk_bdev_io *bdev_io = _bdev_io; 5777 int rc; 5778 5779 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5780 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5781 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5782 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5783 bdev_compare_and_write_do_write_done, bdev_io); 5784 5785 5786 if (rc == -ENOMEM) { 5787 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5788 } else if (rc != 0) { 5789 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5790 } 5791 } 5792 5793 static void 5794 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5795 { 5796 struct spdk_bdev_io *parent_io = cb_arg; 5797 5798 spdk_bdev_free_io(bdev_io); 5799 5800 if (!success) { 5801 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5802 return; 5803 } 5804 5805 bdev_compare_and_write_do_write(parent_io); 5806 } 5807 5808 static void 5809 bdev_compare_and_write_do_compare(void *_bdev_io) 5810 { 5811 struct spdk_bdev_io *bdev_io = _bdev_io; 5812 int rc; 5813 5814 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5815 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5816 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5817 bdev_compare_and_write_do_compare_done, bdev_io); 5818 5819 if (rc == -ENOMEM) { 5820 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5821 } else if (rc != 0) { 5822 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5823 } 5824 } 5825 5826 static void 5827 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 5828 { 5829 struct spdk_bdev_io *bdev_io = ctx; 5830 5831 if (status) { 5832 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5833 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5834 return; 5835 } 5836 5837 bdev_compare_and_write_do_compare(bdev_io); 5838 } 5839 5840 int 5841 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5842 struct iovec *compare_iov, int compare_iovcnt, 5843 struct iovec *write_iov, int write_iovcnt, 5844 uint64_t offset_blocks, uint64_t num_blocks, 5845 spdk_bdev_io_completion_cb cb, void *cb_arg) 5846 { 5847 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5848 struct spdk_bdev_io *bdev_io; 5849 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5850 5851 if (!desc->write) { 5852 return -EBADF; 5853 } 5854 5855 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5856 return -EINVAL; 5857 } 5858 5859 if (num_blocks > bdev->acwu) { 5860 return -EINVAL; 5861 } 5862 5863 bdev_io = bdev_channel_get_io(channel); 5864 if (!bdev_io) { 5865 return -ENOMEM; 5866 } 5867 5868 bdev_io->internal.ch = channel; 5869 bdev_io->internal.desc = desc; 5870 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5871 bdev_io->u.bdev.iovs = compare_iov; 5872 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5873 bdev_io->u.bdev.fused_iovs = write_iov; 5874 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5875 bdev_io->u.bdev.md_buf = NULL; 5876 bdev_io->u.bdev.num_blocks = num_blocks; 5877 bdev_io->u.bdev.offset_blocks = offset_blocks; 5878 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5879 
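/* Compare-and-write requests are submitted without a memory domain or accel
 * sequence; the fields below are cleared explicitly. */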
bdev_io->u.bdev.memory_domain = NULL; 5880 bdev_io->u.bdev.memory_domain_ctx = NULL; 5881 bdev_io->u.bdev.accel_sequence = NULL; 5882 5883 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5884 bdev_io_submit(bdev_io); 5885 return 0; 5886 } 5887 5888 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5889 bdev_comparev_and_writev_blocks_locked, bdev_io); 5890 } 5891 5892 int 5893 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5894 struct iovec *iov, int iovcnt, 5895 uint64_t offset_blocks, uint64_t num_blocks, 5896 bool populate, 5897 spdk_bdev_io_completion_cb cb, void *cb_arg) 5898 { 5899 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5900 struct spdk_bdev_io *bdev_io; 5901 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5902 5903 if (!desc->write) { 5904 return -EBADF; 5905 } 5906 5907 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5908 return -EINVAL; 5909 } 5910 5911 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5912 return -ENOTSUP; 5913 } 5914 5915 bdev_io = bdev_channel_get_io(channel); 5916 if (!bdev_io) { 5917 return -ENOMEM; 5918 } 5919 5920 bdev_io->internal.ch = channel; 5921 bdev_io->internal.desc = desc; 5922 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5923 bdev_io->u.bdev.num_blocks = num_blocks; 5924 bdev_io->u.bdev.offset_blocks = offset_blocks; 5925 bdev_io->u.bdev.iovs = iov; 5926 bdev_io->u.bdev.iovcnt = iovcnt; 5927 bdev_io->u.bdev.md_buf = NULL; 5928 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5929 bdev_io->u.bdev.zcopy.commit = 0; 5930 bdev_io->u.bdev.zcopy.start = 1; 5931 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5932 bdev_io->u.bdev.memory_domain = NULL; 5933 bdev_io->u.bdev.memory_domain_ctx = NULL; 5934 bdev_io->u.bdev.accel_sequence = NULL; 5935 5936 bdev_io_submit(bdev_io); 5937 5938 return 0; 5939 } 5940 5941 int 5942 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5943 spdk_bdev_io_completion_cb cb, void *cb_arg) 5944 { 5945 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5946 return -EINVAL; 5947 } 5948 5949 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0;
5950 bdev_io->u.bdev.zcopy.start = 0;
5951 bdev_io->internal.caller_ctx = cb_arg;
5952 bdev_io->internal.cb = cb;
5953 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
5954
5955 bdev_io_submit(bdev_io);
5956
5957 return 0;
5958 }
5959
5960 int
5961 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5962 uint64_t offset, uint64_t len,
5963 spdk_bdev_io_completion_cb cb, void *cb_arg)
5964 {
5965 uint64_t offset_blocks, num_blocks;
5966
5967 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5968 len, &num_blocks) != 0) {
5969 return -EINVAL;
5970 }
5971
5972 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
5973 }
5974
5975 int
5976 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5977 uint64_t offset_blocks, uint64_t num_blocks,
5978 spdk_bdev_io_completion_cb cb, void *cb_arg)
5979 {
5980 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5981 struct spdk_bdev_io *bdev_io;
5982 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5983
5984 if (!desc->write) {
5985 return -EBADF;
5986 }
5987
5988 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5989 return -EINVAL;
5990 }
5991
5992 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
5993 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
5994 return -ENOTSUP;
5995 }
5996
5997 bdev_io = bdev_channel_get_io(channel);
5998
5999 if (!bdev_io) {
6000 return -ENOMEM;
6001 }
6002
6003 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
6004 bdev_io->internal.ch = channel;
6005 bdev_io->internal.desc = desc;
6006 bdev_io->u.bdev.offset_blocks = offset_blocks;
6007 bdev_io->u.bdev.num_blocks = num_blocks;
6008 bdev_io_init(bdev_io, bdev, cb_arg, cb);
6009 bdev_io->u.bdev.memory_domain = NULL;
6010 bdev_io->u.bdev.memory_domain_ctx = NULL;
6011 bdev_io->u.bdev.accel_sequence = NULL;
6012
6013 /* If the write_zeroes size is large and should be split, use the generic split
6014 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not.
6015 *
6016 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
6017 * or emulate it using regular write requests otherwise.
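* When only SPDK_BDEV_IO_TYPE_WRITE is supported, bdev_register() caps
* max_write_zeroes based on ZERO_BUFFER_SIZE, and the fallback below writes
* zeroes from the preallocated zero buffer via bdev_write_zero_buffer().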
6018 */ 6019 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6020 bdev_io->internal.split) { 6021 bdev_io_submit(bdev_io); 6022 return 0; 6023 } 6024 6025 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6026 6027 return bdev_write_zero_buffer(bdev_io); 6028 } 6029 6030 int 6031 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6032 uint64_t offset, uint64_t nbytes, 6033 spdk_bdev_io_completion_cb cb, void *cb_arg) 6034 { 6035 uint64_t offset_blocks, num_blocks; 6036 6037 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6038 nbytes, &num_blocks) != 0) { 6039 return -EINVAL; 6040 } 6041 6042 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6043 } 6044 6045 int 6046 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6047 uint64_t offset_blocks, uint64_t num_blocks, 6048 spdk_bdev_io_completion_cb cb, void *cb_arg) 6049 { 6050 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6051 struct spdk_bdev_io *bdev_io; 6052 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6053 6054 if (!desc->write) { 6055 return -EBADF; 6056 } 6057 6058 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6059 return -EINVAL; 6060 } 6061 6062 if (num_blocks == 0) { 6063 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 6064 return -EINVAL; 6065 } 6066 6067 bdev_io = bdev_channel_get_io(channel); 6068 if (!bdev_io) { 6069 return -ENOMEM; 6070 } 6071 6072 bdev_io->internal.ch = channel; 6073 bdev_io->internal.desc = desc; 6074 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6075 6076 bdev_io->u.bdev.iovs = &bdev_io->iov; 6077 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6078 bdev_io->u.bdev.iovs[0].iov_len = 0; 6079 bdev_io->u.bdev.iovcnt = 1; 6080 6081 bdev_io->u.bdev.offset_blocks = offset_blocks; 6082 bdev_io->u.bdev.num_blocks = num_blocks; 6083 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6084 bdev_io->u.bdev.memory_domain = NULL; 6085 bdev_io->u.bdev.memory_domain_ctx = NULL; 6086 bdev_io->u.bdev.accel_sequence = NULL; 6087 6088 bdev_io_submit(bdev_io); 6089 return 0; 6090 } 6091 6092 int 6093 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6094 uint64_t offset, uint64_t length, 6095 spdk_bdev_io_completion_cb cb, void *cb_arg) 6096 { 6097 uint64_t offset_blocks, num_blocks; 6098 6099 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6100 length, &num_blocks) != 0) { 6101 return -EINVAL; 6102 } 6103 6104 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6105 } 6106 6107 int 6108 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6109 uint64_t offset_blocks, uint64_t num_blocks, 6110 spdk_bdev_io_completion_cb cb, void *cb_arg) 6111 { 6112 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6113 struct spdk_bdev_io *bdev_io; 6114 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6115 6116 if (!desc->write) { 6117 return -EBADF; 6118 } 6119 6120 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6121 return -EINVAL; 6122 } 6123 6124 bdev_io = bdev_channel_get_io(channel); 6125 if (!bdev_io) { 6126 return -ENOMEM; 6127 } 6128 6129 bdev_io->internal.ch = channel; 6130 bdev_io->internal.desc = desc; 6131 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6132 bdev_io->u.bdev.iovs = NULL; 6133 bdev_io->u.bdev.iovcnt = 0; 6134 bdev_io->u.bdev.offset_blocks = offset_blocks; 6135 bdev_io->u.bdev.num_blocks = num_blocks; 6136 
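/* A flush carries no data buffers, so the iovec was left empty above and the
 * ext-I/O fields below are cleared; only the block range is passed down. */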
bdev_io->u.bdev.memory_domain = NULL;
6137 bdev_io->u.bdev.memory_domain_ctx = NULL;
6138 bdev_io->u.bdev.accel_sequence = NULL;
6139 bdev_io_init(bdev_io, bdev, cb_arg, cb);
6140
6141 bdev_io_submit(bdev_io);
6142 return 0;
6143 }
6144
6145 static int bdev_reset_poll_for_outstanding_io(void *ctx);
6146
6147 static void
6148 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
6149 {
6150 struct spdk_bdev_channel *ch = _ctx;
6151 struct spdk_bdev_io *bdev_io;
6152
6153 bdev_io = TAILQ_FIRST(&ch->queued_resets);
6154
6155 if (status == -EBUSY) {
6156 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
6157 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
6158 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
6159 } else {
6160 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6161
6162 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) {
6163 /* If outstanding I/Os are still present and reset_io_drain_timeout
6164 * seconds have passed, start the reset. */
6165 bdev_io_submit_reset(bdev_io);
6166 } else {
6167 /* We still have an in-progress memory domain pull/push or we're
6168 * executing an accel sequence. Since we cannot abort either of those
6169 * operations, fail the reset request. */
6170 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
6171 }
6172 }
6173 } else {
6174 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6175 SPDK_DEBUGLOG(bdev,
6176 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
6177 ch->bdev->name);
6178 /* Mark the completion status as SUCCESS and complete the reset. */
6179 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
6180 }
6181 }
6182
6183 static void
6184 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6185 struct spdk_io_channel *io_ch, void *_ctx)
6186 {
6187 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch);
6188 int status = 0;
6189
6190 if (cur_ch->io_outstanding > 0 ||
6191 !TAILQ_EMPTY(&cur_ch->io_memory_domain) ||
6192 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) {
6193 /* If a channel has outstanding I/O, set the status to -EBUSY. This will stop
6194 * further iteration over the rest of the channels and pass a non-zero status
6195 * to the callback function.
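* bdev_reset_check_outstanding_io_done() uses that status to decide whether to
* re-arm the poller or to submit/fail the queued reset.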
*/ 6196 status = -EBUSY; 6197 } 6198 spdk_bdev_for_each_channel_continue(i, status); 6199 } 6200 6201 static int 6202 bdev_reset_poll_for_outstanding_io(void *ctx) 6203 { 6204 struct spdk_bdev_channel *ch = ctx; 6205 struct spdk_bdev_io *bdev_io; 6206 6207 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6208 6209 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6210 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6211 bdev_reset_check_outstanding_io_done); 6212 6213 return SPDK_POLLER_BUSY; 6214 } 6215 6216 static void 6217 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6218 { 6219 struct spdk_bdev_channel *ch = _ctx; 6220 struct spdk_bdev_io *bdev_io; 6221 6222 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6223 6224 if (bdev->reset_io_drain_timeout == 0) { 6225 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6226 6227 bdev_io_submit_reset(bdev_io); 6228 return; 6229 } 6230 6231 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6232 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6233 6234 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6235 * submit the reset to the underlying module only if outstanding I/O 6236 * remain after reset_io_drain_timeout seconds have passed. */ 6237 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6238 bdev_reset_check_outstanding_io_done); 6239 } 6240 6241 static void 6242 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6243 struct spdk_io_channel *ch, void *_ctx) 6244 { 6245 struct spdk_bdev_channel *channel; 6246 struct spdk_bdev_mgmt_channel *mgmt_channel; 6247 struct spdk_bdev_shared_resource *shared_resource; 6248 bdev_io_tailq_t tmp_queued; 6249 6250 TAILQ_INIT(&tmp_queued); 6251 6252 channel = __io_ch_to_bdev_ch(ch); 6253 shared_resource = channel->shared_resource; 6254 mgmt_channel = shared_resource->mgmt_ch; 6255 6256 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6257 6258 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6259 /* The QoS object is always valid and readable while 6260 * the channel flag is set, so the lock here should not 6261 * be necessary. We're not in the fast path though, so 6262 * just take it anyway. */ 6263 spdk_spin_lock(&channel->bdev->internal.spinlock); 6264 if (channel->bdev->internal.qos->ch == channel) { 6265 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 6266 } 6267 spdk_spin_unlock(&channel->bdev->internal.spinlock); 6268 } 6269 6270 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6271 bdev_abort_all_buf_io(mgmt_channel, channel); 6272 bdev_abort_all_queued_io(&tmp_queued, channel); 6273 6274 spdk_bdev_for_each_channel_continue(i, 0); 6275 } 6276 6277 static void 6278 bdev_start_reset(void *ctx) 6279 { 6280 struct spdk_bdev_channel *ch = ctx; 6281 6282 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6283 bdev_reset_freeze_channel_done); 6284 } 6285 6286 static void 6287 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6288 { 6289 struct spdk_bdev *bdev = ch->bdev; 6290 6291 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6292 6293 spdk_spin_lock(&bdev->internal.spinlock); 6294 if (bdev->internal.reset_in_progress == NULL) { 6295 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6296 /* 6297 * Take a channel reference for the target bdev for the life of this 6298 * reset. 
This guards against the channel getting destroyed while 6299 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6300 * progress. We will release the reference when this reset is 6301 * completed. 6302 */ 6303 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6304 bdev_start_reset(ch); 6305 } 6306 spdk_spin_unlock(&bdev->internal.spinlock); 6307 } 6308 6309 int 6310 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6311 spdk_bdev_io_completion_cb cb, void *cb_arg) 6312 { 6313 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6314 struct spdk_bdev_io *bdev_io; 6315 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6316 6317 bdev_io = bdev_channel_get_io(channel); 6318 if (!bdev_io) { 6319 return -ENOMEM; 6320 } 6321 6322 bdev_io->internal.ch = channel; 6323 bdev_io->internal.desc = desc; 6324 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6325 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6326 bdev_io->u.reset.ch_ref = NULL; 6327 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6328 6329 spdk_spin_lock(&bdev->internal.spinlock); 6330 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6331 spdk_spin_unlock(&bdev->internal.spinlock); 6332 6333 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6334 internal.ch_link); 6335 6336 bdev_channel_start_reset(channel); 6337 6338 return 0; 6339 } 6340 6341 void 6342 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6343 struct spdk_bdev_io_stat *stat) 6344 { 6345 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6346 6347 bdev_get_io_stat(stat, channel->stat); 6348 } 6349 6350 static void 6351 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6352 { 6353 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6354 6355 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6356 bdev_iostat_ctx->cb_arg, 0); 6357 free(bdev_iostat_ctx); 6358 } 6359 6360 static void 6361 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6362 struct spdk_io_channel *ch, void *_ctx) 6363 { 6364 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6365 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6366 6367 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6368 spdk_bdev_for_each_channel_continue(i, 0); 6369 } 6370 6371 void 6372 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6373 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6374 { 6375 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6376 6377 assert(bdev != NULL); 6378 assert(stat != NULL); 6379 assert(cb != NULL); 6380 6381 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6382 if (bdev_iostat_ctx == NULL) { 6383 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6384 cb(bdev, stat, cb_arg, -ENOMEM); 6385 return; 6386 } 6387 6388 bdev_iostat_ctx->stat = stat; 6389 bdev_iostat_ctx->cb = cb; 6390 bdev_iostat_ctx->cb_arg = cb_arg; 6391 6392 /* Start with the statistics from previously deleted channels. */ 6393 spdk_spin_lock(&bdev->internal.spinlock); 6394 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6395 spdk_spin_unlock(&bdev->internal.spinlock); 6396 6397 /* Then iterate and add the statistics from each existing channel. 
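* Each channel's counters are added on that channel's thread, and the caller's
* callback is invoked from bdev_get_device_stat_done() once the iteration
* finishes.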
*/ 6398 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6399 bdev_get_device_stat_done); 6400 } 6401 6402 struct bdev_iostat_reset_ctx { 6403 enum spdk_bdev_reset_stat_mode mode; 6404 bdev_reset_device_stat_cb cb; 6405 void *cb_arg; 6406 }; 6407 6408 static void 6409 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6410 { 6411 struct bdev_iostat_reset_ctx *ctx = _ctx; 6412 6413 ctx->cb(bdev, ctx->cb_arg, 0); 6414 6415 free(ctx); 6416 } 6417 6418 static void 6419 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6420 struct spdk_io_channel *ch, void *_ctx) 6421 { 6422 struct bdev_iostat_reset_ctx *ctx = _ctx; 6423 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6424 6425 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6426 6427 spdk_bdev_for_each_channel_continue(i, 0); 6428 } 6429 6430 void 6431 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6432 bdev_reset_device_stat_cb cb, void *cb_arg) 6433 { 6434 struct bdev_iostat_reset_ctx *ctx; 6435 6436 assert(bdev != NULL); 6437 assert(cb != NULL); 6438 6439 ctx = calloc(1, sizeof(*ctx)); 6440 if (ctx == NULL) { 6441 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6442 cb(bdev, cb_arg, -ENOMEM); 6443 return; 6444 } 6445 6446 ctx->mode = mode; 6447 ctx->cb = cb; 6448 ctx->cb_arg = cb_arg; 6449 6450 spdk_spin_lock(&bdev->internal.spinlock); 6451 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6452 spdk_spin_unlock(&bdev->internal.spinlock); 6453 6454 spdk_bdev_for_each_channel(bdev, 6455 bdev_reset_each_channel_stat, 6456 ctx, 6457 bdev_reset_device_stat_done); 6458 } 6459 6460 int 6461 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6462 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6463 spdk_bdev_io_completion_cb cb, void *cb_arg) 6464 { 6465 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6466 struct spdk_bdev_io *bdev_io; 6467 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6468 6469 if (!desc->write) { 6470 return -EBADF; 6471 } 6472 6473 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6474 return -ENOTSUP; 6475 } 6476 6477 bdev_io = bdev_channel_get_io(channel); 6478 if (!bdev_io) { 6479 return -ENOMEM; 6480 } 6481 6482 bdev_io->internal.ch = channel; 6483 bdev_io->internal.desc = desc; 6484 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6485 bdev_io->u.nvme_passthru.cmd = *cmd; 6486 bdev_io->u.nvme_passthru.buf = buf; 6487 bdev_io->u.nvme_passthru.nbytes = nbytes; 6488 bdev_io->u.nvme_passthru.md_buf = NULL; 6489 bdev_io->u.nvme_passthru.md_len = 0; 6490 6491 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6492 6493 bdev_io_submit(bdev_io); 6494 return 0; 6495 } 6496 6497 int 6498 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6499 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6500 spdk_bdev_io_completion_cb cb, void *cb_arg) 6501 { 6502 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6503 struct spdk_bdev_io *bdev_io; 6504 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6505 6506 if (!desc->write) { 6507 /* 6508 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6509 * to easily determine if the command is a read or write, but for now just 6510 * do not allow io_passthru with a read-only descriptor. 
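* As with the admin and metadata passthru variants, the descriptor must have
* been opened for writing before any passthru command is forwarded.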
6511 */ 6512 return -EBADF; 6513 } 6514 6515 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6516 return -ENOTSUP; 6517 } 6518 6519 bdev_io = bdev_channel_get_io(channel); 6520 if (!bdev_io) { 6521 return -ENOMEM; 6522 } 6523 6524 bdev_io->internal.ch = channel; 6525 bdev_io->internal.desc = desc; 6526 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6527 bdev_io->u.nvme_passthru.cmd = *cmd; 6528 bdev_io->u.nvme_passthru.buf = buf; 6529 bdev_io->u.nvme_passthru.nbytes = nbytes; 6530 bdev_io->u.nvme_passthru.md_buf = NULL; 6531 bdev_io->u.nvme_passthru.md_len = 0; 6532 6533 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6534 6535 bdev_io_submit(bdev_io); 6536 return 0; 6537 } 6538 6539 int 6540 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6541 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6542 spdk_bdev_io_completion_cb cb, void *cb_arg) 6543 { 6544 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6545 struct spdk_bdev_io *bdev_io; 6546 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6547 6548 if (!desc->write) { 6549 /* 6550 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6551 * to easily determine if the command is a read or write, but for now just 6552 * do not allow io_passthru with a read-only descriptor. 6553 */ 6554 return -EBADF; 6555 } 6556 6557 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6558 return -ENOTSUP; 6559 } 6560 6561 bdev_io = bdev_channel_get_io(channel); 6562 if (!bdev_io) { 6563 return -ENOMEM; 6564 } 6565 6566 bdev_io->internal.ch = channel; 6567 bdev_io->internal.desc = desc; 6568 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6569 bdev_io->u.nvme_passthru.cmd = *cmd; 6570 bdev_io->u.nvme_passthru.buf = buf; 6571 bdev_io->u.nvme_passthru.nbytes = nbytes; 6572 bdev_io->u.nvme_passthru.md_buf = md_buf; 6573 bdev_io->u.nvme_passthru.md_len = md_len; 6574 6575 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6576 6577 bdev_io_submit(bdev_io); 6578 return 0; 6579 } 6580 6581 static void bdev_abort_retry(void *ctx); 6582 static void bdev_abort(struct spdk_bdev_io *parent_io); 6583 6584 static void 6585 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6586 { 6587 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6588 struct spdk_bdev_io *parent_io = cb_arg; 6589 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6590 6591 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6592 6593 spdk_bdev_free_io(bdev_io); 6594 6595 if (!success) { 6596 /* Check if the target I/O completed in the meantime. */ 6597 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6598 if (tmp_io == bio_to_abort) { 6599 break; 6600 } 6601 } 6602 6603 /* If the target I/O still exists, set the parent to failed. 
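* If it is no longer on io_submitted, it completed while the abort was in
* flight, so the failed abort attempt is simply ignored.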
*/ 6604 if (tmp_io != NULL) { 6605 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6606 } 6607 } 6608 6609 parent_io->u.bdev.split_outstanding--; 6610 if (parent_io->u.bdev.split_outstanding == 0) { 6611 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6612 bdev_abort_retry(parent_io); 6613 } else { 6614 bdev_io_complete(parent_io); 6615 } 6616 } 6617 } 6618 6619 static int 6620 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6621 struct spdk_bdev_io *bio_to_abort, 6622 spdk_bdev_io_completion_cb cb, void *cb_arg) 6623 { 6624 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6625 struct spdk_bdev_io *bdev_io; 6626 6627 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6628 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6629 /* TODO: Abort reset or abort request. */ 6630 return -ENOTSUP; 6631 } 6632 6633 bdev_io = bdev_channel_get_io(channel); 6634 if (bdev_io == NULL) { 6635 return -ENOMEM; 6636 } 6637 6638 bdev_io->internal.ch = channel; 6639 bdev_io->internal.desc = desc; 6640 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6641 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6642 6643 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6644 assert(bdev_io_should_split(bio_to_abort)); 6645 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6646 6647 /* Parent abort request is not submitted directly, but to manage its 6648 * execution add it to the submitted list here. 6649 */ 6650 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6651 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6652 6653 bdev_abort(bdev_io); 6654 6655 return 0; 6656 } 6657 6658 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6659 6660 /* Submit the abort request to the underlying bdev module. */ 6661 bdev_io_submit(bdev_io); 6662 6663 return 0; 6664 } 6665 6666 static bool 6667 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6668 { 6669 struct spdk_bdev_io *iter; 6670 6671 TAILQ_FOREACH(iter, tailq, internal.link) { 6672 if (iter == bdev_io) { 6673 return true; 6674 } 6675 } 6676 6677 return false; 6678 } 6679 6680 static uint32_t 6681 _bdev_abort(struct spdk_bdev_io *parent_io) 6682 { 6683 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6684 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6685 void *bio_cb_arg; 6686 struct spdk_bdev_io *bio_to_abort; 6687 uint32_t matched_ios; 6688 int rc; 6689 6690 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6691 6692 /* matched_ios is returned and will be kept by the caller. 6693 * 6694 * This function will be used for two cases, 1) the same cb_arg is used for 6695 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6696 * Incrementing split_outstanding directly here may confuse readers especially 6697 * for the 1st case. 6698 * 6699 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6700 * works as expected. 6701 */ 6702 matched_ios = 0; 6703 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6704 6705 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6706 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6707 continue; 6708 } 6709 6710 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6711 /* Any I/O which was submitted after this abort command should be excluded. 
*/
6712 continue;
6713 }
6714
6715 /* We can't abort a request that's being pushed/pulled or executed by accel */
6716 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) ||
6717 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) {
6718 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6719 break;
6720 }
6721
6722 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
6723 if (rc != 0) {
6724 if (rc == -ENOMEM) {
6725 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
6726 } else {
6727 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6728 }
6729 break;
6730 }
6731 matched_ios++;
6732 }
6733
6734 return matched_ios;
6735 }
6736
6737 static void
6738 bdev_abort_retry(void *ctx)
6739 {
6740 struct spdk_bdev_io *parent_io = ctx;
6741 uint32_t matched_ios;
6742
6743 matched_ios = _bdev_abort(parent_io);
6744
6745 if (matched_ios == 0) {
6746 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
6747 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
6748 } else {
6749 /* For retry, the case that no target I/O was found is success
6750 * because it means target I/Os completed in the meantime.
6751 */
6752 bdev_io_complete(parent_io);
6753 }
6754 return;
6755 }
6756
6757 /* Use split_outstanding to manage the progress of aborting I/Os. */
6758 parent_io->u.bdev.split_outstanding = matched_ios;
6759 }
6760
6761 static void
6762 bdev_abort(struct spdk_bdev_io *parent_io)
6763 {
6764 uint32_t matched_ios;
6765
6766 matched_ios = _bdev_abort(parent_io);
6767
6768 if (matched_ios == 0) {
6769 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
6770 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
6771 } else {
6772 /* The case where no target I/O was found is a failure. */
6773 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6774 bdev_io_complete(parent_io);
6775 }
6776 return;
6777 }
6778
6779 /* Use split_outstanding to manage the progress of aborting I/Os. */
6780 parent_io->u.bdev.split_outstanding = matched_ios;
6781 }
6782
6783 int
6784 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6785 void *bio_cb_arg,
6786 spdk_bdev_io_completion_cb cb, void *cb_arg)
6787 {
6788 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6789 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6790 struct spdk_bdev_io *bdev_io;
6791
6792 if (bio_cb_arg == NULL) {
6793 return -EINVAL;
6794 }
6795
6796 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
6797 return -ENOTSUP;
6798 }
6799
6800 bdev_io = bdev_channel_get_io(channel);
6801 if (bdev_io == NULL) {
6802 return -ENOMEM;
6803 }
6804
6805 bdev_io->internal.ch = channel;
6806 bdev_io->internal.desc = desc;
6807 bdev_io->internal.submit_tsc = spdk_get_ticks();
6808 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
6809 bdev_io_init(bdev_io, bdev, cb_arg, cb);
6810
6811 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;
6812
6813 /* Parent abort request is not submitted directly, but to manage its execution,
6814 * add it to the submitted list here.
6815 */ 6816 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6817 6818 bdev_abort(bdev_io); 6819 6820 return 0; 6821 } 6822 6823 int 6824 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6825 struct spdk_bdev_io_wait_entry *entry) 6826 { 6827 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6828 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6829 6830 if (bdev != entry->bdev) { 6831 SPDK_ERRLOG("bdevs do not match\n"); 6832 return -EINVAL; 6833 } 6834 6835 if (mgmt_ch->per_thread_cache_count > 0) { 6836 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6837 return -EINVAL; 6838 } 6839 6840 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6841 return 0; 6842 } 6843 6844 static inline void 6845 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6846 { 6847 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6848 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6849 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6850 uint32_t blocklen = bdev_io->bdev->blocklen; 6851 6852 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6853 switch (bdev_io->type) { 6854 case SPDK_BDEV_IO_TYPE_READ: 6855 io_stat->bytes_read += num_blocks * blocklen; 6856 io_stat->num_read_ops++; 6857 io_stat->read_latency_ticks += tsc_diff; 6858 if (io_stat->max_read_latency_ticks < tsc_diff) { 6859 io_stat->max_read_latency_ticks = tsc_diff; 6860 } 6861 if (io_stat->min_read_latency_ticks > tsc_diff) { 6862 io_stat->min_read_latency_ticks = tsc_diff; 6863 } 6864 break; 6865 case SPDK_BDEV_IO_TYPE_WRITE: 6866 io_stat->bytes_written += num_blocks * blocklen; 6867 io_stat->num_write_ops++; 6868 io_stat->write_latency_ticks += tsc_diff; 6869 if (io_stat->max_write_latency_ticks < tsc_diff) { 6870 io_stat->max_write_latency_ticks = tsc_diff; 6871 } 6872 if (io_stat->min_write_latency_ticks > tsc_diff) { 6873 io_stat->min_write_latency_ticks = tsc_diff; 6874 } 6875 break; 6876 case SPDK_BDEV_IO_TYPE_UNMAP: 6877 io_stat->bytes_unmapped += num_blocks * blocklen; 6878 io_stat->num_unmap_ops++; 6879 io_stat->unmap_latency_ticks += tsc_diff; 6880 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6881 io_stat->max_unmap_latency_ticks = tsc_diff; 6882 } 6883 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6884 io_stat->min_unmap_latency_ticks = tsc_diff; 6885 } 6886 break; 6887 case SPDK_BDEV_IO_TYPE_ZCOPY: 6888 /* Track the data in the start phase only */ 6889 if (bdev_io->u.bdev.zcopy.start) { 6890 if (bdev_io->u.bdev.zcopy.populate) { 6891 io_stat->bytes_read += num_blocks * blocklen; 6892 io_stat->num_read_ops++; 6893 io_stat->read_latency_ticks += tsc_diff; 6894 if (io_stat->max_read_latency_ticks < tsc_diff) { 6895 io_stat->max_read_latency_ticks = tsc_diff; 6896 } 6897 if (io_stat->min_read_latency_ticks > tsc_diff) { 6898 io_stat->min_read_latency_ticks = tsc_diff; 6899 } 6900 } else { 6901 io_stat->bytes_written += num_blocks * blocklen; 6902 io_stat->num_write_ops++; 6903 io_stat->write_latency_ticks += tsc_diff; 6904 if (io_stat->max_write_latency_ticks < tsc_diff) { 6905 io_stat->max_write_latency_ticks = tsc_diff; 6906 } 6907 if (io_stat->min_write_latency_ticks > tsc_diff) { 6908 io_stat->min_write_latency_ticks = tsc_diff; 6909 } 6910 } 6911 } 6912 break; 6913 case SPDK_BDEV_IO_TYPE_COPY: 6914 io_stat->bytes_copied += num_blocks * blocklen; 6915 io_stat->num_copy_ops++; 6916 bdev_io->internal.ch->stat->copy_latency_ticks += 
tsc_diff; 6917 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6918 io_stat->max_copy_latency_ticks = tsc_diff; 6919 } 6920 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6921 io_stat->min_copy_latency_ticks = tsc_diff; 6922 } 6923 break; 6924 default: 6925 break; 6926 } 6927 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6928 io_stat = bdev_io->bdev->internal.stat; 6929 assert(io_stat->io_error != NULL); 6930 6931 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6932 io_stat->io_error->error_status[-io_status - 1]++; 6933 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6934 } 6935 6936 #ifdef SPDK_CONFIG_VTUNE 6937 uint64_t now_tsc = spdk_get_ticks(); 6938 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6939 uint64_t data[5]; 6940 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6941 6942 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6943 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6944 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6945 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6946 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6947 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6948 6949 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6950 __itt_metadata_u64, 5, data); 6951 6952 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6953 bdev_io->internal.ch->start_tsc = now_tsc; 6954 } 6955 #endif 6956 } 6957 6958 static inline void 6959 _bdev_io_complete(void *ctx) 6960 { 6961 struct spdk_bdev_io *bdev_io = ctx; 6962 6963 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 6964 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 6965 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 6966 } 6967 6968 assert(bdev_io->internal.cb != NULL); 6969 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6970 6971 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6972 bdev_io->internal.caller_ctx); 6973 } 6974 6975 static inline void 6976 bdev_io_complete(void *ctx) 6977 { 6978 struct spdk_bdev_io *bdev_io = ctx; 6979 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6980 uint64_t tsc, tsc_diff; 6981 6982 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6983 /* 6984 * Defer completion to avoid potential infinite recursion if the 6985 * user's completion callback issues a new I/O. 6986 */ 6987 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6988 bdev_io_complete, bdev_io); 6989 return; 6990 } 6991 6992 tsc = spdk_get_ticks(); 6993 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6994 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6995 bdev_io->internal.caller_ctx); 6996 6997 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 6998 6999 if (bdev_io->internal.ch->histogram) { 7000 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 7001 } 7002 7003 bdev_io_update_io_stat(bdev_io, tsc_diff); 7004 _bdev_io_complete(bdev_io); 7005 } 7006 7007 /* The difference between this function and bdev_io_complete() is that this should be called to 7008 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7009 * io_submitted list and don't have submit_tsc updated. 
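* Completion is therefore always deferred to the I/O's thread with
* spdk_thread_send_msg(), since this error path may run deep inside the
* submission call stack.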
7010 */ 7011 static inline void 7012 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7013 { 7014 /* Since the IO hasn't been submitted it's bound to be failed */ 7015 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7016 7017 /* At this point we don't know if the IO is completed from submission context or not, but, 7018 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7019 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7020 _bdev_io_complete, bdev_io); 7021 } 7022 7023 static void bdev_destroy_cb(void *io_device); 7024 7025 static void 7026 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7027 { 7028 struct spdk_bdev_io *bdev_io = _ctx; 7029 7030 if (bdev_io->u.reset.ch_ref != NULL) { 7031 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7032 bdev_io->u.reset.ch_ref = NULL; 7033 } 7034 7035 bdev_io_complete(bdev_io); 7036 7037 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7038 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7039 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7040 } 7041 } 7042 7043 static void 7044 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7045 struct spdk_io_channel *_ch, void *_ctx) 7046 { 7047 struct spdk_bdev_io *bdev_io = _ctx; 7048 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7049 struct spdk_bdev_io *queued_reset; 7050 7051 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7052 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7053 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7054 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7055 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7056 } 7057 7058 spdk_bdev_for_each_channel_continue(i, 0); 7059 } 7060 7061 static void 7062 bdev_io_complete_sequence_cb(void *ctx, int status) 7063 { 7064 struct spdk_bdev_io *bdev_io = ctx; 7065 7066 /* u.bdev.accel_sequence should have already been cleared at this point */ 7067 assert(bdev_io->u.bdev.accel_sequence == NULL); 7068 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7069 bdev_io->internal.accel_sequence = NULL; 7070 7071 if (spdk_unlikely(status != 0)) { 7072 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7073 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7074 } 7075 7076 bdev_io_complete(bdev_io); 7077 } 7078 7079 void 7080 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7081 { 7082 struct spdk_bdev *bdev = bdev_io->bdev; 7083 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7084 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7085 7086 if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) { 7087 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7088 spdk_bdev_get_module_name(bdev), 7089 bdev_io_status_get_string(bdev_io->internal.status)); 7090 assert(false); 7091 } 7092 bdev_io->internal.status = status; 7093 7094 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7095 bool unlock_channels = false; 7096 7097 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7098 SPDK_ERRLOG("NOMEM returned for reset\n"); 7099 } 7100 spdk_spin_lock(&bdev->internal.spinlock); 7101 if (bdev_io == bdev->internal.reset_in_progress) { 7102 bdev->internal.reset_in_progress = NULL; 7103 unlock_channels = true; 7104 } 7105 spdk_spin_unlock(&bdev->internal.spinlock); 7106 7107 if (unlock_channels) { 7108 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 
7109 bdev_reset_complete); 7110 return; 7111 } 7112 } else { 7113 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7114 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7115 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7116 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7117 return; 7118 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0 && 7119 !bdev_io_use_accel_sequence(bdev_io))) { 7120 _bdev_io_push_bounce_data_buffer(bdev_io, 7121 _bdev_io_complete_push_bounce_done); 7122 /* bdev IO will be completed in the callback */ 7123 return; 7124 } 7125 } 7126 7127 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7128 return; 7129 } 7130 } 7131 7132 bdev_io_complete(bdev_io); 7133 } 7134 7135 void 7136 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7137 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7138 { 7139 enum spdk_bdev_io_status status; 7140 7141 if (sc == SPDK_SCSI_STATUS_GOOD) { 7142 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7143 } else { 7144 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7145 bdev_io->internal.error.scsi.sc = sc; 7146 bdev_io->internal.error.scsi.sk = sk; 7147 bdev_io->internal.error.scsi.asc = asc; 7148 bdev_io->internal.error.scsi.ascq = ascq; 7149 } 7150 7151 spdk_bdev_io_complete(bdev_io, status); 7152 } 7153 7154 void 7155 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7156 int *sc, int *sk, int *asc, int *ascq) 7157 { 7158 assert(sc != NULL); 7159 assert(sk != NULL); 7160 assert(asc != NULL); 7161 assert(ascq != NULL); 7162 7163 switch (bdev_io->internal.status) { 7164 case SPDK_BDEV_IO_STATUS_SUCCESS: 7165 *sc = SPDK_SCSI_STATUS_GOOD; 7166 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7167 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7168 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7169 break; 7170 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7171 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7172 break; 7173 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7174 *sc = bdev_io->internal.error.scsi.sc; 7175 *sk = bdev_io->internal.error.scsi.sk; 7176 *asc = bdev_io->internal.error.scsi.asc; 7177 *ascq = bdev_io->internal.error.scsi.ascq; 7178 break; 7179 default: 7180 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7181 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7182 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7183 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7184 break; 7185 } 7186 } 7187 7188 void 7189 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7190 { 7191 enum spdk_bdev_io_status status; 7192 7193 if (aio_result == 0) { 7194 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7195 } else { 7196 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7197 } 7198 7199 bdev_io->internal.error.aio_result = aio_result; 7200 7201 spdk_bdev_io_complete(bdev_io, status); 7202 } 7203 7204 void 7205 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7206 { 7207 assert(aio_result != NULL); 7208 7209 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7210 *aio_result = bdev_io->internal.error.aio_result; 7211 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7212 *aio_result = 0; 7213 } else { 7214 *aio_result = -EIO; 7215 } 7216 } 7217 7218 void 7219 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7220 { 7221 enum spdk_bdev_io_status status; 7222 7223 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 7224 status = 
SPDK_BDEV_IO_STATUS_SUCCESS; 7225 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7226 status = SPDK_BDEV_IO_STATUS_ABORTED; 7227 } else { 7228 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7229 } 7230 7231 bdev_io->internal.error.nvme.cdw0 = cdw0; 7232 bdev_io->internal.error.nvme.sct = sct; 7233 bdev_io->internal.error.nvme.sc = sc; 7234 7235 spdk_bdev_io_complete(bdev_io, status); 7236 } 7237 7238 void 7239 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7240 { 7241 assert(sct != NULL); 7242 assert(sc != NULL); 7243 assert(cdw0 != NULL); 7244 7245 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7246 *sct = SPDK_NVME_SCT_GENERIC; 7247 *sc = SPDK_NVME_SC_SUCCESS; 7248 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7249 *cdw0 = 0; 7250 } else { 7251 *cdw0 = 1U; 7252 } 7253 return; 7254 } 7255 7256 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7257 *sct = bdev_io->internal.error.nvme.sct; 7258 *sc = bdev_io->internal.error.nvme.sc; 7259 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7260 *sct = SPDK_NVME_SCT_GENERIC; 7261 *sc = SPDK_NVME_SC_SUCCESS; 7262 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7263 *sct = SPDK_NVME_SCT_GENERIC; 7264 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7265 } else { 7266 *sct = SPDK_NVME_SCT_GENERIC; 7267 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7268 } 7269 7270 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7271 } 7272 7273 void 7274 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7275 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7276 { 7277 assert(first_sct != NULL); 7278 assert(first_sc != NULL); 7279 assert(second_sct != NULL); 7280 assert(second_sc != NULL); 7281 assert(cdw0 != NULL); 7282 7283 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7284 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7285 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7286 *first_sct = bdev_io->internal.error.nvme.sct; 7287 *first_sc = bdev_io->internal.error.nvme.sc; 7288 *second_sct = SPDK_NVME_SCT_GENERIC; 7289 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7290 } else { 7291 *first_sct = SPDK_NVME_SCT_GENERIC; 7292 *first_sc = SPDK_NVME_SC_SUCCESS; 7293 *second_sct = bdev_io->internal.error.nvme.sct; 7294 *second_sc = bdev_io->internal.error.nvme.sc; 7295 } 7296 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7297 *first_sct = SPDK_NVME_SCT_GENERIC; 7298 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7299 *second_sct = SPDK_NVME_SCT_GENERIC; 7300 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7301 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7302 *first_sct = SPDK_NVME_SCT_GENERIC; 7303 *first_sc = SPDK_NVME_SC_SUCCESS; 7304 *second_sct = SPDK_NVME_SCT_GENERIC; 7305 *second_sc = SPDK_NVME_SC_SUCCESS; 7306 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7307 *first_sct = SPDK_NVME_SCT_GENERIC; 7308 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7309 *second_sct = SPDK_NVME_SCT_GENERIC; 7310 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7311 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7312 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7313 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7314 *second_sct = SPDK_NVME_SCT_GENERIC; 7315 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7316 } else { 
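/* Any other status (e.g. FAILED or NOMEM) maps to a generic internal device
 * error for both halves of the fused command. */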
7317 *first_sct = SPDK_NVME_SCT_GENERIC; 7318 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7319 *second_sct = SPDK_NVME_SCT_GENERIC; 7320 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7321 } 7322 7323 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7324 } 7325 7326 struct spdk_thread * 7327 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7328 { 7329 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7330 } 7331 7332 struct spdk_io_channel * 7333 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7334 { 7335 return bdev_io->internal.ch->channel; 7336 } 7337 7338 static int 7339 bdev_register(struct spdk_bdev *bdev) 7340 { 7341 char *bdev_name; 7342 char uuid[SPDK_UUID_STRING_LEN]; 7343 struct spdk_iobuf_opts iobuf_opts; 7344 int ret, i; 7345 7346 assert(bdev->module != NULL); 7347 7348 if (!bdev->name) { 7349 SPDK_ERRLOG("Bdev name is NULL\n"); 7350 return -EINVAL; 7351 } 7352 7353 if (!strlen(bdev->name)) { 7354 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7355 return -EINVAL; 7356 } 7357 7358 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7359 if (bdev->fn_table->accel_sequence_supported == NULL) { 7360 continue; 7361 } 7362 if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7363 (enum spdk_bdev_io_type)i)) { 7364 continue; 7365 } 7366 7367 if (spdk_bdev_is_md_separate(bdev)) { 7368 SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with " 7369 "accel sequence support\n"); 7370 return -EINVAL; 7371 } 7372 } 7373 7374 /* Users often register their own I/O devices using the bdev name. In 7375 * order to avoid conflicts, prepend bdev_. */ 7376 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7377 if (!bdev_name) { 7378 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7379 return -ENOMEM; 7380 } 7381 7382 bdev->internal.stat = bdev_alloc_io_stat(true); 7383 if (!bdev->internal.stat) { 7384 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7385 free(bdev_name); 7386 return -ENOMEM; 7387 } 7388 7389 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7390 bdev->internal.measured_queue_depth = UINT64_MAX; 7391 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7392 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7393 bdev->internal.qd_poller = NULL; 7394 bdev->internal.qos = NULL; 7395 7396 TAILQ_INIT(&bdev->internal.open_descs); 7397 TAILQ_INIT(&bdev->internal.locked_ranges); 7398 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7399 TAILQ_INIT(&bdev->aliases); 7400 7401 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7402 if (ret != 0) { 7403 bdev_free_io_stat(bdev->internal.stat); 7404 free(bdev_name); 7405 return ret; 7406 } 7407 7408 /* UUID may be specified by the user or defined by bdev itself. 7409 * Otherwise it will be generated here, so this field will never be empty. 
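* When the UUID differs from the name, it is also registered below as an alias, so the bdev can be looked up by its UUID string as well as by its name.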
*/ 7410 if (spdk_uuid_is_null(&bdev->uuid)) { 7411 spdk_uuid_generate(&bdev->uuid); 7412 } 7413 7414 /* Add the UUID alias only if it's different than the name */ 7415 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7416 if (strcmp(bdev->name, uuid) != 0) { 7417 ret = spdk_bdev_alias_add(bdev, uuid); 7418 if (ret != 0) { 7419 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7420 bdev_name_del(&bdev->internal.bdev_name); 7421 bdev_free_io_stat(bdev->internal.stat); 7422 free(bdev_name); 7423 return ret; 7424 } 7425 } 7426 7427 if (spdk_bdev_get_buf_align(bdev) > 1) { 7428 if (bdev->split_on_optimal_io_boundary) { 7429 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 7430 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 7431 } else { 7432 bdev->split_on_optimal_io_boundary = true; 7433 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 7434 } 7435 } 7436 7437 /* If the user didn't specify a write unit size, set it to one. */ 7438 if (bdev->write_unit_size == 0) { 7439 bdev->write_unit_size = 1; 7440 } 7441 7442 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7443 if (bdev->acwu == 0) { 7444 bdev->acwu = bdev->write_unit_size; 7445 } 7446 7447 if (bdev->phys_blocklen == 0) { 7448 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7449 } 7450 7451 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7452 spdk_iobuf_get_opts(&iobuf_opts); 7453 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7454 } 7455 7456 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7457 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7458 } 7459 7460 bdev->internal.reset_in_progress = NULL; 7461 bdev->internal.qd_poll_in_progress = false; 7462 bdev->internal.period = 0; 7463 bdev->internal.new_period = 0; 7464 7465 spdk_io_device_register(__bdev_to_io_dev(bdev), 7466 bdev_channel_create, bdev_channel_destroy, 7467 sizeof(struct spdk_bdev_channel), 7468 bdev_name); 7469 7470 free(bdev_name); 7471 7472 spdk_spin_init(&bdev->internal.spinlock); 7473 7474 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7475 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7476 7477 return 0; 7478 } 7479 7480 static void 7481 bdev_destroy_cb(void *io_device) 7482 { 7483 int rc; 7484 struct spdk_bdev *bdev; 7485 spdk_bdev_unregister_cb cb_fn; 7486 void *cb_arg; 7487 7488 bdev = __bdev_from_io_dev(io_device); 7489 7490 if (bdev->internal.unregister_td != spdk_get_thread()) { 7491 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7492 return; 7493 } 7494 7495 cb_fn = bdev->internal.unregister_cb; 7496 cb_arg = bdev->internal.unregister_ctx; 7497 7498 spdk_spin_destroy(&bdev->internal.spinlock); 7499 free(bdev->internal.qos); 7500 bdev_free_io_stat(bdev->internal.stat); 7501 7502 rc = bdev->fn_table->destruct(bdev->ctxt); 7503 if (rc < 0) { 7504 SPDK_ERRLOG("destruct failed\n"); 7505 } 7506 if (rc <= 0 && cb_fn != NULL) { 7507 cb_fn(cb_arg, rc); 7508 } 7509 } 7510 7511 void 7512 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7513 { 7514 if (bdev->internal.unregister_cb != NULL) { 7515 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7516 } 7517 } 7518 7519 static void 7520 _remove_notify(void *arg) 7521 { 7522 struct spdk_bdev_desc *desc = arg; 7523 7524 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7525 } 7526 7527 /* returns: 0 - bdev removed and 
ready to be destructed. 7528 * -EBUSY - bdev can't be destructed yet. */ 7529 static int 7530 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7531 { 7532 struct spdk_bdev_desc *desc, *tmp; 7533 int rc = 0; 7534 char uuid[SPDK_UUID_STRING_LEN]; 7535 7536 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7537 assert(spdk_spin_held(&bdev->internal.spinlock)); 7538 7539 /* Notify each descriptor about hotremoval */ 7540 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7541 rc = -EBUSY; 7542 /* 7543 * Defer invocation of the event_cb to a separate message that will 7544 * run later on its thread. This ensures this context unwinds and 7545 * we don't recursively unregister this bdev again if the event_cb 7546 * immediately closes its descriptor. 7547 */ 7548 event_notify(desc, _remove_notify); 7549 } 7550 7551 /* If there are no descriptors, proceed removing the bdev */ 7552 if (rc == 0) { 7553 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7554 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7555 7556 /* Delete the name and the UUID alias */ 7557 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7558 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7559 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7560 7561 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7562 7563 if (bdev->internal.reset_in_progress != NULL) { 7564 /* If reset is in progress, let the completion callback for reset 7565 * unregister the bdev. 7566 */ 7567 rc = -EBUSY; 7568 } 7569 } 7570 7571 return rc; 7572 } 7573 7574 static void 7575 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7576 struct spdk_io_channel *io_ch, void *_ctx) 7577 { 7578 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7579 7580 bdev_channel_abort_queued_ios(bdev_ch); 7581 spdk_bdev_for_each_channel_continue(i, 0); 7582 } 7583 7584 static void 7585 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7586 { 7587 int rc; 7588 7589 spdk_spin_lock(&g_bdev_mgr.spinlock); 7590 spdk_spin_lock(&bdev->internal.spinlock); 7591 /* 7592 * Set the status to REMOVING after completing to abort channels. Otherwise, 7593 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7594 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7595 * may fail. 7596 */ 7597 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7598 rc = bdev_unregister_unsafe(bdev); 7599 spdk_spin_unlock(&bdev->internal.spinlock); 7600 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7601 7602 if (rc == 0) { 7603 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7604 } 7605 } 7606 7607 void 7608 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7609 { 7610 struct spdk_thread *thread; 7611 7612 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7613 7614 thread = spdk_get_thread(); 7615 if (!thread) { 7616 /* The user called this from a non-SPDK thread. 
*/ 7617 if (cb_fn != NULL) { 7618 cb_fn(cb_arg, -ENOTSUP); 7619 } 7620 return; 7621 } 7622 7623 spdk_spin_lock(&g_bdev_mgr.spinlock); 7624 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7625 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7626 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7627 if (cb_fn) { 7628 cb_fn(cb_arg, -EBUSY); 7629 } 7630 return; 7631 } 7632 7633 spdk_spin_lock(&bdev->internal.spinlock); 7634 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7635 bdev->internal.unregister_cb = cb_fn; 7636 bdev->internal.unregister_ctx = cb_arg; 7637 bdev->internal.unregister_td = thread; 7638 spdk_spin_unlock(&bdev->internal.spinlock); 7639 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7640 7641 spdk_bdev_set_qd_sampling_period(bdev, 0); 7642 7643 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7644 bdev_unregister); 7645 } 7646 7647 int 7648 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7649 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7650 { 7651 struct spdk_bdev_desc *desc; 7652 struct spdk_bdev *bdev; 7653 int rc; 7654 7655 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7656 if (rc != 0) { 7657 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7658 return rc; 7659 } 7660 7661 bdev = spdk_bdev_desc_get_bdev(desc); 7662 7663 if (bdev->module != module) { 7664 spdk_bdev_close(desc); 7665 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7666 bdev_name); 7667 return -ENODEV; 7668 } 7669 7670 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7671 7672 spdk_bdev_close(desc); 7673 7674 return 0; 7675 } 7676 7677 static int 7678 bdev_start_qos(struct spdk_bdev *bdev) 7679 { 7680 struct set_qos_limit_ctx *ctx; 7681 7682 /* Enable QoS */ 7683 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7684 ctx = calloc(1, sizeof(*ctx)); 7685 if (ctx == NULL) { 7686 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7687 return -ENOMEM; 7688 } 7689 ctx->bdev = bdev; 7690 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7691 } 7692 7693 return 0; 7694 } 7695 7696 static void 7697 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7698 struct spdk_bdev *bdev) 7699 { 7700 enum spdk_bdev_claim_type type; 7701 const char *typename, *modname; 7702 extern struct spdk_log_flag SPDK_LOG_bdev; 7703 7704 assert(spdk_spin_held(&bdev->internal.spinlock)); 7705 7706 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7707 return; 7708 } 7709 7710 type = bdev->internal.claim_type; 7711 typename = spdk_bdev_claim_get_name(type); 7712 7713 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7714 modname = bdev->internal.claim.v1.module->name; 7715 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7716 bdev->name, detail, typename, modname); 7717 return; 7718 } 7719 7720 if (claim_type_is_v2(type)) { 7721 struct spdk_bdev_module_claim *claim; 7722 7723 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7724 modname = claim->module->name; 7725 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7726 bdev->name, detail, typename, modname); 7727 } 7728 return; 7729 } 7730 7731 assert(false); 7732 } 7733 7734 static int 7735 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7736 { 7737 struct spdk_thread *thread; 7738 int rc = 0; 7739 7740 thread = spdk_get_thread(); 7741 if (!thread) { 7742 
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7743 return -ENOTSUP; 7744 } 7745 7746 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7747 spdk_get_thread()); 7748 7749 desc->bdev = bdev; 7750 desc->thread = thread; 7751 desc->write = write; 7752 7753 spdk_spin_lock(&bdev->internal.spinlock); 7754 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7755 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7756 spdk_spin_unlock(&bdev->internal.spinlock); 7757 return -ENODEV; 7758 } 7759 7760 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7761 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7762 spdk_spin_unlock(&bdev->internal.spinlock); 7763 return -EPERM; 7764 } 7765 7766 rc = bdev_start_qos(bdev); 7767 if (rc != 0) { 7768 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7769 spdk_spin_unlock(&bdev->internal.spinlock); 7770 return rc; 7771 } 7772 7773 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7774 7775 spdk_spin_unlock(&bdev->internal.spinlock); 7776 7777 return 0; 7778 } 7779 7780 static int 7781 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7782 struct spdk_bdev_desc **_desc) 7783 { 7784 struct spdk_bdev_desc *desc; 7785 unsigned int i; 7786 7787 desc = calloc(1, sizeof(*desc)); 7788 if (desc == NULL) { 7789 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7790 return -ENOMEM; 7791 } 7792 7793 TAILQ_INIT(&desc->pending_media_events); 7794 TAILQ_INIT(&desc->free_media_events); 7795 7796 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7797 desc->callback.event_fn = event_cb; 7798 desc->callback.ctx = event_ctx; 7799 spdk_spin_init(&desc->spinlock); 7800 7801 if (bdev->media_events) { 7802 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7803 sizeof(*desc->media_events_buffer)); 7804 if (desc->media_events_buffer == NULL) { 7805 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7806 bdev_desc_free(desc); 7807 return -ENOMEM; 7808 } 7809 7810 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 7811 TAILQ_INSERT_TAIL(&desc->free_media_events, 7812 &desc->media_events_buffer[i], tailq); 7813 } 7814 } 7815 7816 if (bdev->fn_table->accel_sequence_supported != NULL) { 7817 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7818 desc->accel_sequence_supported[i] = 7819 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7820 (enum spdk_bdev_io_type)i); 7821 } 7822 } 7823 7824 *_desc = desc; 7825 7826 return 0; 7827 } 7828 7829 static int 7830 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7831 void *event_ctx, struct spdk_bdev_desc **_desc) 7832 { 7833 struct spdk_bdev_desc *desc; 7834 struct spdk_bdev *bdev; 7835 int rc; 7836 7837 bdev = bdev_get_by_name(bdev_name); 7838 7839 if (bdev == NULL) { 7840 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7841 return -ENODEV; 7842 } 7843 7844 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7845 if (rc != 0) { 7846 return rc; 7847 } 7848 7849 rc = bdev_open(bdev, write, desc); 7850 if (rc != 0) { 7851 bdev_desc_free(desc); 7852 desc = NULL; 7853 } 7854 7855 *_desc = desc; 7856 7857 return rc; 7858 } 7859 7860 int 7861 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7862 void *event_ctx, struct spdk_bdev_desc **_desc) 7863 { 7864 int rc; 7865 7866 if (event_cb == NULL) { 7867 SPDK_ERRLOG("Missing event callback function\n"); 7868 return 
-EINVAL; 7869 } 7870 7871 spdk_spin_lock(&g_bdev_mgr.spinlock); 7872 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 7873 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7874 7875 return rc; 7876 } 7877 7878 static void 7879 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7880 { 7881 int rc; 7882 7883 spdk_spin_lock(&bdev->internal.spinlock); 7884 spdk_spin_lock(&desc->spinlock); 7885 7886 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7887 7888 desc->closed = true; 7889 7890 if (desc->claim != NULL) { 7891 bdev_desc_release_claims(desc); 7892 } 7893 7894 if (0 == desc->refs) { 7895 spdk_spin_unlock(&desc->spinlock); 7896 bdev_desc_free(desc); 7897 } else { 7898 spdk_spin_unlock(&desc->spinlock); 7899 } 7900 7901 /* If no more descriptors, kill QoS channel */ 7902 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7903 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7904 bdev->name, spdk_get_thread()); 7905 7906 if (bdev_qos_destroy(bdev)) { 7907 /* There isn't anything we can do to recover here. Just let the 7908 * old QoS poller keep running. The QoS handling won't change 7909 * cores when the user allocates a new channel, but it won't break. */ 7910 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 7911 } 7912 } 7913 7914 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7915 rc = bdev_unregister_unsafe(bdev); 7916 spdk_spin_unlock(&bdev->internal.spinlock); 7917 7918 if (rc == 0) { 7919 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7920 } 7921 } else { 7922 spdk_spin_unlock(&bdev->internal.spinlock); 7923 } 7924 } 7925 7926 void 7927 spdk_bdev_close(struct spdk_bdev_desc *desc) 7928 { 7929 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7930 7931 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7932 spdk_get_thread()); 7933 7934 assert(desc->thread == spdk_get_thread()); 7935 7936 spdk_poller_unregister(&desc->io_timeout_poller); 7937 7938 spdk_spin_lock(&g_bdev_mgr.spinlock); 7939 7940 bdev_close(bdev, desc); 7941 7942 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7943 } 7944 7945 static void 7946 bdev_register_finished(void *arg) 7947 { 7948 struct spdk_bdev_desc *desc = arg; 7949 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7950 7951 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7952 7953 spdk_spin_lock(&g_bdev_mgr.spinlock); 7954 7955 bdev_close(bdev, desc); 7956 7957 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7958 } 7959 7960 int 7961 spdk_bdev_register(struct spdk_bdev *bdev) 7962 { 7963 struct spdk_bdev_desc *desc; 7964 struct spdk_thread *thread = spdk_get_thread(); 7965 int rc; 7966 7967 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 7968 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 7969 thread ? 
spdk_thread_get_name(thread) : "null"); 7970 return -EINVAL; 7971 } 7972 7973 rc = bdev_register(bdev); 7974 if (rc != 0) { 7975 return rc; 7976 } 7977 7978 /* A descriptor is opened to prevent bdev deletion during examination */ 7979 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7980 if (rc != 0) { 7981 spdk_bdev_unregister(bdev, NULL, NULL); 7982 return rc; 7983 } 7984 7985 rc = bdev_open(bdev, false, desc); 7986 if (rc != 0) { 7987 bdev_desc_free(desc); 7988 spdk_bdev_unregister(bdev, NULL, NULL); 7989 return rc; 7990 } 7991 7992 /* Examine configuration before initializing I/O */ 7993 bdev_examine(bdev); 7994 7995 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 7996 if (rc != 0) { 7997 bdev_close(bdev, desc); 7998 spdk_bdev_unregister(bdev, NULL, NULL); 7999 } 8000 8001 return rc; 8002 } 8003 8004 int 8005 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8006 struct spdk_bdev_module *module) 8007 { 8008 spdk_spin_lock(&bdev->internal.spinlock); 8009 8010 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8011 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8012 spdk_spin_unlock(&bdev->internal.spinlock); 8013 return -EPERM; 8014 } 8015 8016 if (desc && !desc->write) { 8017 desc->write = true; 8018 } 8019 8020 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8021 bdev->internal.claim.v1.module = module; 8022 8023 spdk_spin_unlock(&bdev->internal.spinlock); 8024 return 0; 8025 } 8026 8027 void 8028 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8029 { 8030 spdk_spin_lock(&bdev->internal.spinlock); 8031 8032 assert(bdev->internal.claim.v1.module != NULL); 8033 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8034 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8035 bdev->internal.claim.v1.module = NULL; 8036 8037 spdk_spin_unlock(&bdev->internal.spinlock); 8038 } 8039 8040 /* 8041 * Start claims v2 8042 */ 8043 8044 const char * 8045 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8046 { 8047 switch (type) { 8048 case SPDK_BDEV_CLAIM_NONE: 8049 return "not_claimed"; 8050 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8051 return "exclusive_write"; 8052 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8053 return "read_many_write_one"; 8054 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8055 return "read_many_write_none"; 8056 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8057 return "read_many_write_many"; 8058 default: 8059 break; 8060 } 8061 return "invalid_claim"; 8062 } 8063 8064 static bool 8065 claim_type_is_v2(enum spdk_bdev_claim_type type) 8066 { 8067 switch (type) { 8068 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8069 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8070 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8071 return true; 8072 default: 8073 break; 8074 } 8075 return false; 8076 } 8077 8078 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
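* (true for READ_MANY_WRITE_ONE and READ_MANY_WRITE_SHARED claims)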
*/ 8079 static bool 8080 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8081 { 8082 switch (type) { 8083 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8084 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8085 return true; 8086 default: 8087 break; 8088 } 8089 return false; 8090 } 8091 8092 void 8093 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8094 { 8095 if (opts == NULL) { 8096 SPDK_ERRLOG("opts should not be NULL\n"); 8097 assert(opts != NULL); 8098 return; 8099 } 8100 if (size == 0) { 8101 SPDK_ERRLOG("size should not be zero\n"); 8102 assert(size != 0); 8103 return; 8104 } 8105 8106 memset(opts, 0, size); 8107 opts->opts_size = size; 8108 8109 #define FIELD_OK(field) \ 8110 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8111 8112 #define SET_FIELD(field, value) \ 8113 if (FIELD_OK(field)) { \ 8114 opts->field = value; \ 8115 } \ 8116 8117 SET_FIELD(shared_claim_key, 0); 8118 8119 #undef FIELD_OK 8120 #undef SET_FIELD 8121 } 8122 8123 static int 8124 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8125 { 8126 if (src->opts_size == 0) { 8127 SPDK_ERRLOG("size should not be zero\n"); 8128 return -1; 8129 } 8130 8131 memset(dst, 0, sizeof(*dst)); 8132 dst->opts_size = src->opts_size; 8133 8134 #define FIELD_OK(field) \ 8135 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8136 8137 #define SET_FIELD(field) \ 8138 if (FIELD_OK(field)) { \ 8139 dst->field = src->field; \ 8140 } \ 8141 8142 if (FIELD_OK(name)) { 8143 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8144 } 8145 8146 SET_FIELD(shared_claim_key); 8147 8148 /* You should not remove this statement, but need to update the assert statement 8149 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8150 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8151 8152 #undef FIELD_OK 8153 #undef SET_FIELD 8154 return 0; 8155 } 8156 8157 /* Returns 0 if a read-write-once claim can be taken. */ 8158 static int 8159 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8160 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8161 { 8162 struct spdk_bdev *bdev = desc->bdev; 8163 struct spdk_bdev_desc *open_desc; 8164 8165 assert(spdk_spin_held(&bdev->internal.spinlock)); 8166 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8167 8168 if (opts->shared_claim_key != 0) { 8169 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8170 bdev->name); 8171 return -EINVAL; 8172 } 8173 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8174 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8175 return -EPERM; 8176 } 8177 if (desc->claim != NULL) { 8178 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8179 bdev->name, desc->claim->module->name); 8180 return -EPERM; 8181 } 8182 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8183 if (desc != open_desc && open_desc->write) { 8184 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8185 "another descriptor is open for writing\n", 8186 bdev->name); 8187 return -EPERM; 8188 } 8189 } 8190 8191 return 0; 8192 } 8193 8194 /* Returns 0 if a read-only-many claim can be taken. 
*/ 8195 static int 8196 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8197 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8198 { 8199 struct spdk_bdev *bdev = desc->bdev; 8200 struct spdk_bdev_desc *open_desc; 8201 8202 assert(spdk_spin_held(&bdev->internal.spinlock)); 8203 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 8204 assert(desc->claim == NULL); 8205 8206 if (desc->write) { 8207 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 8208 bdev->name); 8209 return -EINVAL; 8210 } 8211 if (opts->shared_claim_key != 0) { 8212 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 8213 return -EINVAL; 8214 } 8215 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8216 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8217 if (open_desc->write) { 8218 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 8219 "another descriptor is open for writing\n", 8220 bdev->name); 8221 return -EPERM; 8222 } 8223 } 8224 } 8225 8226 return 0; 8227 } 8228 8229 /* Returns 0 if a read-write-many claim can be taken. */ 8230 static int 8231 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8232 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8233 { 8234 struct spdk_bdev *bdev = desc->bdev; 8235 struct spdk_bdev_desc *open_desc; 8236 8237 assert(spdk_spin_held(&bdev->internal.spinlock)); 8238 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 8239 assert(desc->claim == NULL); 8240 8241 if (opts->shared_claim_key == 0) { 8242 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 8243 bdev->name); 8244 return -EINVAL; 8245 } 8246 switch (bdev->internal.claim_type) { 8247 case SPDK_BDEV_CLAIM_NONE: 8248 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8249 if (open_desc == desc) { 8250 continue; 8251 } 8252 if (open_desc->write) { 8253 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 8254 "another descriptor is open for writing without a " 8255 "claim\n", bdev->name); 8256 return -EPERM; 8257 } 8258 } 8259 break; 8260 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8261 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 8262 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 8263 return -EPERM; 8264 } 8265 break; 8266 default: 8267 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8268 return -EBUSY; 8269 } 8270 8271 return 0; 8272 } 8273 8274 /* Updates desc and its bdev with a v2 claim.
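* The first v2 claim on a bdev records the claim type and shared key and initializes the claim list; later claims of the same type are appended to that list.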
*/ 8275 static int 8276 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8277 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8278 { 8279 struct spdk_bdev *bdev = desc->bdev; 8280 struct spdk_bdev_module_claim *claim; 8281 8282 assert(spdk_spin_held(&bdev->internal.spinlock)); 8283 assert(claim_type_is_v2(type)); 8284 assert(desc->claim == NULL); 8285 8286 claim = calloc(1, sizeof(*desc->claim)); 8287 if (claim == NULL) { 8288 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8289 return -ENOMEM; 8290 } 8291 claim->module = module; 8292 claim->desc = desc; 8293 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8294 memcpy(claim->name, opts->name, sizeof(claim->name)); 8295 desc->claim = claim; 8296 8297 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8298 bdev->internal.claim_type = type; 8299 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8300 bdev->internal.claim.v2.key = opts->shared_claim_key; 8301 } 8302 assert(type == bdev->internal.claim_type); 8303 8304 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8305 8306 if (!desc->write && claim_type_promotes_to_write(type)) { 8307 desc->write = true; 8308 } 8309 8310 return 0; 8311 } 8312 8313 int 8314 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8315 struct spdk_bdev_claim_opts *_opts, 8316 struct spdk_bdev_module *module) 8317 { 8318 struct spdk_bdev *bdev; 8319 struct spdk_bdev_claim_opts opts; 8320 int rc = 0; 8321 8322 if (desc == NULL) { 8323 SPDK_ERRLOG("descriptor must not be NULL\n"); 8324 return -EINVAL; 8325 } 8326 8327 bdev = desc->bdev; 8328 8329 if (_opts == NULL) { 8330 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8331 } else if (claim_opts_copy(_opts, &opts) != 0) { 8332 return -EINVAL; 8333 } 8334 8335 spdk_spin_lock(&bdev->internal.spinlock); 8336 8337 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8338 bdev->internal.claim_type != type) { 8339 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8340 spdk_spin_unlock(&bdev->internal.spinlock); 8341 return -EPERM; 8342 } 8343 8344 if (claim_type_is_v2(type) && desc->claim != NULL) { 8345 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8346 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8347 spdk_spin_unlock(&bdev->internal.spinlock); 8348 return -EPERM; 8349 } 8350 8351 switch (type) { 8352 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8353 spdk_spin_unlock(&bdev->internal.spinlock); 8354 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8355 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8356 rc = claim_verify_rwo(desc, type, &opts, module); 8357 break; 8358 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8359 rc = claim_verify_rom(desc, type, &opts, module); 8360 break; 8361 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8362 rc = claim_verify_rwm(desc, type, &opts, module); 8363 break; 8364 default: 8365 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8366 rc = -ENOTSUP; 8367 } 8368 8369 if (rc == 0) { 8370 rc = claim_bdev(desc, type, &opts, module); 8371 } 8372 8373 spdk_spin_unlock(&bdev->internal.spinlock); 8374 return rc; 8375 } 8376 8377 static void 8378 claim_reset(struct spdk_bdev *bdev) 8379 { 8380 assert(spdk_spin_held(&bdev->internal.spinlock)); 8381 assert(claim_type_is_v2(bdev->internal.claim_type)); 8382 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8383 8384 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8385 
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8386 } 8387 8388 static void 8389 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8390 { 8391 struct spdk_bdev *bdev = desc->bdev; 8392 8393 assert(spdk_spin_held(&bdev->internal.spinlock)); 8394 assert(claim_type_is_v2(bdev->internal.claim_type)); 8395 8396 if (bdev->internal.examine_in_progress == 0) { 8397 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8398 free(desc->claim); 8399 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8400 claim_reset(bdev); 8401 } 8402 } else { 8403 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8404 desc->claim->module = NULL; 8405 desc->claim->desc = NULL; 8406 } 8407 desc->claim = NULL; 8408 } 8409 8410 /* 8411 * End claims v2 8412 */ 8413 8414 struct spdk_bdev * 8415 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8416 { 8417 assert(desc != NULL); 8418 return desc->bdev; 8419 } 8420 8421 int 8422 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8423 { 8424 struct spdk_bdev *bdev, *tmp; 8425 struct spdk_bdev_desc *desc; 8426 int rc = 0; 8427 8428 assert(fn != NULL); 8429 8430 spdk_spin_lock(&g_bdev_mgr.spinlock); 8431 bdev = spdk_bdev_first(); 8432 while (bdev != NULL) { 8433 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8434 if (rc != 0) { 8435 break; 8436 } 8437 rc = bdev_open(bdev, false, desc); 8438 if (rc != 0) { 8439 bdev_desc_free(desc); 8440 if (rc == -ENODEV) { 8441 /* Ignore the error and move to the next bdev. */ 8442 rc = 0; 8443 bdev = spdk_bdev_next(bdev); 8444 continue; 8445 } 8446 break; 8447 } 8448 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8449 8450 rc = fn(ctx, bdev); 8451 8452 spdk_spin_lock(&g_bdev_mgr.spinlock); 8453 tmp = spdk_bdev_next(bdev); 8454 bdev_close(bdev, desc); 8455 if (rc != 0) { 8456 break; 8457 } 8458 bdev = tmp; 8459 } 8460 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8461 8462 return rc; 8463 } 8464 8465 int 8466 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8467 { 8468 struct spdk_bdev *bdev, *tmp; 8469 struct spdk_bdev_desc *desc; 8470 int rc = 0; 8471 8472 assert(fn != NULL); 8473 8474 spdk_spin_lock(&g_bdev_mgr.spinlock); 8475 bdev = spdk_bdev_first_leaf(); 8476 while (bdev != NULL) { 8477 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8478 if (rc != 0) { 8479 break; 8480 } 8481 rc = bdev_open(bdev, false, desc); 8482 if (rc != 0) { 8483 bdev_desc_free(desc); 8484 if (rc == -ENODEV) { 8485 /* Ignore the error and move to the next bdev. 
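* bdev_open() returns -ENODEV when the bdev is already being unregistered.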
*/ 8486 rc = 0; 8487 bdev = spdk_bdev_next_leaf(bdev); 8488 continue; 8489 } 8490 break; 8491 } 8492 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8493 8494 rc = fn(ctx, bdev); 8495 8496 spdk_spin_lock(&g_bdev_mgr.spinlock); 8497 tmp = spdk_bdev_next_leaf(bdev); 8498 bdev_close(bdev, desc); 8499 if (rc != 0) { 8500 break; 8501 } 8502 bdev = tmp; 8503 } 8504 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8505 8506 return rc; 8507 } 8508 8509 void 8510 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8511 { 8512 struct iovec *iovs; 8513 int iovcnt; 8514 8515 if (bdev_io == NULL) { 8516 return; 8517 } 8518 8519 switch (bdev_io->type) { 8520 case SPDK_BDEV_IO_TYPE_READ: 8521 case SPDK_BDEV_IO_TYPE_WRITE: 8522 case SPDK_BDEV_IO_TYPE_ZCOPY: 8523 iovs = bdev_io->u.bdev.iovs; 8524 iovcnt = bdev_io->u.bdev.iovcnt; 8525 break; 8526 default: 8527 iovs = NULL; 8528 iovcnt = 0; 8529 break; 8530 } 8531 8532 if (iovp) { 8533 *iovp = iovs; 8534 } 8535 if (iovcntp) { 8536 *iovcntp = iovcnt; 8537 } 8538 } 8539 8540 void * 8541 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8542 { 8543 if (bdev_io == NULL) { 8544 return NULL; 8545 } 8546 8547 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8548 return NULL; 8549 } 8550 8551 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8552 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8553 return bdev_io->u.bdev.md_buf; 8554 } 8555 8556 return NULL; 8557 } 8558 8559 void * 8560 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 8561 { 8562 if (bdev_io == NULL) { 8563 assert(false); 8564 return NULL; 8565 } 8566 8567 return bdev_io->internal.caller_ctx; 8568 } 8569 8570 void 8571 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 8572 { 8573 8574 if (spdk_bdev_module_list_find(bdev_module->name)) { 8575 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 8576 assert(false); 8577 } 8578 8579 spdk_spin_init(&bdev_module->internal.spinlock); 8580 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 8581 8582 /* 8583 * Modules with examine callbacks must be initialized first, so they are 8584 * ready to handle examine callbacks from later modules that will 8585 * register physical bdevs. 
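* For example, a virtual bdev module that consumes bdevs registered by later-initialized modules depends on this ordering.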
8586 */ 8587 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8588 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8589 } else { 8590 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8591 } 8592 } 8593 8594 struct spdk_bdev_module * 8595 spdk_bdev_module_list_find(const char *name) 8596 { 8597 struct spdk_bdev_module *bdev_module; 8598 8599 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8600 if (strcmp(name, bdev_module->name) == 0) { 8601 break; 8602 } 8603 } 8604 8605 return bdev_module; 8606 } 8607 8608 static int 8609 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 8610 { 8611 uint64_t num_blocks; 8612 void *md_buf = NULL; 8613 8614 num_blocks = bdev_io->u.bdev.num_blocks; 8615 8616 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8617 md_buf = (char *)g_bdev_mgr.zero_buffer + 8618 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8619 } 8620 8621 return bdev_write_blocks_with_md(bdev_io->internal.desc, 8622 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8623 g_bdev_mgr.zero_buffer, md_buf, 8624 bdev_io->u.bdev.offset_blocks, num_blocks, 8625 bdev_write_zero_buffer_done, bdev_io); 8626 } 8627 8628 static void 8629 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8630 { 8631 struct spdk_bdev_io *parent_io = cb_arg; 8632 8633 spdk_bdev_free_io(bdev_io); 8634 8635 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 8636 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 8637 } 8638 8639 static void 8640 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8641 { 8642 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8643 ctx->bdev->internal.qos_mod_in_progress = false; 8644 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8645 8646 if (ctx->cb_fn) { 8647 ctx->cb_fn(ctx->cb_arg, status); 8648 } 8649 free(ctx); 8650 } 8651 8652 static void 8653 bdev_disable_qos_done(void *cb_arg) 8654 { 8655 struct set_qos_limit_ctx *ctx = cb_arg; 8656 struct spdk_bdev *bdev = ctx->bdev; 8657 struct spdk_bdev_io *bdev_io; 8658 struct spdk_bdev_qos *qos; 8659 8660 spdk_spin_lock(&bdev->internal.spinlock); 8661 qos = bdev->internal.qos; 8662 bdev->internal.qos = NULL; 8663 spdk_spin_unlock(&bdev->internal.spinlock); 8664 8665 while (!TAILQ_EMPTY(&qos->queued)) { 8666 /* Send queued I/O back to their original thread for resubmission. */ 8667 bdev_io = TAILQ_FIRST(&qos->queued); 8668 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8669 8670 if (bdev_io->internal.io_submit_ch) { 8671 /* 8672 * Channel was changed when sending it to the QoS thread - change it back 8673 * before sending it back to the original thread. 
8674 */ 8675 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8676 bdev_io->internal.io_submit_ch = NULL; 8677 } 8678 8679 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8680 _bdev_io_submit, bdev_io); 8681 } 8682 8683 if (qos->thread != NULL) { 8684 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8685 spdk_poller_unregister(&qos->poller); 8686 } 8687 8688 free(qos); 8689 8690 bdev_set_qos_limit_done(ctx, 0); 8691 } 8692 8693 static void 8694 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8695 { 8696 struct set_qos_limit_ctx *ctx = _ctx; 8697 struct spdk_thread *thread; 8698 8699 spdk_spin_lock(&bdev->internal.spinlock); 8700 thread = bdev->internal.qos->thread; 8701 spdk_spin_unlock(&bdev->internal.spinlock); 8702 8703 if (thread != NULL) { 8704 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8705 } else { 8706 bdev_disable_qos_done(ctx); 8707 } 8708 } 8709 8710 static void 8711 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8712 struct spdk_io_channel *ch, void *_ctx) 8713 { 8714 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8715 8716 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8717 8718 spdk_bdev_for_each_channel_continue(i, 0); 8719 } 8720 8721 static void 8722 bdev_update_qos_rate_limit_msg(void *cb_arg) 8723 { 8724 struct set_qos_limit_ctx *ctx = cb_arg; 8725 struct spdk_bdev *bdev = ctx->bdev; 8726 8727 spdk_spin_lock(&bdev->internal.spinlock); 8728 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 8729 spdk_spin_unlock(&bdev->internal.spinlock); 8730 8731 bdev_set_qos_limit_done(ctx, 0); 8732 } 8733 8734 static void 8735 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8736 struct spdk_io_channel *ch, void *_ctx) 8737 { 8738 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8739 8740 spdk_spin_lock(&bdev->internal.spinlock); 8741 bdev_enable_qos(bdev, bdev_ch); 8742 spdk_spin_unlock(&bdev->internal.spinlock); 8743 spdk_bdev_for_each_channel_continue(i, 0); 8744 } 8745 8746 static void 8747 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 8748 { 8749 struct set_qos_limit_ctx *ctx = _ctx; 8750 8751 bdev_set_qos_limit_done(ctx, status); 8752 } 8753 8754 static void 8755 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 8756 { 8757 int i; 8758 8759 assert(bdev->internal.qos != NULL); 8760 8761 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8762 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8763 bdev->internal.qos->rate_limits[i].limit = limits[i]; 8764 8765 if (limits[i] == 0) { 8766 bdev->internal.qos->rate_limits[i].limit = 8767 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 8768 } 8769 } 8770 } 8771 } 8772 8773 void 8774 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 8775 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 8776 { 8777 struct set_qos_limit_ctx *ctx; 8778 uint32_t limit_set_complement; 8779 uint64_t min_limit_per_sec; 8780 int i; 8781 bool disable_rate_limit = true; 8782 8783 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8784 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8785 continue; 8786 } 8787 8788 if (limits[i] > 0) { 8789 disable_rate_limit = false; 8790 } 8791 8792 if (bdev_qos_is_iops_rate_limit(i) == true) { 8793 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 8794 } else { 8795 /* Change from megabyte to byte rate limit */ 8796 limits[i] = limits[i] * 1024 * 1024; 8797 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 
8798 } 8799 8800 limit_set_complement = limits[i] % min_limit_per_sec; 8801 if (limit_set_complement) { 8802 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 8803 limits[i], min_limit_per_sec); 8804 limits[i] += min_limit_per_sec - limit_set_complement; 8805 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 8806 } 8807 } 8808 8809 ctx = calloc(1, sizeof(*ctx)); 8810 if (ctx == NULL) { 8811 cb_fn(cb_arg, -ENOMEM); 8812 return; 8813 } 8814 8815 ctx->cb_fn = cb_fn; 8816 ctx->cb_arg = cb_arg; 8817 ctx->bdev = bdev; 8818 8819 spdk_spin_lock(&bdev->internal.spinlock); 8820 if (bdev->internal.qos_mod_in_progress) { 8821 spdk_spin_unlock(&bdev->internal.spinlock); 8822 free(ctx); 8823 cb_fn(cb_arg, -EAGAIN); 8824 return; 8825 } 8826 bdev->internal.qos_mod_in_progress = true; 8827 8828 if (disable_rate_limit == true && bdev->internal.qos) { 8829 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8830 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 8831 (bdev->internal.qos->rate_limits[i].limit > 0 && 8832 bdev->internal.qos->rate_limits[i].limit != 8833 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 8834 disable_rate_limit = false; 8835 break; 8836 } 8837 } 8838 } 8839 8840 if (disable_rate_limit == false) { 8841 if (bdev->internal.qos == NULL) { 8842 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 8843 if (!bdev->internal.qos) { 8844 spdk_spin_unlock(&bdev->internal.spinlock); 8845 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 8846 bdev_set_qos_limit_done(ctx, -ENOMEM); 8847 return; 8848 } 8849 } 8850 8851 if (bdev->internal.qos->thread == NULL) { 8852 /* Enabling */ 8853 bdev_set_qos_rate_limits(bdev, limits); 8854 8855 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 8856 bdev_enable_qos_done); 8857 } else { 8858 /* Updating */ 8859 bdev_set_qos_rate_limits(bdev, limits); 8860 8861 spdk_thread_send_msg(bdev->internal.qos->thread, 8862 bdev_update_qos_rate_limit_msg, ctx); 8863 } 8864 } else { 8865 if (bdev->internal.qos != NULL) { 8866 bdev_set_qos_rate_limits(bdev, limits); 8867 8868 /* Disabling */ 8869 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 8870 bdev_disable_qos_msg_done); 8871 } else { 8872 spdk_spin_unlock(&bdev->internal.spinlock); 8873 bdev_set_qos_limit_done(ctx, 0); 8874 return; 8875 } 8876 } 8877 8878 spdk_spin_unlock(&bdev->internal.spinlock); 8879 } 8880 8881 struct spdk_bdev_histogram_ctx { 8882 spdk_bdev_histogram_status_cb cb_fn; 8883 void *cb_arg; 8884 struct spdk_bdev *bdev; 8885 int status; 8886 }; 8887 8888 static void 8889 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8890 { 8891 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8892 8893 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8894 ctx->bdev->internal.histogram_in_progress = false; 8895 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8896 ctx->cb_fn(ctx->cb_arg, ctx->status); 8897 free(ctx); 8898 } 8899 8900 static void 8901 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8902 struct spdk_io_channel *_ch, void *_ctx) 8903 { 8904 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8905 8906 if (ch->histogram != NULL) { 8907 spdk_histogram_data_free(ch->histogram); 8908 ch->histogram = NULL; 8909 } 8910 spdk_bdev_for_each_channel_continue(i, 0); 8911 } 8912 8913 static void 8914 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8915 { 8916 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8917 8918 if (status 
!= 0) { 8919 ctx->status = status; 8920 ctx->bdev->internal.histogram_enabled = false; 8921 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 8922 bdev_histogram_disable_channel_cb); 8923 } else { 8924 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8925 ctx->bdev->internal.histogram_in_progress = false; 8926 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8927 ctx->cb_fn(ctx->cb_arg, ctx->status); 8928 free(ctx); 8929 } 8930 } 8931 8932 static void 8933 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8934 struct spdk_io_channel *_ch, void *_ctx) 8935 { 8936 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8937 int status = 0; 8938 8939 if (ch->histogram == NULL) { 8940 ch->histogram = spdk_histogram_data_alloc(); 8941 if (ch->histogram == NULL) { 8942 status = -ENOMEM; 8943 } 8944 } 8945 8946 spdk_bdev_for_each_channel_continue(i, status); 8947 } 8948 8949 void 8950 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 8951 void *cb_arg, bool enable) 8952 { 8953 struct spdk_bdev_histogram_ctx *ctx; 8954 8955 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 8956 if (ctx == NULL) { 8957 cb_fn(cb_arg, -ENOMEM); 8958 return; 8959 } 8960 8961 ctx->bdev = bdev; 8962 ctx->status = 0; 8963 ctx->cb_fn = cb_fn; 8964 ctx->cb_arg = cb_arg; 8965 8966 spdk_spin_lock(&bdev->internal.spinlock); 8967 if (bdev->internal.histogram_in_progress) { 8968 spdk_spin_unlock(&bdev->internal.spinlock); 8969 free(ctx); 8970 cb_fn(cb_arg, -EAGAIN); 8971 return; 8972 } 8973 8974 bdev->internal.histogram_in_progress = true; 8975 spdk_spin_unlock(&bdev->internal.spinlock); 8976 8977 bdev->internal.histogram_enabled = enable; 8978 8979 if (enable) { 8980 /* Allocate histogram for each channel */ 8981 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 8982 bdev_histogram_enable_channel_cb); 8983 } else { 8984 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 8985 bdev_histogram_disable_channel_cb); 8986 } 8987 } 8988 8989 struct spdk_bdev_histogram_data_ctx { 8990 spdk_bdev_histogram_data_cb cb_fn; 8991 void *cb_arg; 8992 struct spdk_bdev *bdev; 8993 /** merged histogram data from all channels */ 8994 struct spdk_histogram_data *histogram; 8995 }; 8996 8997 static void 8998 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8999 { 9000 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9001 9002 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9003 free(ctx); 9004 } 9005 9006 static void 9007 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9008 struct spdk_io_channel *_ch, void *_ctx) 9009 { 9010 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9011 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9012 int status = 0; 9013 9014 if (ch->histogram == NULL) { 9015 status = -EFAULT; 9016 } else { 9017 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9018 } 9019 9020 spdk_bdev_for_each_channel_continue(i, status); 9021 } 9022 9023 void 9024 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9025 spdk_bdev_histogram_data_cb cb_fn, 9026 void *cb_arg) 9027 { 9028 struct spdk_bdev_histogram_data_ctx *ctx; 9029 9030 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9031 if (ctx == NULL) { 9032 cb_fn(cb_arg, -ENOMEM, NULL); 9033 return; 9034 } 9035 9036 ctx->bdev = bdev; 9037 ctx->cb_fn = cb_fn; 9038 ctx->cb_arg = cb_arg; 9039 9040 ctx->histogram = histogram; 
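/* Visit every channel and merge its per-channel histogram into the caller-supplied structure; the data callback runs once all channels have been iterated. Callers typically allocate the destination histogram with spdk_histogram_data_alloc() and free it in that callback. */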
9041 9042 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9043 bdev_histogram_get_channel_cb); 9044 } 9045 9046 void 9047 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9048 void *cb_arg) 9049 { 9050 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9051 int status = 0; 9052 9053 assert(cb_fn != NULL); 9054 9055 if (bdev_ch->histogram == NULL) { 9056 status = -EFAULT; 9057 } 9058 cb_fn(cb_arg, status, bdev_ch->histogram); 9059 } 9060 9061 size_t 9062 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9063 size_t max_events) 9064 { 9065 struct media_event_entry *entry; 9066 size_t num_events = 0; 9067 9068 for (; num_events < max_events; ++num_events) { 9069 entry = TAILQ_FIRST(&desc->pending_media_events); 9070 if (entry == NULL) { 9071 break; 9072 } 9073 9074 events[num_events] = entry->event; 9075 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9076 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9077 } 9078 9079 return num_events; 9080 } 9081 9082 int 9083 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9084 size_t num_events) 9085 { 9086 struct spdk_bdev_desc *desc; 9087 struct media_event_entry *entry; 9088 size_t event_id; 9089 int rc = 0; 9090 9091 assert(bdev->media_events); 9092 9093 spdk_spin_lock(&bdev->internal.spinlock); 9094 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9095 if (desc->write) { 9096 break; 9097 } 9098 } 9099 9100 if (desc == NULL || desc->media_events_buffer == NULL) { 9101 rc = -ENODEV; 9102 goto out; 9103 } 9104 9105 for (event_id = 0; event_id < num_events; ++event_id) { 9106 entry = TAILQ_FIRST(&desc->free_media_events); 9107 if (entry == NULL) { 9108 break; 9109 } 9110 9111 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9112 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9113 entry->event = events[event_id]; 9114 } 9115 9116 rc = event_id; 9117 out: 9118 spdk_spin_unlock(&bdev->internal.spinlock); 9119 return rc; 9120 } 9121 9122 static void 9123 _media_management_notify(void *arg) 9124 { 9125 struct spdk_bdev_desc *desc = arg; 9126 9127 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9128 } 9129 9130 void 9131 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9132 { 9133 struct spdk_bdev_desc *desc; 9134 9135 spdk_spin_lock(&bdev->internal.spinlock); 9136 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9137 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9138 event_notify(desc, _media_management_notify); 9139 } 9140 } 9141 spdk_spin_unlock(&bdev->internal.spinlock); 9142 } 9143 9144 struct locked_lba_range_ctx { 9145 struct lba_range range; 9146 struct lba_range *current_range; 9147 struct lba_range *owner_range; 9148 struct spdk_poller *poller; 9149 lock_range_cb cb_fn; 9150 void *cb_arg; 9151 }; 9152 9153 static void 9154 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9155 { 9156 struct locked_lba_range_ctx *ctx = _ctx; 9157 9158 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9159 free(ctx); 9160 } 9161 9162 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9163 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9164 9165 static void 9166 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9167 { 9168 struct locked_lba_range_ctx *ctx = _ctx; 9169 9170 if (status == -ENOMEM) { 9171 /* One of the channels could not allocate a 
range object. 9172 * So we have to go back and clean up any ranges that were 9173 * allocated successfully before we return error status to 9174 * the caller. We can reuse the unlock function to do that 9175 * clean up. 9176 */ 9177 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9178 bdev_lock_error_cleanup_cb); 9179 return; 9180 } 9181 9182 /* All channels have locked this range and no I/O overlapping the range 9183 * are outstanding! Set the owner_ch for the range object for the 9184 * locking channel, so that this channel will know that it is allowed 9185 * to write to this range. 9186 */ 9187 if (ctx->owner_range != NULL) { 9188 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9189 } 9190 9191 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9192 9193 /* Don't free the ctx here. Its range is in the bdev's global list of 9194 * locked ranges still, and will be removed and freed when this range 9195 * is later unlocked. 9196 */ 9197 } 9198 9199 static int 9200 bdev_lock_lba_range_check_io(void *_i) 9201 { 9202 struct spdk_bdev_channel_iter *i = _i; 9203 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9204 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9205 struct locked_lba_range_ctx *ctx = i->ctx; 9206 struct lba_range *range = ctx->current_range; 9207 struct spdk_bdev_io *bdev_io; 9208 9209 spdk_poller_unregister(&ctx->poller); 9210 9211 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9212 * range. But we need to wait until any outstanding IO overlapping with this range 9213 * are completed. 9214 */ 9215 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9216 if (bdev_io_range_is_locked(bdev_io, range)) { 9217 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9218 return SPDK_POLLER_BUSY; 9219 } 9220 } 9221 9222 spdk_bdev_for_each_channel_continue(i, 0); 9223 return SPDK_POLLER_BUSY; 9224 } 9225 9226 static void 9227 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9228 struct spdk_io_channel *_ch, void *_ctx) 9229 { 9230 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9231 struct locked_lba_range_ctx *ctx = _ctx; 9232 struct lba_range *range; 9233 9234 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9235 if (range->length == ctx->range.length && 9236 range->offset == ctx->range.offset && 9237 range->locked_ctx == ctx->range.locked_ctx) { 9238 /* This range already exists on this channel, so don't add 9239 * it again. This can happen when a new channel is created 9240 * while the for_each_channel operation is in progress. 9241 * Do not check for outstanding I/O in that case, since the 9242 * range was locked before any I/O could be submitted to the 9243 * new channel. 9244 */ 9245 spdk_bdev_for_each_channel_continue(i, 0); 9246 return; 9247 } 9248 } 9249 9250 range = calloc(1, sizeof(*range)); 9251 if (range == NULL) { 9252 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9253 return; 9254 } 9255 9256 range->length = ctx->range.length; 9257 range->offset = ctx->range.offset; 9258 range->locked_ctx = ctx->range.locked_ctx; 9259 ctx->current_range = range; 9260 if (ctx->range.owner_ch == ch) { 9261 /* This is the range object for the channel that will hold 9262 * the lock. Store it in the ctx object so that we can easily 9263 * set its owner_ch after the lock is finally acquired. 
9264 */ 9265 ctx->owner_range = range; 9266 } 9267 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9268 bdev_lock_lba_range_check_io(i); 9269 } 9270 9271 static void 9272 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9273 { 9274 assert(spdk_get_thread() == ctx->range.owner_thread); 9275 assert(ctx->range.owner_ch == NULL || 9276 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 9277 9278 /* We will add a copy of this range to each channel now. */ 9279 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9280 bdev_lock_lba_range_cb); 9281 } 9282 9283 static bool 9284 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9285 { 9286 struct lba_range *r; 9287 9288 TAILQ_FOREACH(r, tailq, tailq) { 9289 if (bdev_lba_range_overlapped(range, r)) { 9290 return true; 9291 } 9292 } 9293 return false; 9294 } 9295 9296 static int 9297 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 9298 uint64_t offset, uint64_t length, 9299 lock_range_cb cb_fn, void *cb_arg) 9300 { 9301 struct locked_lba_range_ctx *ctx; 9302 9303 ctx = calloc(1, sizeof(*ctx)); 9304 if (ctx == NULL) { 9305 return -ENOMEM; 9306 } 9307 9308 ctx->range.offset = offset; 9309 ctx->range.length = length; 9310 ctx->range.owner_thread = spdk_get_thread(); 9311 ctx->range.owner_ch = ch; 9312 ctx->range.locked_ctx = cb_arg; 9313 ctx->range.bdev = bdev; 9314 ctx->cb_fn = cb_fn; 9315 ctx->cb_arg = cb_arg; 9316 9317 spdk_spin_lock(&bdev->internal.spinlock); 9318 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 9319 /* There is an active lock overlapping with this range. 9320 * Put it on the pending list until this range no 9321 * longer overlaps with another. 9322 */ 9323 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 9324 } else { 9325 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 9326 bdev_lock_lba_range_ctx(bdev, ctx); 9327 } 9328 spdk_spin_unlock(&bdev->internal.spinlock); 9329 return 0; 9330 } 9331 9332 static int 9333 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9334 uint64_t offset, uint64_t length, 9335 lock_range_cb cb_fn, void *cb_arg) 9336 { 9337 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9338 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9339 9340 if (cb_arg == NULL) { 9341 SPDK_ERRLOG("cb_arg must not be NULL\n"); 9342 return -EINVAL; 9343 } 9344 9345 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 9346 } 9347 9348 static void 9349 bdev_lock_lba_range_ctx_msg(void *_ctx) 9350 { 9351 struct locked_lba_range_ctx *ctx = _ctx; 9352 9353 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 9354 } 9355 9356 static void 9357 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9358 { 9359 struct locked_lba_range_ctx *ctx = _ctx; 9360 struct locked_lba_range_ctx *pending_ctx; 9361 struct lba_range *range, *tmp; 9362 9363 spdk_spin_lock(&bdev->internal.spinlock); 9364 /* Check if there are any pending locked ranges that overlap with this range 9365 * that was just unlocked. If there are, check that it doesn't overlap with any 9366 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 9367 * the lock process. 
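 * The pending lock is restarted on its owner thread via spdk_thread_send_msg(),
 * since bdev_lock_lba_range_ctx() asserts that it runs on range.owner_thread.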
9368 */ 9369 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 9370 if (bdev_lba_range_overlapped(range, &ctx->range) && 9371 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 9372 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 9373 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9374 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 9375 spdk_thread_send_msg(pending_ctx->range.owner_thread, 9376 bdev_lock_lba_range_ctx_msg, pending_ctx); 9377 } 9378 } 9379 spdk_spin_unlock(&bdev->internal.spinlock); 9380 9381 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9382 free(ctx); 9383 } 9384 9385 static void 9386 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9387 struct spdk_io_channel *_ch, void *_ctx) 9388 { 9389 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9390 struct locked_lba_range_ctx *ctx = _ctx; 9391 TAILQ_HEAD(, spdk_bdev_io) io_locked; 9392 struct spdk_bdev_io *bdev_io; 9393 struct lba_range *range; 9394 9395 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9396 if (ctx->range.offset == range->offset && 9397 ctx->range.length == range->length && 9398 ctx->range.locked_ctx == range->locked_ctx) { 9399 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 9400 free(range); 9401 break; 9402 } 9403 } 9404 9405 /* Note: we should almost always be able to assert that the range specified 9406 * was found. But there are some very rare corner cases where a new channel 9407 * gets created simultaneously with a range unlock, where this function 9408 * would execute on that new channel and wouldn't have the range. 9409 * We also use this to clean up range allocations when a later allocation 9410 * fails in the locking path. 9411 * So we can't actually assert() here. 9412 */ 9413 9414 /* Swap the locked IO into a temporary list, and then try to submit them again. 9415 * We could hyper-optimize this to only resubmit locked I/O that overlap 9416 * with the range that was just unlocked, but this isn't a performance path so 9417 * we go for simplicity here. 9418 */ 9419 TAILQ_INIT(&io_locked); 9420 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 9421 while (!TAILQ_EMPTY(&io_locked)) { 9422 bdev_io = TAILQ_FIRST(&io_locked); 9423 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 9424 bdev_io_submit(bdev_io); 9425 } 9426 9427 spdk_bdev_for_each_channel_continue(i, 0); 9428 } 9429 9430 static int 9431 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 9432 lock_range_cb cb_fn, void *cb_arg) 9433 { 9434 struct locked_lba_range_ctx *ctx; 9435 struct lba_range *range; 9436 9437 spdk_spin_lock(&bdev->internal.spinlock); 9438 /* To start the unlock the process, we find the range in the bdev's locked_ranges 9439 * and remove it. This ensures new channels don't inherit the locked range. 9440 * Then we will send a message to each channel to remove the range from its 9441 * per-channel list. 
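 * The per-channel copies are freed in bdev_unlock_lba_range_get_channel();
 * the range embedded in the locked_lba_range_ctx itself is freed together
 * with the ctx in bdev_unlock_lba_range_cb() once every channel has replied.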
9442 */ 9443 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 9444 if (range->offset == offset && range->length == length && 9445 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 9446 break; 9447 } 9448 } 9449 if (range == NULL) { 9450 assert(false); 9451 spdk_spin_unlock(&bdev->internal.spinlock); 9452 return -EINVAL; 9453 } 9454 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 9455 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9456 spdk_spin_unlock(&bdev->internal.spinlock); 9457 9458 ctx->cb_fn = cb_fn; 9459 ctx->cb_arg = cb_arg; 9460 9461 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9462 bdev_unlock_lba_range_cb); 9463 return 0; 9464 } 9465 9466 static int 9467 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9468 uint64_t offset, uint64_t length, 9469 lock_range_cb cb_fn, void *cb_arg) 9470 { 9471 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9472 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9473 struct lba_range *range; 9474 bool range_found = false; 9475 9476 /* Let's make sure the specified channel actually has a lock on 9477 * the specified range. Note that the range must match exactly. 9478 */ 9479 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9480 if (range->offset == offset && range->length == length && 9481 range->owner_ch == ch && range->locked_ctx == cb_arg) { 9482 range_found = true; 9483 break; 9484 } 9485 } 9486 9487 if (!range_found) { 9488 return -EINVAL; 9489 } 9490 9491 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 9492 } 9493 9494 struct bdev_quiesce_ctx { 9495 spdk_bdev_quiesce_cb cb_fn; 9496 void *cb_arg; 9497 }; 9498 9499 static void 9500 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 9501 { 9502 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9503 9504 if (quiesce_ctx->cb_fn != NULL) { 9505 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9506 } 9507 9508 free(quiesce_ctx); 9509 } 9510 9511 static void 9512 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 9513 { 9514 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9515 struct spdk_bdev_module *module = range->bdev->module; 9516 9517 if (status != 0) { 9518 if (quiesce_ctx->cb_fn != NULL) { 9519 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9520 } 9521 free(quiesce_ctx); 9522 return; 9523 } 9524 9525 spdk_spin_lock(&module->internal.spinlock); 9526 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 9527 spdk_spin_unlock(&module->internal.spinlock); 9528 9529 if (quiesce_ctx->cb_fn != NULL) { 9530 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9531 quiesce_ctx->cb_fn = NULL; 9532 quiesce_ctx->cb_arg = NULL; 9533 } 9534 /* quiesce_ctx will be freed on unquiesce */ 9535 } 9536 9537 static int 9538 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9539 uint64_t offset, uint64_t length, 9540 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 9541 bool unquiesce) 9542 { 9543 struct bdev_quiesce_ctx *quiesce_ctx; 9544 int rc; 9545 9546 if (module != bdev->module) { 9547 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 9548 return -EINVAL; 9549 } 9550 9551 if (!bdev_io_valid_blocks(bdev, offset, length)) { 9552 return -EINVAL; 9553 } 9554 9555 if (unquiesce) { 9556 struct lba_range *range; 9557 9558 /* Make sure the specified range is actually quiesced in the specified module and 9559 * then remove it from the list. Note that the range must match exactly. 
9560 */ 9561 spdk_spin_lock(&module->internal.spinlock); 9562 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 9563 if (range->bdev == bdev && range->offset == offset && range->length == length) { 9564 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 9565 break; 9566 } 9567 } 9568 spdk_spin_unlock(&module->internal.spinlock); 9569 9570 if (range == NULL) { 9571 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 9572 return -EINVAL; 9573 } 9574 9575 quiesce_ctx = range->locked_ctx; 9576 quiesce_ctx->cb_fn = cb_fn; 9577 quiesce_ctx->cb_arg = cb_arg; 9578 9579 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 9580 } else { 9581 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 9582 if (quiesce_ctx == NULL) { 9583 return -ENOMEM; 9584 } 9585 9586 quiesce_ctx->cb_fn = cb_fn; 9587 quiesce_ctx->cb_arg = cb_arg; 9588 9589 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 9590 if (rc != 0) { 9591 free(quiesce_ctx); 9592 } 9593 } 9594 9595 return rc; 9596 } 9597 9598 int 9599 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9600 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9601 { 9602 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 9603 } 9604 9605 int 9606 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9607 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9608 { 9609 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 9610 } 9611 9612 int 9613 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9614 uint64_t offset, uint64_t length, 9615 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9616 { 9617 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 9618 } 9619 9620 int 9621 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9622 uint64_t offset, uint64_t length, 9623 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9624 { 9625 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 9626 } 9627 9628 int 9629 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 9630 int array_size) 9631 { 9632 if (!bdev) { 9633 return -EINVAL; 9634 } 9635 9636 if (bdev->fn_table->get_memory_domains) { 9637 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 9638 } 9639 9640 return 0; 9641 } 9642 9643 struct spdk_bdev_for_each_io_ctx { 9644 void *ctx; 9645 spdk_bdev_io_fn fn; 9646 spdk_bdev_for_each_io_cb cb; 9647 }; 9648 9649 static void 9650 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9651 struct spdk_io_channel *io_ch, void *_ctx) 9652 { 9653 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 9654 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 9655 struct spdk_bdev_io *bdev_io; 9656 int rc = 0; 9657 9658 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 9659 rc = ctx->fn(ctx->ctx, bdev_io); 9660 if (rc != 0) { 9661 break; 9662 } 9663 } 9664 9665 spdk_bdev_for_each_channel_continue(i, rc); 9666 } 9667 9668 static void 9669 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 9670 { 9671 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 9672 9673 ctx->cb(ctx->ctx, status); 9674 9675 free(ctx); 9676 } 9677 9678 void 9679 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 9680 spdk_bdev_for_each_io_cb cb) 9681 { 9682 
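	/*
	 * fn is invoked for every I/O on each channel's io_submitted list; if fn
	 * returns non-zero the per-channel loop stops and that status is passed to
	 * spdk_bdev_for_each_channel_continue(). cb runs once the channel walk
	 * completes (or immediately with -ENOMEM if the context allocation fails).
	 */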
struct spdk_bdev_for_each_io_ctx *ctx; 9683 9684 assert(fn != NULL && cb != NULL); 9685 9686 ctx = calloc(1, sizeof(*ctx)); 9687 if (ctx == NULL) { 9688 SPDK_ERRLOG("Failed to allocate context.\n"); 9689 cb(_ctx, -ENOMEM); 9690 return; 9691 } 9692 9693 ctx->ctx = _ctx; 9694 ctx->fn = fn; 9695 ctx->cb = cb; 9696 9697 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 9698 bdev_for_each_io_done); 9699 } 9700 9701 void 9702 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 9703 { 9704 spdk_for_each_channel_continue(iter->i, status); 9705 } 9706 9707 static struct spdk_bdev * 9708 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 9709 { 9710 void *io_device = spdk_io_channel_iter_get_io_device(i); 9711 9712 return __bdev_from_io_dev(io_device); 9713 } 9714 9715 static void 9716 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 9717 { 9718 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 9719 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 9720 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 9721 9722 iter->i = i; 9723 iter->fn(iter, bdev, ch, iter->ctx); 9724 } 9725 9726 static void 9727 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 9728 { 9729 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 9730 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 9731 9732 iter->i = i; 9733 iter->cpl(bdev, iter->ctx, status); 9734 9735 free(iter); 9736 } 9737 9738 void 9739 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 9740 void *ctx, spdk_bdev_for_each_channel_done cpl) 9741 { 9742 struct spdk_bdev_channel_iter *iter; 9743 9744 assert(bdev != NULL && fn != NULL && ctx != NULL); 9745 9746 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 9747 if (iter == NULL) { 9748 SPDK_ERRLOG("Unable to allocate iterator\n"); 9749 assert(false); 9750 return; 9751 } 9752 9753 iter->fn = fn; 9754 iter->cpl = cpl; 9755 iter->ctx = ctx; 9756 9757 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 9758 iter, bdev_each_channel_cpl); 9759 } 9760 9761 static void 9762 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9763 { 9764 struct spdk_bdev_io *parent_io = cb_arg; 9765 9766 spdk_bdev_free_io(bdev_io); 9767 9768 /* Check return status of write */ 9769 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9770 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9771 } 9772 9773 static void 9774 bdev_copy_do_write(void *_bdev_io) 9775 { 9776 struct spdk_bdev_io *bdev_io = _bdev_io; 9777 int rc; 9778 9779 /* Write blocks */ 9780 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 9781 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9782 bdev_io->u.bdev.iovs[0].iov_base, 9783 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 9784 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 9785 9786 if (rc == -ENOMEM) { 9787 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 9788 } else if (rc != 0) { 9789 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9790 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 9791 } 9792 } 9793 9794 static void 9795 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9796 { 9797 struct spdk_bdev_io *parent_io = cb_arg; 9798 9799 spdk_bdev_free_io(bdev_io); 9800 9801 /* Check return status of read */ 9802 if (!success) { 9803 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9804 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 9805 return; 9806 } 9807 9808 /* Do write */ 9809 bdev_copy_do_write(parent_io); 9810 } 9811 9812 static void 9813 bdev_copy_do_read(void *_bdev_io) 9814 { 9815 struct spdk_bdev_io *bdev_io = _bdev_io; 9816 int rc; 9817 9818 /* Read blocks */ 9819 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 9820 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9821 bdev_io->u.bdev.iovs[0].iov_base, 9822 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 9823 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 9824 9825 if (rc == -ENOMEM) { 9826 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 9827 } else if (rc != 0) { 9828 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9829 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 9830 } 9831 } 9832 9833 static void 9834 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 9835 { 9836 if (!success) { 9837 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9838 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 9839 return; 9840 } 9841 9842 bdev_copy_do_read(bdev_io); 9843 } 9844 9845 int 9846 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 9847 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 9848 spdk_bdev_io_completion_cb cb, void *cb_arg) 9849 { 9850 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9851 struct spdk_bdev_io *bdev_io; 9852 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 9853 9854 if (!desc->write) { 9855 return -EBADF; 9856 } 9857 9858 if (num_blocks == 0) { 9859 SPDK_ERRLOG("Can't copy 0 blocks\n"); 9860 return -EINVAL; 9861 } 9862 9863 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 9864 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 9865 SPDK_DEBUGLOG(bdev, 9866 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 9867 dst_offset_blocks, src_offset_blocks, num_blocks); 9868 return -EINVAL; 9869 } 9870 9871 bdev_io = bdev_channel_get_io(channel); 9872 if (!bdev_io) { 9873 return -ENOMEM; 9874 } 9875 9876 bdev_io->internal.ch = channel; 9877 bdev_io->internal.desc = desc; 9878 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 9879 9880 
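	/* For a copy request, offset_blocks holds the destination LBA and
	 * copy.src_offset_blocks the source. No data buffers are attached here;
	 * they are only allocated later if the copy must be emulated with
	 * separate read and write requests. */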
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 9881 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 9882 bdev_io->u.bdev.num_blocks = num_blocks; 9883 bdev_io->u.bdev.memory_domain = NULL; 9884 bdev_io->u.bdev.memory_domain_ctx = NULL; 9885 bdev_io->u.bdev.iovs = NULL; 9886 bdev_io->u.bdev.iovcnt = 0; 9887 bdev_io->u.bdev.md_buf = NULL; 9888 bdev_io->u.bdev.accel_sequence = NULL; 9889 bdev_io_init(bdev_io, bdev, cb_arg, cb); 9890 9891 if (dst_offset_blocks == src_offset_blocks) { 9892 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 9893 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 9894 9895 return 0; 9896 } 9897 9898 9899 /* If the copy size is large and should be split, use the generic split logic 9900 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 9901 * 9902 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 9903 * emulate it using regular read and write requests otherwise. 9904 */ 9905 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 9906 bdev_io->internal.split) { 9907 bdev_io_submit(bdev_io); 9908 return 0; 9909 } 9910 9911 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 9912 9913 return 0; 9914 } 9915 9916 SPDK_LOG_REGISTER_COMPONENT(bdev) 9917 9918 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 9919 { 9920 struct spdk_trace_tpoint_opts opts[] = { 9921 { 9922 "BDEV_IO_START", TRACE_BDEV_IO_START, 9923 OWNER_BDEV, OBJECT_BDEV_IO, 1, 9924 { 9925 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9926 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 9927 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9928 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9929 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 9930 } 9931 }, 9932 { 9933 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 9934 OWNER_BDEV, OBJECT_BDEV_IO, 0, 9935 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9936 }, 9937 { 9938 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 9939 OWNER_BDEV, OBJECT_NONE, 1, 9940 { 9941 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 9942 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 9943 } 9944 }, 9945 { 9946 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 9947 OWNER_BDEV, OBJECT_NONE, 0, 9948 { 9949 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 9950 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 9951 } 9952 }, 9953 }; 9954 9955 9956 spdk_trace_register_owner(OWNER_BDEV, 'b'); 9957 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 9958 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 9959 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 9960 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 9961 } 9962
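/*
 * Illustrative sketch (kept in a comment, not compiled): one way a caller
 * might drive spdk_bdev_copy_blocks() defined above. The names desc, ch,
 * dst_lba, src_lba and COPY_BLOCKS are assumptions for the example, not
 * symbols from this file. Note that a copy where dst_lba == src_lba
 * completes immediately with success.
 *
 *	static void
 *	copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		spdk_bdev_free_io(bdev_io);
 *		SPDK_NOTICELOG("copy %s\n", success ? "succeeded" : "failed");
 *	}
 *
 *	rc = spdk_bdev_copy_blocks(desc, ch, dst_lba, src_lba, COPY_BLOCKS,
 *				   copy_done, NULL);
 *	if (rc == -ENOMEM) {
 *		// no spdk_bdev_io available; queue an spdk_bdev_io_wait_entry
 *		// with spdk_bdev_queue_io_wait() and retry from its callback
 *	} else if (rc != 0) {
 *		// invalid block range or a read-only descriptor
 *	}
 */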