/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of children requests for an UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
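 * In other words, at most this many child copy requests are outstanding for a single
 * parent I/O; additional children are submitted as outstanding ones complete.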
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "v23.09", 0);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	struct spdk_bdev		*bdev;
	uint64_t			offset;
	uint64_t			length;
	void				*locked_ctx;
	struct spdk_thread		*owner_thread;
	struct spdk_bdev_channel	*owner_ch;
	TAILQ_ENTRY(lba_range)		tailq;
	TAILQ_ENTRY(lba_range)		tailq_module;
};

static struct spdk_bdev_opts	g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms).
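	 * For example, a limit of 1000 IO/s with the default 1 ms timeslice
	 * (SPDK_BDEV_QOS_TIMESLICE_IN_USEC) corresponds to roughly one I/O per timeslice.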
	 */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one for each type of rate limit. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;
	uint32_t	bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here. This makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t		nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t		nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel	*shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t		ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Accel channel */
	struct spdk_io_channel	*accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t		io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t		io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
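	 * They are resubmitted once the LBA range lock is released.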
290 */ 291 bdev_io_tailq_t io_locked; 292 293 /* List of I/Os with accel sequence being currently executed */ 294 bdev_io_tailq_t io_accel_exec; 295 296 /* List of I/Os doing memory domain pull/push */ 297 bdev_io_tailq_t io_memory_domain; 298 299 uint32_t flags; 300 301 struct spdk_histogram_data *histogram; 302 303 #ifdef SPDK_CONFIG_VTUNE 304 uint64_t start_tsc; 305 uint64_t interval_tsc; 306 __itt_string_handle *handle; 307 struct spdk_bdev_io_stat *prev_stat; 308 #endif 309 310 bdev_io_tailq_t queued_resets; 311 312 lba_range_tailq_t locked_ranges; 313 }; 314 315 struct media_event_entry { 316 struct spdk_bdev_media_event event; 317 TAILQ_ENTRY(media_event_entry) tailq; 318 }; 319 320 #define MEDIA_EVENT_POOL_SIZE 64 321 322 struct spdk_bdev_desc { 323 struct spdk_bdev *bdev; 324 struct spdk_thread *thread; 325 struct { 326 spdk_bdev_event_cb_t event_fn; 327 void *ctx; 328 } callback; 329 bool closed; 330 bool write; 331 bool memory_domains_supported; 332 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 333 struct spdk_spinlock spinlock; 334 uint32_t refs; 335 TAILQ_HEAD(, media_event_entry) pending_media_events; 336 TAILQ_HEAD(, media_event_entry) free_media_events; 337 struct media_event_entry *media_events_buffer; 338 TAILQ_ENTRY(spdk_bdev_desc) link; 339 340 uint64_t timeout_in_sec; 341 spdk_bdev_io_timeout_cb cb_fn; 342 void *cb_arg; 343 struct spdk_poller *io_timeout_poller; 344 struct spdk_bdev_module_claim *claim; 345 }; 346 347 struct spdk_bdev_iostat_ctx { 348 struct spdk_bdev_io_stat *stat; 349 spdk_bdev_get_device_stat_cb cb; 350 void *cb_arg; 351 }; 352 353 struct set_qos_limit_ctx { 354 void (*cb_fn)(void *cb_arg, int status); 355 void *cb_arg; 356 struct spdk_bdev *bdev; 357 }; 358 359 struct spdk_bdev_channel_iter { 360 spdk_bdev_for_each_channel_msg fn; 361 spdk_bdev_for_each_channel_done cpl; 362 struct spdk_io_channel_iter *i; 363 void *ctx; 364 }; 365 366 struct spdk_bdev_io_error_stat { 367 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 368 }; 369 370 enum bdev_io_retry_state { 371 BDEV_IO_RETRY_STATE_INVALID, 372 BDEV_IO_RETRY_STATE_PULL, 373 BDEV_IO_RETRY_STATE_PULL_MD, 374 BDEV_IO_RETRY_STATE_SUBMIT, 375 BDEV_IO_RETRY_STATE_PUSH, 376 BDEV_IO_RETRY_STATE_PUSH_MD, 377 }; 378 379 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 380 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 381 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 382 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 383 384 static inline void bdev_io_complete(void *ctx); 385 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 386 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io); 387 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io); 388 389 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 390 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io); 391 392 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 393 struct spdk_io_channel *ch, void *_ctx); 394 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 395 396 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 397 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 398 uint64_t num_blocks, 399 struct spdk_memory_domain *domain, void *domain_ctx, 400 struct 
spdk_accel_sequence *seq, 401 spdk_bdev_io_completion_cb cb, void *cb_arg); 402 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 403 struct iovec *iov, int iovcnt, void *md_buf, 404 uint64_t offset_blocks, uint64_t num_blocks, 405 struct spdk_memory_domain *domain, void *domain_ctx, 406 struct spdk_accel_sequence *seq, 407 spdk_bdev_io_completion_cb cb, void *cb_arg); 408 409 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 410 uint64_t offset, uint64_t length, 411 lock_range_cb cb_fn, void *cb_arg); 412 413 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 414 uint64_t offset, uint64_t length, 415 lock_range_cb cb_fn, void *cb_arg); 416 417 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 418 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort); 419 420 static bool claim_type_is_v2(enum spdk_bdev_claim_type type); 421 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc); 422 static void claim_reset(struct spdk_bdev *bdev); 423 424 static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch); 425 426 #define bdev_get_ext_io_opt(opts, field, defval) \ 427 (((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \ 428 sizeof((opts)->field) <= sizeof(*(opts))) ? (opts)->field : (defval)) 429 430 void 431 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 432 { 433 if (!opts) { 434 SPDK_ERRLOG("opts should not be NULL\n"); 435 return; 436 } 437 438 if (!opts_size) { 439 SPDK_ERRLOG("opts_size should not be zero value\n"); 440 return; 441 } 442 443 opts->opts_size = opts_size; 444 445 #define SET_FIELD(field) \ 446 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 447 opts->field = g_bdev_opts.field; \ 448 } \ 449 450 SET_FIELD(bdev_io_pool_size); 451 SET_FIELD(bdev_io_cache_size); 452 SET_FIELD(bdev_auto_examine); 453 454 /* Do not remove this statement, you should always update this statement when you adding a new field, 455 * and do not forget to add the SET_FIELD statement for your added field. */ 456 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 457 458 #undef SET_FIELD 459 } 460 461 int 462 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 463 { 464 uint32_t min_pool_size; 465 466 if (!opts) { 467 SPDK_ERRLOG("opts cannot be NULL\n"); 468 return -1; 469 } 470 471 if (!opts->opts_size) { 472 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 473 return -1; 474 } 475 476 /* 477 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 478 * initialization. A second mgmt_ch will be created on the same thread when the application starts 479 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
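	 * For example, with the default bdev_io_cache_size of 256 and 4 SPDK threads,
	 * bdev_io_pool_size must be at least 256 * (4 + 1) = 1280.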
480 */ 481 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 482 if (opts->bdev_io_pool_size < min_pool_size) { 483 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 484 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 485 spdk_thread_get_count()); 486 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 487 return -1; 488 } 489 490 #define SET_FIELD(field) \ 491 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 492 g_bdev_opts.field = opts->field; \ 493 } \ 494 495 SET_FIELD(bdev_io_pool_size); 496 SET_FIELD(bdev_io_cache_size); 497 SET_FIELD(bdev_auto_examine); 498 499 g_bdev_opts.opts_size = opts->opts_size; 500 501 #undef SET_FIELD 502 503 return 0; 504 } 505 506 static struct spdk_bdev * 507 bdev_get_by_name(const char *bdev_name) 508 { 509 struct spdk_bdev_name find; 510 struct spdk_bdev_name *res; 511 512 find.name = (char *)bdev_name; 513 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 514 if (res != NULL) { 515 return res->bdev; 516 } 517 518 return NULL; 519 } 520 521 struct spdk_bdev * 522 spdk_bdev_get_by_name(const char *bdev_name) 523 { 524 struct spdk_bdev *bdev; 525 526 spdk_spin_lock(&g_bdev_mgr.spinlock); 527 bdev = bdev_get_by_name(bdev_name); 528 spdk_spin_unlock(&g_bdev_mgr.spinlock); 529 530 return bdev; 531 } 532 533 struct bdev_io_status_string { 534 enum spdk_bdev_io_status status; 535 const char *str; 536 }; 537 538 static const struct bdev_io_status_string bdev_io_status_strings[] = { 539 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 540 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 541 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 542 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 543 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 544 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 545 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 546 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 547 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 548 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 549 }; 550 551 static const char * 552 bdev_io_status_get_string(enum spdk_bdev_io_status status) 553 { 554 uint32_t i; 555 556 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 557 if (bdev_io_status_strings[i].status == status) { 558 return bdev_io_status_strings[i].str; 559 } 560 } 561 562 return "reserved"; 563 } 564 565 struct spdk_bdev_wait_for_examine_ctx { 566 struct spdk_poller *poller; 567 spdk_bdev_wait_for_examine_cb cb_fn; 568 void *cb_arg; 569 }; 570 571 static bool bdev_module_all_actions_completed(void); 572 573 static int 574 bdev_wait_for_examine_cb(void *arg) 575 { 576 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 577 578 if (!bdev_module_all_actions_completed()) { 579 return SPDK_POLLER_IDLE; 580 } 581 582 spdk_poller_unregister(&ctx->poller); 583 ctx->cb_fn(ctx->cb_arg); 584 free(ctx); 585 586 return SPDK_POLLER_BUSY; 587 } 588 589 int 590 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 591 { 592 struct spdk_bdev_wait_for_examine_ctx *ctx; 593 594 ctx = calloc(1, sizeof(*ctx)); 595 if (ctx == NULL) { 596 return -ENOMEM; 597 } 598 ctx->cb_fn = cb_fn; 599 ctx->cb_arg = cb_arg; 600 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 601 602 return 0; 603 } 604 605 struct spdk_bdev_examine_item { 606 char *name; 607 TAILQ_ENTRY(spdk_bdev_examine_item) link; 608 }; 609 610 TAILQ_HEAD(spdk_bdev_examine_allowlist, 
spdk_bdev_examine_item); 611 612 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 613 g_bdev_examine_allowlist); 614 615 static inline bool 616 bdev_examine_allowlist_check(const char *name) 617 { 618 struct spdk_bdev_examine_item *item; 619 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 620 if (strcmp(name, item->name) == 0) { 621 return true; 622 } 623 } 624 return false; 625 } 626 627 static inline void 628 bdev_examine_allowlist_free(void) 629 { 630 struct spdk_bdev_examine_item *item; 631 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 632 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 633 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 634 free(item->name); 635 free(item); 636 } 637 } 638 639 static inline bool 640 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 641 { 642 struct spdk_bdev_alias *tmp; 643 if (bdev_examine_allowlist_check(bdev->name)) { 644 return true; 645 } 646 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 647 if (bdev_examine_allowlist_check(tmp->alias.name)) { 648 return true; 649 } 650 } 651 return false; 652 } 653 654 static inline bool 655 bdev_ok_to_examine(struct spdk_bdev *bdev) 656 { 657 if (g_bdev_opts.bdev_auto_examine) { 658 return true; 659 } else { 660 return bdev_in_examine_allowlist(bdev); 661 } 662 } 663 664 static void 665 bdev_examine(struct spdk_bdev *bdev) 666 { 667 struct spdk_bdev_module *module; 668 struct spdk_bdev_module_claim *claim, *tmpclaim; 669 uint32_t action; 670 671 if (!bdev_ok_to_examine(bdev)) { 672 return; 673 } 674 675 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 676 if (module->examine_config) { 677 spdk_spin_lock(&module->internal.spinlock); 678 action = module->internal.action_in_progress; 679 module->internal.action_in_progress++; 680 spdk_spin_unlock(&module->internal.spinlock); 681 module->examine_config(bdev); 682 if (action != module->internal.action_in_progress) { 683 SPDK_ERRLOG("examine_config for module %s did not call " 684 "spdk_bdev_module_examine_done()\n", module->name); 685 } 686 } 687 } 688 689 spdk_spin_lock(&bdev->internal.spinlock); 690 691 switch (bdev->internal.claim_type) { 692 case SPDK_BDEV_CLAIM_NONE: 693 /* Examine by all bdev modules */ 694 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 695 if (module->examine_disk) { 696 spdk_spin_lock(&module->internal.spinlock); 697 module->internal.action_in_progress++; 698 spdk_spin_unlock(&module->internal.spinlock); 699 spdk_spin_unlock(&bdev->internal.spinlock); 700 module->examine_disk(bdev); 701 spdk_spin_lock(&bdev->internal.spinlock); 702 } 703 } 704 break; 705 case SPDK_BDEV_CLAIM_EXCL_WRITE: 706 /* Examine by the one bdev module with a v1 claim */ 707 module = bdev->internal.claim.v1.module; 708 if (module->examine_disk) { 709 spdk_spin_lock(&module->internal.spinlock); 710 module->internal.action_in_progress++; 711 spdk_spin_unlock(&module->internal.spinlock); 712 spdk_spin_unlock(&bdev->internal.spinlock); 713 module->examine_disk(bdev); 714 return; 715 } 716 break; 717 default: 718 /* Examine by all bdev modules with a v2 claim */ 719 assert(claim_type_is_v2(bdev->internal.claim_type)); 720 /* 721 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 722 * list, perhaps accessing freed memory. Without protection, this could happen 723 * while the lock is dropped during the examine callback. 
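		 * The examine_in_progress count below defers removal of released claims
		 * until the iteration completes.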
724 */ 725 bdev->internal.examine_in_progress++; 726 727 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 728 module = claim->module; 729 730 if (module == NULL) { 731 /* This is a vestigial claim, held by examine_count */ 732 continue; 733 } 734 735 if (module->examine_disk == NULL) { 736 continue; 737 } 738 739 spdk_spin_lock(&module->internal.spinlock); 740 module->internal.action_in_progress++; 741 spdk_spin_unlock(&module->internal.spinlock); 742 743 /* Call examine_disk without holding internal.spinlock. */ 744 spdk_spin_unlock(&bdev->internal.spinlock); 745 module->examine_disk(bdev); 746 spdk_spin_lock(&bdev->internal.spinlock); 747 } 748 749 assert(bdev->internal.examine_in_progress > 0); 750 bdev->internal.examine_in_progress--; 751 if (bdev->internal.examine_in_progress == 0) { 752 /* Remove any claims that were released during examine_disk */ 753 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 754 if (claim->desc != NULL) { 755 continue; 756 } 757 758 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 759 free(claim); 760 } 761 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 762 claim_reset(bdev); 763 } 764 } 765 } 766 767 spdk_spin_unlock(&bdev->internal.spinlock); 768 } 769 770 int 771 spdk_bdev_examine(const char *name) 772 { 773 struct spdk_bdev *bdev; 774 struct spdk_bdev_examine_item *item; 775 struct spdk_thread *thread = spdk_get_thread(); 776 777 if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) { 778 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 779 thread ? spdk_thread_get_name(thread) : "null"); 780 return -EINVAL; 781 } 782 783 if (g_bdev_opts.bdev_auto_examine) { 784 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 785 return -EINVAL; 786 } 787 788 if (bdev_examine_allowlist_check(name)) { 789 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 790 return -EEXIST; 791 } 792 793 item = calloc(1, sizeof(*item)); 794 if (!item) { 795 return -ENOMEM; 796 } 797 item->name = strdup(name); 798 if (!item->name) { 799 free(item); 800 return -ENOMEM; 801 } 802 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 803 804 bdev = spdk_bdev_get_by_name(name); 805 if (bdev) { 806 bdev_examine(bdev); 807 } 808 return 0; 809 } 810 811 static inline void 812 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 813 { 814 struct spdk_bdev_examine_item *item; 815 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 816 spdk_json_write_object_begin(w); 817 spdk_json_write_named_string(w, "method", "bdev_examine"); 818 spdk_json_write_named_object_begin(w, "params"); 819 spdk_json_write_named_string(w, "name", item->name); 820 spdk_json_write_object_end(w); 821 spdk_json_write_object_end(w); 822 } 823 } 824 825 struct spdk_bdev * 826 spdk_bdev_first(void) 827 { 828 struct spdk_bdev *bdev; 829 830 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 831 if (bdev) { 832 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 833 } 834 835 return bdev; 836 } 837 838 struct spdk_bdev * 839 spdk_bdev_next(struct spdk_bdev *prev) 840 { 841 struct spdk_bdev *bdev; 842 843 bdev = TAILQ_NEXT(prev, internal.link); 844 if (bdev) { 845 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 846 } 847 848 return bdev; 849 } 850 851 static struct spdk_bdev * 852 _bdev_next_leaf(struct spdk_bdev *bdev) 853 { 854 while (bdev != NULL) { 855 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 856 return bdev; 857 } else { 858 bdev = 
TAILQ_NEXT(bdev, internal.link); 859 } 860 } 861 862 return bdev; 863 } 864 865 struct spdk_bdev * 866 spdk_bdev_first_leaf(void) 867 { 868 struct spdk_bdev *bdev; 869 870 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 871 872 if (bdev) { 873 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 874 } 875 876 return bdev; 877 } 878 879 struct spdk_bdev * 880 spdk_bdev_next_leaf(struct spdk_bdev *prev) 881 { 882 struct spdk_bdev *bdev; 883 884 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 885 886 if (bdev) { 887 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 888 } 889 890 return bdev; 891 } 892 893 static inline bool 894 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 895 { 896 return bdev_io->internal.memory_domain; 897 } 898 899 static inline bool 900 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 901 { 902 return bdev_io->internal.accel_sequence; 903 } 904 905 static inline void 906 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource, 907 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 908 { 909 /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io. 910 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth 911 * channels we will instead wait for half to complete. 912 */ 913 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 914 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 915 916 assert(state != BDEV_IO_RETRY_STATE_INVALID); 917 bdev_io->internal.retry_state = state; 918 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 919 } 920 921 static inline void 922 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource, 923 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 924 { 925 /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while 926 * the queue isn't empty, so we don't need to update the nomem_threshold here */ 927 assert(!TAILQ_EMPTY(&shared_resource->nomem_io)); 928 929 assert(state != BDEV_IO_RETRY_STATE_INVALID); 930 bdev_io->internal.retry_state = state; 931 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 932 } 933 934 void 935 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 936 { 937 struct iovec *iovs; 938 939 if (bdev_io->u.bdev.iovs == NULL) { 940 bdev_io->u.bdev.iovs = &bdev_io->iov; 941 bdev_io->u.bdev.iovcnt = 1; 942 } 943 944 iovs = bdev_io->u.bdev.iovs; 945 946 assert(iovs != NULL); 947 assert(bdev_io->u.bdev.iovcnt >= 1); 948 949 iovs[0].iov_base = buf; 950 iovs[0].iov_len = len; 951 } 952 953 void 954 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 955 { 956 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 957 bdev_io->u.bdev.md_buf = md_buf; 958 } 959 960 static bool 961 _is_buf_allocated(const struct iovec *iovs) 962 { 963 if (iovs == NULL) { 964 return false; 965 } 966 967 return iovs[0].iov_base != NULL; 968 } 969 970 static bool 971 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 972 { 973 int i; 974 uintptr_t iov_base; 975 976 if (spdk_likely(alignment == 1)) { 977 return true; 978 } 979 980 for (i = 0; i < iovcnt; i++) { 981 iov_base = (uintptr_t)iovs[i].iov_base; 982 if ((iov_base & (alignment - 1)) != 0) { 983 return false; 984 } 985 } 986 987 return true; 988 } 989 990 static inline bool 991 
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 992 { 993 if (!bdev_io_use_accel_sequence(bdev_io)) { 994 return false; 995 } 996 997 /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if 998 * bdev module didn't support accel sequences */ 999 return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split; 1000 } 1001 1002 static inline void 1003 bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch, 1004 struct spdk_bdev_shared_resource *shared_resource) 1005 { 1006 bdev_ch->io_outstanding++; 1007 shared_resource->io_outstanding++; 1008 } 1009 1010 static inline void 1011 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1012 struct spdk_bdev_shared_resource *shared_resource) 1013 { 1014 assert(bdev_ch->io_outstanding > 0); 1015 assert(shared_resource->io_outstanding > 0); 1016 bdev_ch->io_outstanding--; 1017 shared_resource->io_outstanding--; 1018 } 1019 1020 static void 1021 bdev_io_submit_sequence_cb(void *ctx, int status) 1022 { 1023 struct spdk_bdev_io *bdev_io = ctx; 1024 1025 bdev_io->u.bdev.accel_sequence = NULL; 1026 bdev_io->internal.accel_sequence = NULL; 1027 1028 if (spdk_unlikely(status != 0)) { 1029 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 1030 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1031 bdev_io_complete_unsubmitted(bdev_io); 1032 return; 1033 } 1034 1035 bdev_io_submit(bdev_io); 1036 } 1037 1038 static void 1039 bdev_io_exec_sequence_cb(void *ctx, int status) 1040 { 1041 struct spdk_bdev_io *bdev_io = ctx; 1042 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1043 1044 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1045 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1046 1047 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1048 bdev_ch_retry_io(ch); 1049 } 1050 1051 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1052 } 1053 1054 static void 1055 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status)) 1056 { 1057 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1058 1059 assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1060 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1061 1062 /* Since the operations are appended during submission, they're in the opposite order than 1063 * how we want to execute them for reads (i.e. we need to execute the most recently added 1064 * operation first), so reverse the sequence before executing it. 
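	 * For writes, the appended order already matches the desired execution order, so
	 * the sequence is finished as-is.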
1065 */ 1066 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1067 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1068 } 1069 1070 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1071 bdev_io_increment_outstanding(ch, ch->shared_resource); 1072 bdev_io->internal.data_transfer_cpl = cb_fn; 1073 1074 spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, 1075 bdev_io_exec_sequence_cb, bdev_io); 1076 } 1077 1078 static void 1079 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1080 { 1081 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1082 void *buf; 1083 1084 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1085 buf = bdev_io->internal.buf; 1086 bdev_io->internal.buf = NULL; 1087 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1088 bdev_io->internal.get_aux_buf_cb = NULL; 1089 } else { 1090 assert(bdev_io->internal.get_buf_cb != NULL); 1091 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1092 bdev_io->internal.get_buf_cb = NULL; 1093 } 1094 } 1095 1096 static void 1097 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1098 { 1099 struct spdk_bdev_io *bdev_io = ctx; 1100 1101 if (rc) { 1102 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1103 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1104 } 1105 bdev_io_get_buf_complete(bdev_io, !rc); 1106 } 1107 1108 static void 1109 bdev_io_pull_md_buf_done(void *ctx, int status) 1110 { 1111 struct spdk_bdev_io *bdev_io = ctx; 1112 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1113 1114 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1115 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1116 1117 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1118 bdev_ch_retry_io(ch); 1119 } 1120 1121 assert(bdev_io->internal.data_transfer_cpl); 1122 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1123 } 1124 1125 static void 1126 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io) 1127 { 1128 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1129 int rc = 0; 1130 1131 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1132 if (bdev_io_use_memory_domain(bdev_io)) { 1133 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1134 bdev_io_increment_outstanding(ch, ch->shared_resource); 1135 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1136 bdev_io->internal.memory_domain_ctx, 1137 &bdev_io->internal.orig_md_iov, 1, 1138 &bdev_io->internal.bounce_md_iov, 1, 1139 bdev_io_pull_md_buf_done, bdev_io); 1140 if (rc == 0) { 1141 /* Continue to submit IO in completion callback */ 1142 return; 1143 } 1144 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1145 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1146 if (rc != -ENOMEM) { 1147 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1148 spdk_memory_domain_get_dma_device_id( 1149 bdev_io->internal.memory_domain), rc); 1150 } 1151 } else { 1152 memcpy(bdev_io->internal.bounce_md_iov.iov_base, 1153 bdev_io->internal.orig_md_iov.iov_base, 1154 bdev_io->internal.orig_md_iov.iov_len); 1155 } 1156 } 1157 1158 if (spdk_unlikely(rc == -ENOMEM)) { 1159 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD); 1160 } else { 1161 assert(bdev_io->internal.data_transfer_cpl); 1162 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1163 } 1164 } 1165 1166 static void 1167 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1168 { 1169 /* save 
original md_buf */ 1170 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1171 bdev_io->internal.orig_md_iov.iov_len = len; 1172 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 1173 bdev_io->internal.bounce_md_iov.iov_len = len; 1174 /* set bounce md_buf */ 1175 bdev_io->u.bdev.md_buf = md_buf; 1176 1177 bdev_io_pull_md_buf(bdev_io); 1178 } 1179 1180 static void 1181 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1182 { 1183 struct spdk_bdev *bdev = bdev_io->bdev; 1184 uint64_t md_len; 1185 void *buf; 1186 1187 if (spdk_bdev_is_md_separate(bdev)) { 1188 assert(!bdev_io_use_accel_sequence(bdev_io)); 1189 1190 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1191 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1192 1193 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1194 1195 if (bdev_io->u.bdev.md_buf != NULL) { 1196 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1197 return; 1198 } else { 1199 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1200 } 1201 } 1202 1203 bdev_io_get_buf_complete(bdev_io, true); 1204 } 1205 1206 static inline void 1207 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc) 1208 { 1209 if (rc) { 1210 SPDK_ERRLOG("Failed to get data buffer\n"); 1211 assert(bdev_io->internal.data_transfer_cpl); 1212 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1213 return; 1214 } 1215 1216 _bdev_io_set_md_buf(bdev_io); 1217 } 1218 1219 static void 1220 bdev_io_pull_data_done_and_track(void *ctx, int status) 1221 { 1222 struct spdk_bdev_io *bdev_io = ctx; 1223 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1224 1225 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1226 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1227 1228 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1229 bdev_ch_retry_io(ch); 1230 } 1231 1232 bdev_io_pull_data_done(bdev_io, status); 1233 } 1234 1235 static void 1236 bdev_io_pull_data(struct spdk_bdev_io *bdev_io) 1237 { 1238 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1239 int rc = 0; 1240 1241 /* If we need to exec an accel sequence, append a copy operation making accel change the 1242 * src/dst buffers of the previous operation */ 1243 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1244 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1245 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1246 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1247 NULL, NULL, 1248 bdev_io->internal.orig_iovs, 1249 bdev_io->internal.orig_iovcnt, 1250 bdev_io->internal.memory_domain, 1251 bdev_io->internal.memory_domain_ctx, 1252 0, NULL, NULL); 1253 } else { 1254 /* We need to reverse the src/dst for reads */ 1255 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1256 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1257 bdev_io->internal.orig_iovs, 1258 bdev_io->internal.orig_iovcnt, 1259 bdev_io->internal.memory_domain, 1260 bdev_io->internal.memory_domain_ctx, 1261 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1262 NULL, NULL, 0, NULL, NULL); 1263 } 1264 1265 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 1266 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1267 bdev_io->internal.accel_sequence); 1268 } 1269 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1270 /* if this is write path, copy data from original buffer to bounce buffer */ 1271 if (bdev_io_use_memory_domain(bdev_io)) { 1272 TAILQ_INSERT_TAIL(&ch->io_memory_domain, 
bdev_io, internal.link); 1273 bdev_io_increment_outstanding(ch, ch->shared_resource); 1274 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1275 bdev_io->internal.memory_domain_ctx, 1276 bdev_io->internal.orig_iovs, 1277 (uint32_t) bdev_io->internal.orig_iovcnt, 1278 bdev_io->u.bdev.iovs, 1, 1279 bdev_io_pull_data_done_and_track, 1280 bdev_io); 1281 if (rc == 0) { 1282 /* Continue to submit IO in completion callback */ 1283 return; 1284 } 1285 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1286 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1287 if (rc != -ENOMEM) { 1288 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1289 spdk_memory_domain_get_dma_device_id( 1290 bdev_io->internal.memory_domain)); 1291 } 1292 } else { 1293 assert(bdev_io->u.bdev.iovcnt == 1); 1294 spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base, 1295 bdev_io->u.bdev.iovs[0].iov_len, 1296 bdev_io->internal.orig_iovs, 1297 bdev_io->internal.orig_iovcnt); 1298 } 1299 } 1300 1301 if (spdk_unlikely(rc == -ENOMEM)) { 1302 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1303 } else { 1304 bdev_io_pull_data_done(bdev_io, rc); 1305 } 1306 } 1307 1308 static void 1309 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1310 bdev_copy_bounce_buffer_cpl cpl_cb) 1311 { 1312 struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource; 1313 1314 bdev_io->internal.data_transfer_cpl = cpl_cb; 1315 /* save original iovec */ 1316 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 1317 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1318 /* set bounce iov */ 1319 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 1320 bdev_io->u.bdev.iovcnt = 1; 1321 /* set bounce buffer for this operation */ 1322 bdev_io->u.bdev.iovs[0].iov_base = buf; 1323 bdev_io->u.bdev.iovs[0].iov_len = len; 1324 1325 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1326 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1327 } else { 1328 bdev_io_pull_data(bdev_io); 1329 } 1330 } 1331 1332 static void 1333 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1334 { 1335 struct spdk_bdev *bdev = bdev_io->bdev; 1336 bool buf_allocated; 1337 uint64_t alignment; 1338 void *aligned_buf; 1339 1340 bdev_io->internal.buf = buf; 1341 1342 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1343 bdev_io_get_buf_complete(bdev_io, true); 1344 return; 1345 } 1346 1347 alignment = spdk_bdev_get_buf_align(bdev); 1348 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1349 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1350 1351 if (buf_allocated) { 1352 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1353 /* Continue in completion callback */ 1354 return; 1355 } else { 1356 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1357 } 1358 1359 _bdev_io_set_md_buf(bdev_io); 1360 } 1361 1362 static inline uint64_t 1363 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1364 { 1365 struct spdk_bdev *bdev = bdev_io->bdev; 1366 uint64_t md_len, alignment; 1367 1368 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1369 1370 /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. 
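	 * For example, with 64-byte alignment and 8 bytes of separate metadata per block, a request
	 * of len bytes over num_blocks blocks reserves len + 63 + 8 * num_blocks bytes.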
	 */
	alignment = spdk_bdev_get_buf_align(bdev) - 1;

	return len + alignment + md_len;
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.buf != NULL);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
	bdev_io->internal.buf = NULL;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
		    struct spdk_bdev_io *bdev_io)
{
	/* After a request is submitted to a bdev module, the ownership of an accel sequence
	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
	 * sequence pointer to make sure we won't touch it anymore. */
	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
		bdev_io->internal.accel_sequence = NULL;
	}

	bdev->fn_table->submit_request(ioch, bdev_io);
}

static inline void
bdev_ch_resubmit_io(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;

	bdev_io_increment_outstanding(bdev_io->internal.ch, bdev_ch->shared_resource);
	bdev_io->internal.error.nvme.cdw0 = 0;
	bdev_io->num_retries++;
	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller.  Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
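		 * nomem_threshold is recalculated in bdev_queue_nomem_io_head() each time an
		 * I/O is queued at the head of nomem_io.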
1443 */ 1444 return; 1445 } 1446 1447 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1448 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1449 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1450 1451 switch (bdev_io->internal.retry_state) { 1452 case BDEV_IO_RETRY_STATE_SUBMIT: 1453 bdev_ch_resubmit_io(bdev_ch, bdev_io); 1454 break; 1455 case BDEV_IO_RETRY_STATE_PULL: 1456 bdev_io_pull_data(bdev_io); 1457 break; 1458 case BDEV_IO_RETRY_STATE_PULL_MD: 1459 bdev_io_pull_md_buf(bdev_io); 1460 break; 1461 case BDEV_IO_RETRY_STATE_PUSH: 1462 bdev_io_push_bounce_data(bdev_io); 1463 break; 1464 case BDEV_IO_RETRY_STATE_PUSH_MD: 1465 bdev_io_push_bounce_md_buf(bdev_io); 1466 break; 1467 default: 1468 assert(0 && "invalid retry state"); 1469 break; 1470 } 1471 1472 if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) { 1473 /* This IO completed again with NOMEM status, so break the loop and 1474 * don't try anymore. Note that a bdev_io that fails with NOMEM 1475 * always gets requeued at the front of the list, to maintain 1476 * ordering. 1477 */ 1478 break; 1479 } 1480 } 1481 } 1482 1483 static inline bool 1484 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 1485 { 1486 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1487 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1488 1489 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1490 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1491 bdev_queue_nomem_io_head(shared_resource, bdev_io, state); 1492 1493 /* If bdev module completed an I/O that has an accel sequence with NOMEM status, the 1494 * ownership of that sequence is transferred back to the bdev layer, so we need to 1495 * restore internal.accel_sequence to make sure that the sequence is handled 1496 * correctly in case the I/O is later aborted. */ 1497 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1498 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1499 assert(bdev_io->internal.accel_sequence == NULL); 1500 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1501 } 1502 1503 return true; 1504 } 1505 1506 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1507 bdev_ch_retry_io(bdev_ch); 1508 } 1509 1510 return false; 1511 } 1512 1513 static void 1514 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1515 { 1516 struct spdk_bdev_io *bdev_io = ctx; 1517 1518 if (rc) { 1519 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1520 } 1521 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1522 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 
1523 */ 1524 bdev_io_put_buf(bdev_io); 1525 1526 /* Continue with IO completion flow */ 1527 bdev_io_complete(bdev_io); 1528 } 1529 1530 static void 1531 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1532 { 1533 struct spdk_bdev_io *bdev_io = ctx; 1534 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1535 1536 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1537 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1538 1539 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1540 bdev_ch_retry_io(ch); 1541 } 1542 1543 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1544 } 1545 1546 static inline void 1547 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1548 { 1549 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1550 int rc = 0; 1551 1552 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1553 /* do the same for metadata buffer */ 1554 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1555 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1556 1557 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1558 if (bdev_io_use_memory_domain(bdev_io)) { 1559 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1560 bdev_io_increment_outstanding(ch, ch->shared_resource); 1561 /* If memory domain is used then we need to call async push function */ 1562 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1563 bdev_io->internal.memory_domain_ctx, 1564 &bdev_io->internal.orig_md_iov, 1565 (uint32_t)bdev_io->internal.orig_iovcnt, 1566 &bdev_io->internal.bounce_md_iov, 1, 1567 bdev_io_push_bounce_md_buf_done, 1568 bdev_io); 1569 if (rc == 0) { 1570 /* Continue IO completion in async callback */ 1571 return; 1572 } 1573 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1574 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1575 if (rc != -ENOMEM) { 1576 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1577 spdk_memory_domain_get_dma_device_id( 1578 bdev_io->internal.memory_domain)); 1579 } 1580 } else { 1581 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1582 bdev_io->internal.orig_md_iov.iov_len); 1583 } 1584 } 1585 } 1586 1587 if (spdk_unlikely(rc == -ENOMEM)) { 1588 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1589 } else { 1590 assert(bdev_io->internal.data_transfer_cpl); 1591 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1592 } 1593 } 1594 1595 static inline void 1596 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc) 1597 { 1598 assert(bdev_io->internal.data_transfer_cpl); 1599 if (rc) { 1600 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1601 return; 1602 } 1603 1604 /* set original buffer for this io */ 1605 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1606 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1607 /* disable bouncing buffer for this io */ 1608 bdev_io->internal.orig_iovcnt = 0; 1609 bdev_io->internal.orig_iovs = NULL; 1610 1611 bdev_io_push_bounce_md_buf(bdev_io); 1612 } 1613 1614 static void 1615 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1616 { 1617 struct spdk_bdev_io *bdev_io = ctx; 1618 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1619 1620 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1621 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1622 1623 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1624 bdev_ch_retry_io(ch); 1625 } 1626 1627 bdev_io_push_bounce_data_done(bdev_io, 
status); 1628 } 1629 1630 static inline void 1631 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1632 { 1633 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1634 int rc = 0; 1635 1636 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1637 /* if this is read path, copy data from bounce buffer to original buffer */ 1638 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1639 if (bdev_io_use_memory_domain(bdev_io)) { 1640 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1641 bdev_io_increment_outstanding(ch, ch->shared_resource); 1642 /* If memory domain is used then we need to call async push function */ 1643 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1644 bdev_io->internal.memory_domain_ctx, 1645 bdev_io->internal.orig_iovs, 1646 (uint32_t)bdev_io->internal.orig_iovcnt, 1647 &bdev_io->internal.bounce_iov, 1, 1648 bdev_io_push_bounce_data_done_and_track, 1649 bdev_io); 1650 if (rc == 0) { 1651 /* Continue IO completion in async callback */ 1652 return; 1653 } 1654 1655 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1656 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1657 if (rc != -ENOMEM) { 1658 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1659 spdk_memory_domain_get_dma_device_id( 1660 bdev_io->internal.memory_domain)); 1661 } 1662 } else { 1663 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1664 bdev_io->internal.orig_iovcnt, 1665 bdev_io->internal.bounce_iov.iov_base, 1666 bdev_io->internal.bounce_iov.iov_len); 1667 } 1668 } 1669 1670 if (spdk_unlikely(rc == -ENOMEM)) { 1671 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1672 } else { 1673 bdev_io_push_bounce_data_done(bdev_io, rc); 1674 } 1675 } 1676 1677 static inline void 1678 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1679 { 1680 bdev_io->internal.data_transfer_cpl = cpl_cb; 1681 bdev_io_push_bounce_data(bdev_io); 1682 } 1683 1684 static void 1685 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1686 { 1687 struct spdk_bdev_io *bdev_io; 1688 1689 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1690 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1691 } 1692 1693 static void 1694 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1695 { 1696 struct spdk_bdev_mgmt_channel *mgmt_ch; 1697 uint64_t max_len; 1698 void *buf; 1699 1700 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1701 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1702 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1703 1704 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1705 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1706 bdev_io_get_buf_complete(bdev_io, false); 1707 return; 1708 } 1709 1710 bdev_io->internal.buf_len = len; 1711 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1712 bdev_io_get_iobuf_cb); 1713 if (buf != NULL) { 1714 _bdev_io_set_buf(bdev_io, buf, len); 1715 } 1716 } 1717 1718 void 1719 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1720 { 1721 struct spdk_bdev *bdev = bdev_io->bdev; 1722 uint64_t alignment; 1723 1724 assert(cb != NULL); 1725 bdev_io->internal.get_buf_cb = cb; 1726 1727 alignment = spdk_bdev_get_buf_align(bdev); 1728 1729 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1730 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 
1731 /* Buffer already present and aligned */ 1732 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1733 return; 1734 } 1735 1736 bdev_io_get_buf(bdev_io, len); 1737 } 1738 1739 static void 1740 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1741 bool success) 1742 { 1743 if (!success) { 1744 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1745 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1746 bdev_io_complete_unsubmitted(bdev_io); 1747 return; 1748 } 1749 1750 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1751 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1752 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1753 return; 1754 } 1755 /* For reads we'll execute the sequence after the data is read, so, for now, only 1756 * clear out accel_sequence pointer and submit the IO */ 1757 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1758 bdev_io->u.bdev.accel_sequence = NULL; 1759 } 1760 1761 bdev_io_submit(bdev_io); 1762 } 1763 1764 static void 1765 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1766 uint64_t len) 1767 { 1768 assert(cb != NULL); 1769 bdev_io->internal.get_buf_cb = cb; 1770 1771 bdev_io_get_buf(bdev_io, len); 1772 } 1773 1774 void 1775 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1776 { 1777 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1778 1779 assert(cb != NULL); 1780 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1781 bdev_io->internal.get_aux_buf_cb = cb; 1782 bdev_io_get_buf(bdev_io, len); 1783 } 1784 1785 static int 1786 bdev_module_get_max_ctx_size(void) 1787 { 1788 struct spdk_bdev_module *bdev_module; 1789 int max_bdev_module_size = 0; 1790 1791 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1792 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1793 max_bdev_module_size = bdev_module->get_ctx_size(); 1794 } 1795 } 1796 1797 return max_bdev_module_size; 1798 } 1799 1800 static void 1801 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1802 { 1803 int i; 1804 struct spdk_bdev_qos *qos = bdev->internal.qos; 1805 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1806 1807 if (!qos) { 1808 return; 1809 } 1810 1811 spdk_bdev_get_qos_rate_limits(bdev, limits); 1812 1813 spdk_json_write_object_begin(w); 1814 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1815 1816 spdk_json_write_named_object_begin(w, "params"); 1817 spdk_json_write_named_string(w, "name", bdev->name); 1818 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1819 if (limits[i] > 0) { 1820 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1821 } 1822 } 1823 spdk_json_write_object_end(w); 1824 1825 spdk_json_write_object_end(w); 1826 } 1827 1828 void 1829 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1830 { 1831 struct spdk_bdev_module *bdev_module; 1832 struct spdk_bdev *bdev; 1833 1834 assert(w != NULL); 1835 1836 spdk_json_write_array_begin(w); 1837 1838 spdk_json_write_object_begin(w); 1839 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1840 spdk_json_write_named_object_begin(w, "params"); 1841 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1842 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1843 spdk_json_write_named_bool(w, "bdev_auto_examine", 
g_bdev_opts.bdev_auto_examine); 1844 spdk_json_write_object_end(w); 1845 spdk_json_write_object_end(w); 1846 1847 bdev_examine_allowlist_config_json(w); 1848 1849 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1850 if (bdev_module->config_json) { 1851 bdev_module->config_json(w); 1852 } 1853 } 1854 1855 spdk_spin_lock(&g_bdev_mgr.spinlock); 1856 1857 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1858 if (bdev->fn_table->write_config_json) { 1859 bdev->fn_table->write_config_json(bdev, w); 1860 } 1861 1862 bdev_qos_config_json(bdev, w); 1863 } 1864 1865 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1866 1867 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1868 spdk_json_write_object_begin(w); 1869 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1870 spdk_json_write_object_end(w); 1871 1872 spdk_json_write_array_end(w); 1873 } 1874 1875 static void 1876 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1877 { 1878 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1879 struct spdk_bdev_io *bdev_io; 1880 1881 spdk_iobuf_channel_fini(&ch->iobuf); 1882 1883 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1884 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1885 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1886 ch->per_thread_cache_count--; 1887 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1888 } 1889 1890 assert(ch->per_thread_cache_count == 0); 1891 } 1892 1893 static int 1894 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1895 { 1896 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1897 struct spdk_bdev_io *bdev_io; 1898 uint32_t i; 1899 int rc; 1900 1901 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE); 1902 if (rc != 0) { 1903 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1904 return -1; 1905 } 1906 1907 STAILQ_INIT(&ch->per_thread_cache); 1908 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1909 1910 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1911 ch->per_thread_cache_count = 0; 1912 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1913 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1914 if (bdev_io == NULL) { 1915 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1916 assert(false); 1917 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1918 return -1; 1919 } 1920 ch->per_thread_cache_count++; 1921 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1922 } 1923 1924 TAILQ_INIT(&ch->shared_resources); 1925 TAILQ_INIT(&ch->io_wait_queue); 1926 1927 return 0; 1928 } 1929 1930 static void 1931 bdev_init_complete(int rc) 1932 { 1933 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1934 void *cb_arg = g_init_cb_arg; 1935 struct spdk_bdev_module *m; 1936 1937 g_bdev_mgr.init_complete = true; 1938 g_init_cb_fn = NULL; 1939 g_init_cb_arg = NULL; 1940 1941 /* 1942 * For modules that need to know when subsystem init is complete, 1943 * inform them now. 
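 * A module opts in by providing an init_complete callback in its
 * struct spdk_bdev_module; modules that leave it NULL are simply
 * skipped by the loop below.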
1944 */ 1945 if (rc == 0) { 1946 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1947 if (m->init_complete) { 1948 m->init_complete(); 1949 } 1950 } 1951 } 1952 1953 cb_fn(cb_arg, rc); 1954 } 1955 1956 static bool 1957 bdev_module_all_actions_completed(void) 1958 { 1959 struct spdk_bdev_module *m; 1960 1961 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1962 if (m->internal.action_in_progress > 0) { 1963 return false; 1964 } 1965 } 1966 return true; 1967 } 1968 1969 static void 1970 bdev_module_action_complete(void) 1971 { 1972 /* 1973 * Don't finish bdev subsystem initialization if 1974 * module pre-initialization is still in progress, or 1975 * the subsystem been already initialized. 1976 */ 1977 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 1978 return; 1979 } 1980 1981 /* 1982 * Check all bdev modules for inits/examinations in progress. If any 1983 * exist, return immediately since we cannot finish bdev subsystem 1984 * initialization until all are completed. 1985 */ 1986 if (!bdev_module_all_actions_completed()) { 1987 return; 1988 } 1989 1990 /* 1991 * Modules already finished initialization - now that all 1992 * the bdev modules have finished their asynchronous I/O 1993 * processing, the entire bdev layer can be marked as complete. 1994 */ 1995 bdev_init_complete(0); 1996 } 1997 1998 static void 1999 bdev_module_action_done(struct spdk_bdev_module *module) 2000 { 2001 spdk_spin_lock(&module->internal.spinlock); 2002 assert(module->internal.action_in_progress > 0); 2003 module->internal.action_in_progress--; 2004 spdk_spin_unlock(&module->internal.spinlock); 2005 bdev_module_action_complete(); 2006 } 2007 2008 void 2009 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2010 { 2011 assert(module->async_init); 2012 bdev_module_action_done(module); 2013 } 2014 2015 void 2016 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2017 { 2018 bdev_module_action_done(module); 2019 } 2020 2021 /** The last initialized bdev module */ 2022 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2023 2024 static void 2025 bdev_init_failed(void *cb_arg) 2026 { 2027 struct spdk_bdev_module *module = cb_arg; 2028 2029 spdk_spin_lock(&module->internal.spinlock); 2030 assert(module->internal.action_in_progress > 0); 2031 module->internal.action_in_progress--; 2032 spdk_spin_unlock(&module->internal.spinlock); 2033 bdev_init_complete(-1); 2034 } 2035 2036 static int 2037 bdev_modules_init(void) 2038 { 2039 struct spdk_bdev_module *module; 2040 int rc = 0; 2041 2042 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2043 g_resume_bdev_module = module; 2044 if (module->async_init) { 2045 spdk_spin_lock(&module->internal.spinlock); 2046 module->internal.action_in_progress = 1; 2047 spdk_spin_unlock(&module->internal.spinlock); 2048 } 2049 rc = module->module_init(); 2050 if (rc != 0) { 2051 /* Bump action_in_progress to prevent other modules from completion of modules_init 2052 * Send message to defer application shutdown until resources are cleaned up */ 2053 spdk_spin_lock(&module->internal.spinlock); 2054 module->internal.action_in_progress = 1; 2055 spdk_spin_unlock(&module->internal.spinlock); 2056 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2057 return rc; 2058 } 2059 } 2060 2061 g_resume_bdev_module = NULL; 2062 return 0; 2063 } 2064 2065 void 2066 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2067 { 2068 int rc = 0; 2069 char mempool_name[32]; 2070 2071 assert(cb_fn != 
NULL); 2072 2073 g_init_cb_fn = cb_fn; 2074 g_init_cb_arg = cb_arg; 2075 2076 spdk_notify_type_register("bdev_register"); 2077 spdk_notify_type_register("bdev_unregister"); 2078 2079 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2080 2081 rc = spdk_iobuf_register_module("bdev"); 2082 if (rc != 0) { 2083 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2084 bdev_init_complete(-1); 2085 return; 2086 } 2087 2088 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2089 g_bdev_opts.bdev_io_pool_size, 2090 sizeof(struct spdk_bdev_io) + 2091 bdev_module_get_max_ctx_size(), 2092 0, 2093 SPDK_ENV_SOCKET_ID_ANY); 2094 2095 if (g_bdev_mgr.bdev_io_pool == NULL) { 2096 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2097 bdev_init_complete(-1); 2098 return; 2099 } 2100 2101 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2102 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2103 if (!g_bdev_mgr.zero_buffer) { 2104 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2105 bdev_init_complete(-1); 2106 return; 2107 } 2108 2109 #ifdef SPDK_CONFIG_VTUNE 2110 SPDK_LOG_DEPRECATED(vtune_support); 2111 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2112 #endif 2113 2114 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2115 bdev_mgmt_channel_destroy, 2116 sizeof(struct spdk_bdev_mgmt_channel), 2117 "bdev_mgr"); 2118 2119 rc = bdev_modules_init(); 2120 g_bdev_mgr.module_init_complete = true; 2121 if (rc != 0) { 2122 SPDK_ERRLOG("bdev modules init failed\n"); 2123 return; 2124 } 2125 2126 bdev_module_action_complete(); 2127 } 2128 2129 static void 2130 bdev_mgr_unregister_cb(void *io_device) 2131 { 2132 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2133 2134 if (g_bdev_mgr.bdev_io_pool) { 2135 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2136 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2137 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2138 g_bdev_opts.bdev_io_pool_size); 2139 } 2140 2141 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2142 } 2143 2144 spdk_free(g_bdev_mgr.zero_buffer); 2145 2146 bdev_examine_allowlist_free(); 2147 2148 cb_fn(g_fini_cb_arg); 2149 g_fini_cb_fn = NULL; 2150 g_fini_cb_arg = NULL; 2151 g_bdev_mgr.init_complete = false; 2152 g_bdev_mgr.module_init_complete = false; 2153 } 2154 2155 static void 2156 bdev_module_fini_iter(void *arg) 2157 { 2158 struct spdk_bdev_module *bdev_module; 2159 2160 /* FIXME: Handling initialization failures is broken now, 2161 * so we won't even try cleaning up after successfully 2162 * initialized modules. if module_init_complete is false, 2163 * just call spdk_bdev_mgr_unregister_cb 2164 */ 2165 if (!g_bdev_mgr.module_init_complete) { 2166 bdev_mgr_unregister_cb(NULL); 2167 return; 2168 } 2169 2170 /* Start iterating from the last touched module */ 2171 if (!g_resume_bdev_module) { 2172 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2173 } else { 2174 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2175 internal.tailq); 2176 } 2177 2178 while (bdev_module) { 2179 if (bdev_module->async_fini) { 2180 /* Save our place so we can resume later. We must 2181 * save the variable here, before calling module_fini() 2182 * below, because in some cases the module may immediately 2183 * call spdk_bdev_module_fini_done() and re-enter 2184 * this function to continue iterating. 
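 * Note that spdk_bdev_module_fini_done() may also be invoked later and
 * from a different thread; in that case it is marshalled back to
 * g_fini_thread before this walk resumes (see spdk_bdev_module_fini_done()
 * below).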
*/ 2185 g_resume_bdev_module = bdev_module; 2186 } 2187 2188 if (bdev_module->module_fini) { 2189 bdev_module->module_fini(); 2190 } 2191 2192 if (bdev_module->async_fini) { 2193 return; 2194 } 2195 2196 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2197 internal.tailq); 2198 } 2199 2200 g_resume_bdev_module = NULL; 2201 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2202 } 2203 2204 void 2205 spdk_bdev_module_fini_done(void) 2206 { 2207 if (spdk_get_thread() != g_fini_thread) { 2208 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2209 } else { 2210 bdev_module_fini_iter(NULL); 2211 } 2212 } 2213 2214 static void 2215 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2216 { 2217 struct spdk_bdev *bdev = cb_arg; 2218 2219 if (bdeverrno && bdev) { 2220 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2221 bdev->name); 2222 2223 /* 2224 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2225 * bdev; try to continue by manually removing this bdev from the list and continue 2226 * with the next bdev in the list. 2227 */ 2228 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2229 } 2230 2231 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2232 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2233 /* 2234 * Bdev module finish need to be deferred as we might be in the middle of some context 2235 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2236 * after returning. 2237 */ 2238 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2239 return; 2240 } 2241 2242 /* 2243 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2244 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2245 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2246 * base bdevs. 2247 * 2248 * Also, walk the list in the reverse order. 2249 */ 2250 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2251 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2252 spdk_spin_lock(&bdev->internal.spinlock); 2253 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2254 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2255 spdk_spin_unlock(&bdev->internal.spinlock); 2256 continue; 2257 } 2258 spdk_spin_unlock(&bdev->internal.spinlock); 2259 2260 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2261 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2262 return; 2263 } 2264 2265 /* 2266 * If any bdev fails to unclaim underlying bdev properly, we may face the 2267 * case of bdev list consisting of claimed bdevs only (if claims are managed 2268 * correctly, this would mean there's a loop in the claims graph which is 2269 * clearly impossible). Warn and unregister last bdev on the list then. 
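 * This fallback only exists to keep shutdown moving; in a correctly
 * behaving system the loop above should always find an unclaimed bdev
 * to unregister first.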
2270 */ 2271 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2272 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2273 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2274 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2275 return; 2276 } 2277 } 2278 2279 static void 2280 bdev_module_fini_start_iter(void *arg) 2281 { 2282 struct spdk_bdev_module *bdev_module; 2283 2284 if (!g_resume_bdev_module) { 2285 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2286 } else { 2287 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2288 } 2289 2290 while (bdev_module) { 2291 if (bdev_module->async_fini_start) { 2292 /* Save our place so we can resume later. We must 2293 * save the variable here, before calling fini_start() 2294 * below, because in some cases the module may immediately 2295 * call spdk_bdev_module_fini_start_done() and re-enter 2296 * this function to continue iterating. */ 2297 g_resume_bdev_module = bdev_module; 2298 } 2299 2300 if (bdev_module->fini_start) { 2301 bdev_module->fini_start(); 2302 } 2303 2304 if (bdev_module->async_fini_start) { 2305 return; 2306 } 2307 2308 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2309 } 2310 2311 g_resume_bdev_module = NULL; 2312 2313 bdev_finish_unregister_bdevs_iter(NULL, 0); 2314 } 2315 2316 void 2317 spdk_bdev_module_fini_start_done(void) 2318 { 2319 if (spdk_get_thread() != g_fini_thread) { 2320 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2321 } else { 2322 bdev_module_fini_start_iter(NULL); 2323 } 2324 } 2325 2326 static void 2327 bdev_finish_wait_for_examine_done(void *cb_arg) 2328 { 2329 bdev_module_fini_start_iter(NULL); 2330 } 2331 2332 void 2333 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2334 { 2335 int rc; 2336 2337 assert(cb_fn != NULL); 2338 2339 g_fini_thread = spdk_get_thread(); 2340 2341 g_fini_cb_fn = cb_fn; 2342 g_fini_cb_arg = cb_arg; 2343 2344 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2345 if (rc != 0) { 2346 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2347 bdev_finish_wait_for_examine_done(NULL); 2348 } 2349 } 2350 2351 struct spdk_bdev_io * 2352 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2353 { 2354 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2355 struct spdk_bdev_io *bdev_io; 2356 2357 if (ch->per_thread_cache_count > 0) { 2358 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2359 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2360 ch->per_thread_cache_count--; 2361 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2362 /* 2363 * Don't try to look for bdev_ios in the global pool if there are 2364 * waiters on bdev_ios - we don't want this caller to jump the line. 
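 * Callers that get NULL back here typically retry by registering an
 * spdk_bdev_io_wait_entry via spdk_bdev_queue_io_wait(), as
 * bdev_queue_io_wait_with_cb() below does for split I/O.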
2365 */ 2366 bdev_io = NULL; 2367 } else { 2368 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2369 } 2370 2371 return bdev_io; 2372 } 2373 2374 void 2375 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2376 { 2377 struct spdk_bdev_mgmt_channel *ch; 2378 2379 assert(bdev_io != NULL); 2380 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2381 2382 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2383 2384 if (bdev_io->internal.buf != NULL) { 2385 bdev_io_put_buf(bdev_io); 2386 } 2387 2388 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2389 ch->per_thread_cache_count++; 2390 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2391 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2392 struct spdk_bdev_io_wait_entry *entry; 2393 2394 entry = TAILQ_FIRST(&ch->io_wait_queue); 2395 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2396 entry->cb_fn(entry->cb_arg); 2397 } 2398 } else { 2399 /* We should never have a full cache with entries on the io wait queue. */ 2400 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2401 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2402 } 2403 } 2404 2405 static bool 2406 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2407 { 2408 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2409 2410 switch (limit) { 2411 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2412 return true; 2413 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2414 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2415 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2416 return false; 2417 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2418 default: 2419 return false; 2420 } 2421 } 2422 2423 static bool 2424 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2425 { 2426 switch (bdev_io->type) { 2427 case SPDK_BDEV_IO_TYPE_NVME_IO: 2428 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2429 case SPDK_BDEV_IO_TYPE_READ: 2430 case SPDK_BDEV_IO_TYPE_WRITE: 2431 return true; 2432 case SPDK_BDEV_IO_TYPE_ZCOPY: 2433 if (bdev_io->u.bdev.zcopy.start) { 2434 return true; 2435 } else { 2436 return false; 2437 } 2438 default: 2439 return false; 2440 } 2441 } 2442 2443 static bool 2444 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2445 { 2446 switch (bdev_io->type) { 2447 case SPDK_BDEV_IO_TYPE_NVME_IO: 2448 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2449 /* Bit 1 (0x2) set for read operation */ 2450 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2451 return true; 2452 } else { 2453 return false; 2454 } 2455 case SPDK_BDEV_IO_TYPE_READ: 2456 return true; 2457 case SPDK_BDEV_IO_TYPE_ZCOPY: 2458 /* Populate to read from disk */ 2459 if (bdev_io->u.bdev.zcopy.populate) { 2460 return true; 2461 } else { 2462 return false; 2463 } 2464 default: 2465 return false; 2466 } 2467 } 2468 2469 static uint64_t 2470 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2471 { 2472 struct spdk_bdev *bdev = bdev_io->bdev; 2473 2474 switch (bdev_io->type) { 2475 case SPDK_BDEV_IO_TYPE_NVME_IO: 2476 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2477 return bdev_io->u.nvme_passthru.nbytes; 2478 case SPDK_BDEV_IO_TYPE_READ: 2479 case SPDK_BDEV_IO_TYPE_WRITE: 2480 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2481 case SPDK_BDEV_IO_TYPE_ZCOPY: 2482 /* Track the data in the start phase only */ 2483 if (bdev_io->u.bdev.zcopy.start) { 2484 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2485 } else { 2486 return 0; 2487 } 2488 default: 2489 return 0; 2490 } 2491 } 2492 2493 static bool 2494 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 
2495 { 2496 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2497 return true; 2498 } else { 2499 return false; 2500 } 2501 } 2502 2503 static bool 2504 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2505 { 2506 if (bdev_is_read_io(io) == false) { 2507 return false; 2508 } 2509 2510 return bdev_qos_rw_queue_io(limit, io); 2511 } 2512 2513 static bool 2514 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2515 { 2516 if (bdev_is_read_io(io) == true) { 2517 return false; 2518 } 2519 2520 return bdev_qos_rw_queue_io(limit, io); 2521 } 2522 2523 static void 2524 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2525 { 2526 limit->remaining_this_timeslice--; 2527 } 2528 2529 static void 2530 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2531 { 2532 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2533 } 2534 2535 static void 2536 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2537 { 2538 if (bdev_is_read_io(io) == false) { 2539 return; 2540 } 2541 2542 return bdev_qos_rw_bps_update_quota(limit, io); 2543 } 2544 2545 static void 2546 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2547 { 2548 if (bdev_is_read_io(io) == true) { 2549 return; 2550 } 2551 2552 return bdev_qos_rw_bps_update_quota(limit, io); 2553 } 2554 2555 static void 2556 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2557 { 2558 int i; 2559 2560 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2561 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2562 qos->rate_limits[i].queue_io = NULL; 2563 qos->rate_limits[i].update_quota = NULL; 2564 continue; 2565 } 2566 2567 switch (i) { 2568 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2569 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2570 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2571 break; 2572 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2573 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2574 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2575 break; 2576 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2577 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2578 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2579 break; 2580 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2581 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2582 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2583 break; 2584 default: 2585 break; 2586 } 2587 } 2588 } 2589 2590 static void 2591 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2592 struct spdk_bdev_io *bdev_io, 2593 enum spdk_bdev_io_status status) 2594 { 2595 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2596 2597 bdev_io->internal.in_submit_request = true; 2598 bdev_ch->io_outstanding++; 2599 shared_resource->io_outstanding++; 2600 spdk_bdev_io_complete(bdev_io, status); 2601 bdev_io->internal.in_submit_request = false; 2602 } 2603 2604 static inline void 2605 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2606 { 2607 struct spdk_bdev *bdev = bdev_io->bdev; 2608 struct spdk_io_channel *ch = bdev_ch->channel; 2609 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2610 2611 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2612 struct spdk_bdev_mgmt_channel *mgmt_channel = 
shared_resource->mgmt_ch; 2613 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2614 2615 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2616 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2617 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2618 SPDK_BDEV_IO_STATUS_SUCCESS); 2619 return; 2620 } 2621 } 2622 2623 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2624 bdev_io->bdev->split_on_write_unit && 2625 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2626 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2627 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2628 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2629 return; 2630 } 2631 2632 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2633 bdev_ch->io_outstanding++; 2634 shared_resource->io_outstanding++; 2635 bdev_io->internal.in_submit_request = true; 2636 bdev_submit_request(bdev, ch, bdev_io); 2637 bdev_io->internal.in_submit_request = false; 2638 } else { 2639 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2640 } 2641 } 2642 2643 static bool 2644 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2645 { 2646 int i; 2647 2648 if (bdev_qos_io_to_limit(bdev_io) == true) { 2649 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2650 if (!qos->rate_limits[i].queue_io) { 2651 continue; 2652 } 2653 2654 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2655 bdev_io) == true) { 2656 return true; 2657 } 2658 } 2659 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2660 if (!qos->rate_limits[i].update_quota) { 2661 continue; 2662 } 2663 2664 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2665 } 2666 } 2667 2668 return false; 2669 } 2670 2671 static inline void 2672 _bdev_io_do_submit(void *ctx) 2673 { 2674 struct spdk_bdev_io *bdev_io = ctx; 2675 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2676 2677 bdev_io_do_submit(ch, bdev_io); 2678 } 2679 2680 static int 2681 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2682 { 2683 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2684 int submitted_ios = 0; 2685 2686 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2687 if (!bdev_qos_queue_io(qos, bdev_io)) { 2688 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2689 2690 if (bdev_io->internal.io_submit_ch) { 2691 /* Send back the IO to the original thread for the actual processing. 
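 * The QoS channel only throttles; internal.io_submit_ch still records the
 * channel the caller submitted on, so the I/O is bounced back to that
 * channel's thread for the actual submission.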
*/ 2692 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2693 bdev_io->internal.io_submit_ch = NULL; 2694 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2695 _bdev_io_do_submit, bdev_io); 2696 } else { 2697 bdev_io_do_submit(ch, bdev_io); 2698 } 2699 2700 submitted_ios++; 2701 } 2702 } 2703 2704 return submitted_ios; 2705 } 2706 2707 static void 2708 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2709 { 2710 int rc; 2711 2712 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2713 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2714 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2715 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2716 &bdev_io->internal.waitq_entry); 2717 if (rc != 0) { 2718 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2719 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2720 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2721 } 2722 } 2723 2724 static bool 2725 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2726 { 2727 uint32_t io_boundary; 2728 struct spdk_bdev *bdev = bdev_io->bdev; 2729 uint32_t max_size = bdev->max_segment_size; 2730 int max_segs = bdev->max_num_segments; 2731 2732 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2733 io_boundary = bdev->write_unit_size; 2734 } else if (bdev->split_on_optimal_io_boundary) { 2735 io_boundary = bdev->optimal_io_boundary; 2736 } else { 2737 io_boundary = 0; 2738 } 2739 2740 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2741 return false; 2742 } 2743 2744 if (io_boundary) { 2745 uint64_t start_stripe, end_stripe; 2746 2747 start_stripe = bdev_io->u.bdev.offset_blocks; 2748 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2749 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
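 * For example, assuming io_boundary = 8 blocks (a power of two), an I/O at
 * offset_blocks = 6 with num_blocks = 4 gives start_stripe = 6 >> 3 = 0 and
 * end_stripe = 9 >> 3 = 1, so the stripes differ and the I/O must be split.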
*/ 2750 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2751 start_stripe >>= spdk_u32log2(io_boundary); 2752 end_stripe >>= spdk_u32log2(io_boundary); 2753 } else { 2754 start_stripe /= io_boundary; 2755 end_stripe /= io_boundary; 2756 } 2757 2758 if (start_stripe != end_stripe) { 2759 return true; 2760 } 2761 } 2762 2763 if (max_segs) { 2764 if (bdev_io->u.bdev.iovcnt > max_segs) { 2765 return true; 2766 } 2767 } 2768 2769 if (max_size) { 2770 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2771 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2772 return true; 2773 } 2774 } 2775 } 2776 2777 return false; 2778 } 2779 2780 static bool 2781 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2782 { 2783 uint32_t num_unmap_segments; 2784 2785 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2786 return false; 2787 } 2788 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2789 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2790 return true; 2791 } 2792 2793 return false; 2794 } 2795 2796 static bool 2797 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2798 { 2799 if (!bdev_io->bdev->max_write_zeroes) { 2800 return false; 2801 } 2802 2803 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2804 return true; 2805 } 2806 2807 return false; 2808 } 2809 2810 static bool 2811 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2812 { 2813 if (bdev_io->bdev->max_copy != 0 && 2814 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2815 return true; 2816 } 2817 2818 return false; 2819 } 2820 2821 static bool 2822 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2823 { 2824 switch (bdev_io->type) { 2825 case SPDK_BDEV_IO_TYPE_READ: 2826 case SPDK_BDEV_IO_TYPE_WRITE: 2827 return bdev_rw_should_split(bdev_io); 2828 case SPDK_BDEV_IO_TYPE_UNMAP: 2829 return bdev_unmap_should_split(bdev_io); 2830 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2831 return bdev_write_zeroes_should_split(bdev_io); 2832 case SPDK_BDEV_IO_TYPE_COPY: 2833 return bdev_copy_should_split(bdev_io); 2834 default: 2835 return false; 2836 } 2837 } 2838 2839 static uint32_t 2840 _to_next_boundary(uint64_t offset, uint32_t boundary) 2841 { 2842 return (boundary - (offset % boundary)); 2843 } 2844 2845 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2846 2847 static void _bdev_rw_split(void *_bdev_io); 2848 2849 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2850 2851 static void 2852 _bdev_unmap_split(void *_bdev_io) 2853 { 2854 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2855 } 2856 2857 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2858 2859 static void 2860 _bdev_write_zeroes_split(void *_bdev_io) 2861 { 2862 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2863 } 2864 2865 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2866 2867 static void 2868 _bdev_copy_split(void *_bdev_io) 2869 { 2870 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2871 } 2872 2873 static int 2874 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2875 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2876 { 2877 int rc; 2878 uint64_t current_offset, current_remaining, current_src_offset; 2879 spdk_bdev_io_wait_cb io_wait_fn; 2880 2881 current_offset = *offset; 2882 current_remaining = *remaining; 2883 2884 bdev_io->u.bdev.split_outstanding++; 2885 2886 io_wait_fn = 
_bdev_rw_split; 2887 switch (bdev_io->type) { 2888 case SPDK_BDEV_IO_TYPE_READ: 2889 assert(bdev_io->u.bdev.accel_sequence == NULL); 2890 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2891 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2892 iov, iovcnt, md_buf, current_offset, 2893 num_blocks, bdev_io->internal.memory_domain, 2894 bdev_io->internal.memory_domain_ctx, NULL, 2895 bdev_io_split_done, bdev_io); 2896 break; 2897 case SPDK_BDEV_IO_TYPE_WRITE: 2898 assert(bdev_io->u.bdev.accel_sequence == NULL); 2899 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2900 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2901 iov, iovcnt, md_buf, current_offset, 2902 num_blocks, bdev_io->internal.memory_domain, 2903 bdev_io->internal.memory_domain_ctx, NULL, 2904 bdev_io_split_done, bdev_io); 2905 break; 2906 case SPDK_BDEV_IO_TYPE_UNMAP: 2907 io_wait_fn = _bdev_unmap_split; 2908 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2909 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2910 current_offset, num_blocks, 2911 bdev_io_split_done, bdev_io); 2912 break; 2913 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2914 io_wait_fn = _bdev_write_zeroes_split; 2915 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2916 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2917 current_offset, num_blocks, 2918 bdev_io_split_done, bdev_io); 2919 break; 2920 case SPDK_BDEV_IO_TYPE_COPY: 2921 io_wait_fn = _bdev_copy_split; 2922 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2923 (current_offset - bdev_io->u.bdev.offset_blocks); 2924 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2925 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2926 current_offset, current_src_offset, num_blocks, 2927 bdev_io_split_done, bdev_io); 2928 break; 2929 default: 2930 assert(false); 2931 rc = -EINVAL; 2932 break; 2933 } 2934 2935 if (rc == 0) { 2936 current_offset += num_blocks; 2937 current_remaining -= num_blocks; 2938 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2939 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2940 *offset = current_offset; 2941 *remaining = current_remaining; 2942 } else { 2943 bdev_io->u.bdev.split_outstanding--; 2944 if (rc == -ENOMEM) { 2945 if (bdev_io->u.bdev.split_outstanding == 0) { 2946 /* No I/O is outstanding. Hence we should wait here. */ 2947 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2948 } 2949 } else { 2950 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2951 if (bdev_io->u.bdev.split_outstanding == 0) { 2952 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2953 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2954 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2955 } 2956 } 2957 } 2958 2959 return rc; 2960 } 2961 2962 static void 2963 _bdev_rw_split(void *_bdev_io) 2964 { 2965 struct iovec *parent_iov, *iov; 2966 struct spdk_bdev_io *bdev_io = _bdev_io; 2967 struct spdk_bdev *bdev = bdev_io->bdev; 2968 uint64_t parent_offset, current_offset, remaining; 2969 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2970 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2971 uint32_t iovcnt, iov_len, child_iovsize; 2972 uint32_t blocklen = bdev->blocklen; 2973 uint32_t io_boundary; 2974 uint32_t max_segment_size = bdev->max_segment_size; 2975 uint32_t max_child_iovcnt = bdev->max_num_segments; 2976 void *md_buf = NULL; 2977 int rc; 2978 2979 max_segment_size = max_segment_size ? 
max_segment_size : UINT32_MAX; 2980 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2981 SPDK_BDEV_IO_NUM_CHILD_IOV; 2982 2983 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2984 io_boundary = bdev->write_unit_size; 2985 } else if (bdev->split_on_optimal_io_boundary) { 2986 io_boundary = bdev->optimal_io_boundary; 2987 } else { 2988 io_boundary = UINT32_MAX; 2989 } 2990 2991 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2992 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2993 parent_offset = bdev_io->u.bdev.offset_blocks; 2994 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2995 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2996 2997 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2998 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2999 if (parent_iov_offset < parent_iov->iov_len) { 3000 break; 3001 } 3002 parent_iov_offset -= parent_iov->iov_len; 3003 } 3004 3005 child_iovcnt = 0; 3006 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3007 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3008 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3009 to_next_boundary = spdk_min(remaining, to_next_boundary); 3010 to_next_boundary_bytes = to_next_boundary * blocklen; 3011 3012 iov = &bdev_io->child_iov[child_iovcnt]; 3013 iovcnt = 0; 3014 3015 if (bdev_io->u.bdev.md_buf) { 3016 md_buf = (char *)bdev_io->u.bdev.md_buf + 3017 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3018 } 3019 3020 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3021 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3022 iovcnt < child_iovsize) { 3023 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3024 iov_len = parent_iov->iov_len - parent_iov_offset; 3025 3026 iov_len = spdk_min(iov_len, max_segment_size); 3027 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3028 to_next_boundary_bytes -= iov_len; 3029 3030 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3031 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3032 3033 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3034 parent_iov_offset += iov_len; 3035 } else { 3036 parent_iovpos++; 3037 parent_iov_offset = 0; 3038 } 3039 child_iovcnt++; 3040 iovcnt++; 3041 } 3042 3043 if (to_next_boundary_bytes > 0) { 3044 /* We had to stop this child I/O early because we ran out of 3045 * child_iov space or were limited by max_num_segments. 3046 * Ensure the iovs to be aligned with block size and 3047 * then adjust to_next_boundary before starting the 3048 * child I/O. 
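 * For example, assuming blocklen = 512 and an 8-block (4096-byte) child of
 * which only 3000 bytes fit into the child iovs: 1096 bytes remain, so 440
 * bytes are trimmed from the tail, the child covers exactly 2560 bytes
 * (5 blocks), and to_next_boundary is reduced from 8 to 5 below.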
3049 */ 3050 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 3051 iovcnt == child_iovsize); 3052 to_last_block_bytes = to_next_boundary_bytes % blocklen; 3053 if (to_last_block_bytes != 0) { 3054 uint32_t child_iovpos = child_iovcnt - 1; 3055 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV 3056 * so the loop will naturally end 3057 */ 3058 3059 to_last_block_bytes = blocklen - to_last_block_bytes; 3060 to_next_boundary_bytes += to_last_block_bytes; 3061 while (to_last_block_bytes > 0 && iovcnt > 0) { 3062 iov_len = spdk_min(to_last_block_bytes, 3063 bdev_io->child_iov[child_iovpos].iov_len); 3064 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 3065 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 3066 child_iovpos--; 3067 if (--iovcnt == 0) { 3068 /* If the child IO is less than a block size, just return. 3069 * If the first child IO of any split round is less than 3070 * a block size, exit with an error. 3071 */ 3072 if (bdev_io->u.bdev.split_outstanding == 0) { 3073 SPDK_ERRLOG("The first child io was less than a block size\n"); 3074 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3075 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 3076 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 3077 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3078 } 3079 3080 return; 3081 } 3082 } 3083 3084 to_last_block_bytes -= iov_len; 3085 3086 if (parent_iov_offset == 0) { 3087 parent_iovpos--; 3088 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 3089 } 3090 parent_iov_offset -= iov_len; 3091 } 3092 3093 assert(to_last_block_bytes == 0); 3094 } 3095 to_next_boundary -= to_next_boundary_bytes / blocklen; 3096 } 3097 3098 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 3099 &current_offset, &remaining); 3100 if (spdk_unlikely(rc)) { 3101 return; 3102 } 3103 } 3104 } 3105 3106 static void 3107 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 3108 { 3109 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 3110 uint32_t num_children_reqs = 0; 3111 int rc; 3112 3113 offset = bdev_io->u.bdev.split_current_offset_blocks; 3114 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3115 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 3116 3117 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3118 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 3119 3120 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 3121 &offset, &remaining); 3122 if (spdk_likely(rc == 0)) { 3123 num_children_reqs++; 3124 } else { 3125 return; 3126 } 3127 } 3128 } 3129 3130 static void 3131 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3132 { 3133 uint64_t offset, write_zeroes_blocks, remaining; 3134 uint32_t num_children_reqs = 0; 3135 int rc; 3136 3137 offset = bdev_io->u.bdev.split_current_offset_blocks; 3138 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3139 3140 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3141 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3142 3143 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3144 &offset, &remaining); 3145 if (spdk_likely(rc == 0)) { 3146 num_children_reqs++; 3147 } else { 3148 return; 3149 } 3150 } 3151 } 3152 3153 static void 3154 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3155 { 3156 uint64_t offset,
copy_blocks, remaining; 3157 uint32_t num_children_reqs = 0; 3158 int rc; 3159 3160 offset = bdev_io->u.bdev.split_current_offset_blocks; 3161 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3162 3163 assert(bdev_io->bdev->max_copy != 0); 3164 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3165 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3166 3167 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3168 &offset, &remaining); 3169 if (spdk_likely(rc == 0)) { 3170 num_children_reqs++; 3171 } else { 3172 return; 3173 } 3174 } 3175 } 3176 3177 static void 3178 parent_bdev_io_complete(void *ctx, int rc) 3179 { 3180 struct spdk_bdev_io *parent_io = ctx; 3181 3182 if (rc) { 3183 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3184 } 3185 3186 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3187 parent_io->internal.caller_ctx); 3188 } 3189 3190 static void 3191 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3192 { 3193 struct spdk_bdev_io *bdev_io = ctx; 3194 3195 /* u.bdev.accel_sequence should have already been cleared at this point */ 3196 assert(bdev_io->u.bdev.accel_sequence == NULL); 3197 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3198 bdev_io->internal.accel_sequence = NULL; 3199 3200 if (spdk_unlikely(status != 0)) { 3201 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3202 } 3203 3204 parent_bdev_io_complete(bdev_io, status); 3205 } 3206 3207 static void 3208 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3209 { 3210 struct spdk_bdev_io *parent_io = cb_arg; 3211 3212 spdk_bdev_free_io(bdev_io); 3213 3214 if (!success) { 3215 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3216 /* If any child I/O failed, stop the splitting process. */ 3217 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3218 parent_io->u.bdev.split_remaining_num_blocks = 0; 3219 } 3220 parent_io->u.bdev.split_outstanding--; 3221 if (parent_io->u.bdev.split_outstanding != 0) { 3222 return; 3223 } 3224 3225 /* 3226 * Parent I/O finishes when all blocks are consumed. 3227 */ 3228 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3229 assert(parent_io->internal.cb != bdev_io_split_done); 3230 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, parent_io->internal.caller_ctx); 3231 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3232 3233 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3234 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3235 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3236 return; 3237 } else if (parent_io->internal.orig_iovcnt != 0) { 3238 /* bdev IO will be completed in the callback */ 3239 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3240 return; 3241 } 3242 } 3243 3244 parent_bdev_io_complete(parent_io, 0); 3245 return; 3246 } 3247 3248 /* 3249 * Continue with the splitting process. This function will complete the parent I/O if the 3250 * splitting is done.
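 * Each of the calls below submits at most a bounded batch of children per
 * round (the UNMAP, WRITE ZEROES and COPY paths are capped by the
 * SPDK_BDEV_MAX_CHILDREN_* limits, the read/write path by the child_iov
 * array), so a large request is split incrementally rather than all at once.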
3251 */ 3252 switch (parent_io->type) { 3253 case SPDK_BDEV_IO_TYPE_READ: 3254 case SPDK_BDEV_IO_TYPE_WRITE: 3255 _bdev_rw_split(parent_io); 3256 break; 3257 case SPDK_BDEV_IO_TYPE_UNMAP: 3258 bdev_unmap_split(parent_io); 3259 break; 3260 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3261 bdev_write_zeroes_split(parent_io); 3262 break; 3263 case SPDK_BDEV_IO_TYPE_COPY: 3264 bdev_copy_split(parent_io); 3265 break; 3266 default: 3267 assert(false); 3268 break; 3269 } 3270 } 3271 3272 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3273 bool success); 3274 3275 static void 3276 bdev_io_split(struct spdk_bdev_io *bdev_io) 3277 { 3278 assert(bdev_io_should_split(bdev_io)); 3279 3280 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3281 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3282 bdev_io->u.bdev.split_outstanding = 0; 3283 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3284 3285 switch (bdev_io->type) { 3286 case SPDK_BDEV_IO_TYPE_READ: 3287 case SPDK_BDEV_IO_TYPE_WRITE: 3288 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3289 _bdev_rw_split(bdev_io); 3290 } else { 3291 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3292 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3293 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3294 } 3295 break; 3296 case SPDK_BDEV_IO_TYPE_UNMAP: 3297 bdev_unmap_split(bdev_io); 3298 break; 3299 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3300 bdev_write_zeroes_split(bdev_io); 3301 break; 3302 case SPDK_BDEV_IO_TYPE_COPY: 3303 bdev_copy_split(bdev_io); 3304 break; 3305 default: 3306 assert(false); 3307 break; 3308 } 3309 } 3310 3311 static void 3312 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3313 { 3314 if (!success) { 3315 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3316 return; 3317 } 3318 3319 _bdev_rw_split(bdev_io); 3320 } 3321 3322 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3323 * be inlined, at least on some compilers. 
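 * It is passed to spdk_thread_send_msg() in bdev_io_submit() when an I/O
 * needs to hop to the QoS thread, and called directly in all other cases.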
3324 */ 3325 static inline void 3326 _bdev_io_submit(void *ctx) 3327 { 3328 struct spdk_bdev_io *bdev_io = ctx; 3329 struct spdk_bdev *bdev = bdev_io->bdev; 3330 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3331 3332 if (spdk_likely(bdev_ch->flags == 0)) { 3333 bdev_io_do_submit(bdev_ch, bdev_io); 3334 return; 3335 } 3336 3337 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3338 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3339 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3340 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3341 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 3342 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3343 } else { 3344 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 3345 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3346 } 3347 } else { 3348 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3349 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3350 } 3351 } 3352 3353 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3354 3355 bool 3356 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3357 { 3358 if (range1->length == 0 || range2->length == 0) { 3359 return false; 3360 } 3361 3362 if (range1->offset + range1->length <= range2->offset) { 3363 return false; 3364 } 3365 3366 if (range2->offset + range2->length <= range1->offset) { 3367 return false; 3368 } 3369 3370 return true; 3371 } 3372 3373 static bool 3374 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3375 { 3376 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3377 struct lba_range r; 3378 3379 switch (bdev_io->type) { 3380 case SPDK_BDEV_IO_TYPE_NVME_IO: 3381 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3382 /* Don't try to decode the NVMe command - just assume worst-case and that 3383 * it overlaps a locked range. 3384 */ 3385 return true; 3386 case SPDK_BDEV_IO_TYPE_WRITE: 3387 case SPDK_BDEV_IO_TYPE_UNMAP: 3388 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3389 case SPDK_BDEV_IO_TYPE_ZCOPY: 3390 case SPDK_BDEV_IO_TYPE_COPY: 3391 r.offset = bdev_io->u.bdev.offset_blocks; 3392 r.length = bdev_io->u.bdev.num_blocks; 3393 if (!bdev_lba_range_overlapped(range, &r)) { 3394 /* This I/O doesn't overlap the specified LBA range. */ 3395 return false; 3396 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3397 /* This I/O overlaps, but the I/O is on the same channel that locked this 3398 * range, and the caller_ctx is the same as the locked_ctx. This means 3399 * that this I/O is associated with the lock, and is allowed to execute. 
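 * This is what lets the owner of a range lock keep issuing I/O to that
 * range (for example while copying or zeroing it) while every other
 * submitter is parked on the channel's io_locked queue until the range is
 * unlocked.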
3400 */ 3401 return false; 3402 } else { 3403 return true; 3404 } 3405 default: 3406 return false; 3407 } 3408 } 3409 3410 void 3411 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3412 { 3413 struct spdk_bdev *bdev = bdev_io->bdev; 3414 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3415 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3416 3417 assert(thread != NULL); 3418 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3419 3420 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3421 struct lba_range *range; 3422 3423 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3424 if (bdev_io_range_is_locked(bdev_io, range)) { 3425 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3426 return; 3427 } 3428 } 3429 } 3430 3431 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3432 3433 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3434 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3435 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3436 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3437 spdk_bdev_get_name(bdev)); 3438 3439 if (bdev_io->internal.split) { 3440 bdev_io_split(bdev_io); 3441 return; 3442 } 3443 3444 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3445 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3446 _bdev_io_submit(bdev_io); 3447 } else { 3448 bdev_io->internal.io_submit_ch = ch; 3449 bdev_io->internal.ch = bdev->internal.qos->ch; 3450 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3451 } 3452 } else { 3453 _bdev_io_submit(bdev_io); 3454 } 3455 } 3456 3457 static inline void 3458 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3459 { 3460 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3461 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3462 * For write operation we need to pull buffers from memory domain before submitting IO. 3463 * Once read operation completes, we need to use memory_domain push functionality to 3464 * update data in original memory domain IO buffer 3465 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3466 bdev_io->u.bdev.memory_domain = NULL; 3467 bdev_io->u.bdev.memory_domain_ctx = NULL; 3468 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3469 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3470 } 3471 3472 static inline void 3473 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3474 { 3475 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3476 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3477 3478 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3479 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3480 bdev_io_complete_unsubmitted(bdev_io); 3481 return; 3482 } 3483 3484 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3485 * support them, but we need to execute an accel sequence and the data buffer is from accel 3486 * memory domain (to avoid doing a push/pull from that domain). 
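 * In that case the data is staged through a regular host bounce buffer
 * instead, and for reads it is pushed back into the caller's memory domain
 * on completion (see bdev_io_push_bounce_data()).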
3487 */ 3488 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3489 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3490 _bdev_io_ext_use_bounce_buffer(bdev_io); 3491 return; 3492 } 3493 3494 if (needs_exec) { 3495 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3496 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3497 return; 3498 } 3499 /* For reads we'll execute the sequence after the data is read, so, for now, only 3500 * clear out accel_sequence pointer and submit the IO */ 3501 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3502 bdev_io->u.bdev.accel_sequence = NULL; 3503 } 3504 3505 bdev_io_submit(bdev_io); 3506 } 3507 3508 static void 3509 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3510 { 3511 struct spdk_bdev *bdev = bdev_io->bdev; 3512 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3513 struct spdk_io_channel *ch = bdev_ch->channel; 3514 3515 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3516 3517 bdev_io->internal.in_submit_request = true; 3518 bdev_submit_request(bdev, ch, bdev_io); 3519 bdev_io->internal.in_submit_request = false; 3520 } 3521 3522 void 3523 bdev_io_init(struct spdk_bdev_io *bdev_io, 3524 struct spdk_bdev *bdev, void *cb_arg, 3525 spdk_bdev_io_completion_cb cb) 3526 { 3527 bdev_io->bdev = bdev; 3528 bdev_io->internal.caller_ctx = cb_arg; 3529 bdev_io->internal.cb = cb; 3530 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3531 bdev_io->internal.in_submit_request = false; 3532 bdev_io->internal.buf = NULL; 3533 bdev_io->internal.io_submit_ch = NULL; 3534 bdev_io->internal.orig_iovs = NULL; 3535 bdev_io->internal.orig_iovcnt = 0; 3536 bdev_io->internal.orig_md_iov.iov_base = NULL; 3537 bdev_io->internal.error.nvme.cdw0 = 0; 3538 bdev_io->num_retries = 0; 3539 bdev_io->internal.get_buf_cb = NULL; 3540 bdev_io->internal.get_aux_buf_cb = NULL; 3541 bdev_io->internal.memory_domain = NULL; 3542 bdev_io->internal.memory_domain_ctx = NULL; 3543 bdev_io->internal.data_transfer_cpl = NULL; 3544 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3545 bdev_io->internal.accel_sequence = NULL; 3546 } 3547 3548 static bool 3549 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3550 { 3551 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3552 } 3553 3554 bool 3555 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3556 { 3557 bool supported; 3558 3559 supported = bdev_io_type_supported(bdev, io_type); 3560 3561 if (!supported) { 3562 switch (io_type) { 3563 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3564 /* The bdev layer will emulate write zeroes as long as write is supported. 
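 * The emulated path is implemented with regular writes sourced from the
 * shared zero buffer that spdk_bdev_initialize() allocates
 * (g_bdev_mgr.zero_buffer).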
*/ 3565 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3566 break; 3567 default: 3568 break; 3569 } 3570 } 3571 3572 return supported; 3573 } 3574 3575 uint64_t 3576 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3577 { 3578 return bdev_io->internal.submit_tsc; 3579 } 3580 3581 int 3582 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3583 { 3584 if (bdev->fn_table->dump_info_json) { 3585 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3586 } 3587 3588 return 0; 3589 } 3590 3591 static void 3592 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3593 { 3594 uint32_t max_per_timeslice = 0; 3595 int i; 3596 3597 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3598 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3599 qos->rate_limits[i].max_per_timeslice = 0; 3600 continue; 3601 } 3602 3603 max_per_timeslice = qos->rate_limits[i].limit * 3604 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3605 3606 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3607 qos->rate_limits[i].min_per_timeslice); 3608 3609 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3610 } 3611 3612 bdev_qos_set_ops(qos); 3613 } 3614 3615 static int 3616 bdev_channel_poll_qos(void *arg) 3617 { 3618 struct spdk_bdev_qos *qos = arg; 3619 uint64_t now = spdk_get_ticks(); 3620 int i; 3621 3622 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3623 /* We received our callback earlier than expected - return 3624 * immediately and wait to do accounting until at least one 3625 * timeslice has actually expired. This should never happen 3626 * with a well-behaved timer implementation. 3627 */ 3628 return SPDK_POLLER_IDLE; 3629 } 3630 3631 /* Reset for next round of rate limiting */ 3632 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3633 /* We may have allowed the IOs or bytes to slightly overrun in the last 3634 * timeslice. remaining_this_timeslice is signed, so if it's negative 3635 * here, we'll account for the overrun so that the next timeslice will 3636 * be appropriately reduced. 
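 * For example, with the 1 ms timeslice used here, an rw_mbytes_per_sec
 * limit of 1 yields a max_per_timeslice of roughly 1048 bytes; a single
 * 4 KiB I/O submitted with 1048 bytes remaining leaves
 * remaining_this_timeslice at about -3048, so the next few refills below
 * largely go toward repaying that debt.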
3637 */ 3638 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3639 qos->rate_limits[i].remaining_this_timeslice = 0; 3640 } 3641 } 3642 3643 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3644 qos->last_timeslice += qos->timeslice_size; 3645 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3646 qos->rate_limits[i].remaining_this_timeslice += 3647 qos->rate_limits[i].max_per_timeslice; 3648 } 3649 } 3650 3651 return bdev_qos_io_submit(qos->ch, qos); 3652 } 3653 3654 static void 3655 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3656 { 3657 struct spdk_bdev_shared_resource *shared_resource; 3658 struct lba_range *range; 3659 3660 bdev_free_io_stat(ch->stat); 3661 #ifdef SPDK_CONFIG_VTUNE 3662 bdev_free_io_stat(ch->prev_stat); 3663 #endif 3664 3665 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3666 range = TAILQ_FIRST(&ch->locked_ranges); 3667 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3668 free(range); 3669 } 3670 3671 spdk_put_io_channel(ch->channel); 3672 spdk_put_io_channel(ch->accel_channel); 3673 3674 shared_resource = ch->shared_resource; 3675 3676 assert(TAILQ_EMPTY(&ch->io_locked)); 3677 assert(TAILQ_EMPTY(&ch->io_submitted)); 3678 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3679 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3680 assert(ch->io_outstanding == 0); 3681 assert(shared_resource->ref > 0); 3682 shared_resource->ref--; 3683 if (shared_resource->ref == 0) { 3684 assert(shared_resource->io_outstanding == 0); 3685 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3686 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3687 free(shared_resource); 3688 } 3689 } 3690 3691 static void 3692 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3693 { 3694 struct spdk_bdev_qos *qos = bdev->internal.qos; 3695 int i; 3696 3697 assert(spdk_spin_held(&bdev->internal.spinlock)); 3698 3699 /* Rate limiting on this bdev enabled */ 3700 if (qos) { 3701 if (qos->ch == NULL) { 3702 struct spdk_io_channel *io_ch; 3703 3704 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3705 bdev->name, spdk_get_thread()); 3706 3707 /* No qos channel has been selected, so set one up */ 3708 3709 /* Take another reference to ch */ 3710 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3711 assert(io_ch != NULL); 3712 qos->ch = ch; 3713 3714 qos->thread = spdk_io_channel_get_thread(io_ch); 3715 3716 TAILQ_INIT(&qos->queued); 3717 3718 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3719 if (bdev_qos_is_iops_rate_limit(i) == true) { 3720 qos->rate_limits[i].min_per_timeslice = 3721 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3722 } else { 3723 qos->rate_limits[i].min_per_timeslice = 3724 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3725 } 3726 3727 if (qos->rate_limits[i].limit == 0) { 3728 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3729 } 3730 } 3731 bdev_qos_update_max_quota_per_timeslice(qos); 3732 qos->timeslice_size = 3733 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3734 qos->last_timeslice = spdk_get_ticks(); 3735 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3736 qos, 3737 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3738 } 3739 3740 ch->flags |= BDEV_CH_QOS_ENABLED; 3741 } 3742 } 3743 3744 struct poll_timeout_ctx { 3745 struct spdk_bdev_desc *desc; 3746 uint64_t timeout_in_sec; 3747 spdk_bdev_io_timeout_cb cb_fn; 3748 void *cb_arg; 3749 }; 3750 3751 static void 3752 bdev_desc_free(struct spdk_bdev_desc 
*desc) 3753 { 3754 spdk_spin_destroy(&desc->spinlock); 3755 free(desc->media_events_buffer); 3756 free(desc); 3757 } 3758 3759 static void 3760 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3761 { 3762 struct poll_timeout_ctx *ctx = _ctx; 3763 struct spdk_bdev_desc *desc = ctx->desc; 3764 3765 free(ctx); 3766 3767 spdk_spin_lock(&desc->spinlock); 3768 desc->refs--; 3769 if (desc->closed == true && desc->refs == 0) { 3770 spdk_spin_unlock(&desc->spinlock); 3771 bdev_desc_free(desc); 3772 return; 3773 } 3774 spdk_spin_unlock(&desc->spinlock); 3775 } 3776 3777 static void 3778 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3779 struct spdk_io_channel *io_ch, void *_ctx) 3780 { 3781 struct poll_timeout_ctx *ctx = _ctx; 3782 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3783 struct spdk_bdev_desc *desc = ctx->desc; 3784 struct spdk_bdev_io *bdev_io; 3785 uint64_t now; 3786 3787 spdk_spin_lock(&desc->spinlock); 3788 if (desc->closed == true) { 3789 spdk_spin_unlock(&desc->spinlock); 3790 spdk_bdev_for_each_channel_continue(i, -1); 3791 return; 3792 } 3793 spdk_spin_unlock(&desc->spinlock); 3794 3795 now = spdk_get_ticks(); 3796 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3797 /* Exclude any I/O that are generated via splitting. */ 3798 if (bdev_io->internal.cb == bdev_io_split_done) { 3799 continue; 3800 } 3801 3802 /* Once we find an I/O that has not timed out, we can immediately 3803 * exit the loop. 3804 */ 3805 if (now < (bdev_io->internal.submit_tsc + 3806 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3807 goto end; 3808 } 3809 3810 if (bdev_io->internal.desc == desc) { 3811 ctx->cb_fn(ctx->cb_arg, bdev_io); 3812 } 3813 } 3814 3815 end: 3816 spdk_bdev_for_each_channel_continue(i, 0); 3817 } 3818 3819 static int 3820 bdev_poll_timeout_io(void *arg) 3821 { 3822 struct spdk_bdev_desc *desc = arg; 3823 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3824 struct poll_timeout_ctx *ctx; 3825 3826 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3827 if (!ctx) { 3828 SPDK_ERRLOG("failed to allocate memory\n"); 3829 return SPDK_POLLER_BUSY; 3830 } 3831 ctx->desc = desc; 3832 ctx->cb_arg = desc->cb_arg; 3833 ctx->cb_fn = desc->cb_fn; 3834 ctx->timeout_in_sec = desc->timeout_in_sec; 3835 3836 /* Take a ref on the descriptor in case it gets closed while we are checking 3837 * all of the channels. 
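* The reference is dropped in bdev_channel_poll_timeout_io_done(); if the
* descriptor was closed while the per-channel walk was still in flight, that
* completion also frees it via bdev_desc_free().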
3838 */ 3839 spdk_spin_lock(&desc->spinlock); 3840 desc->refs++; 3841 spdk_spin_unlock(&desc->spinlock); 3842 3843 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3844 bdev_channel_poll_timeout_io_done); 3845 3846 return SPDK_POLLER_BUSY; 3847 } 3848 3849 int 3850 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3851 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3852 { 3853 assert(desc->thread == spdk_get_thread()); 3854 3855 spdk_poller_unregister(&desc->io_timeout_poller); 3856 3857 if (timeout_in_sec) { 3858 assert(cb_fn != NULL); 3859 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3860 desc, 3861 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3862 1000); 3863 if (desc->io_timeout_poller == NULL) { 3864 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3865 return -1; 3866 } 3867 } 3868 3869 desc->cb_fn = cb_fn; 3870 desc->cb_arg = cb_arg; 3871 desc->timeout_in_sec = timeout_in_sec; 3872 3873 return 0; 3874 } 3875 3876 static int 3877 bdev_channel_create(void *io_device, void *ctx_buf) 3878 { 3879 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3880 struct spdk_bdev_channel *ch = ctx_buf; 3881 struct spdk_io_channel *mgmt_io_ch; 3882 struct spdk_bdev_mgmt_channel *mgmt_ch; 3883 struct spdk_bdev_shared_resource *shared_resource; 3884 struct lba_range *range; 3885 3886 ch->bdev = bdev; 3887 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3888 if (!ch->channel) { 3889 return -1; 3890 } 3891 3892 ch->accel_channel = spdk_accel_get_io_channel(); 3893 if (!ch->accel_channel) { 3894 spdk_put_io_channel(ch->channel); 3895 return -1; 3896 } 3897 3898 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3899 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3900 3901 assert(ch->histogram == NULL); 3902 if (bdev->internal.histogram_enabled) { 3903 ch->histogram = spdk_histogram_data_alloc(); 3904 if (ch->histogram == NULL) { 3905 SPDK_ERRLOG("Could not allocate histogram\n"); 3906 } 3907 } 3908 3909 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3910 if (!mgmt_io_ch) { 3911 spdk_put_io_channel(ch->channel); 3912 spdk_put_io_channel(ch->accel_channel); 3913 return -1; 3914 } 3915 3916 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3917 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3918 if (shared_resource->shared_ch == ch->channel) { 3919 spdk_put_io_channel(mgmt_io_ch); 3920 shared_resource->ref++; 3921 break; 3922 } 3923 } 3924 3925 if (shared_resource == NULL) { 3926 shared_resource = calloc(1, sizeof(*shared_resource)); 3927 if (shared_resource == NULL) { 3928 spdk_put_io_channel(ch->channel); 3929 spdk_put_io_channel(ch->accel_channel); 3930 spdk_put_io_channel(mgmt_io_ch); 3931 return -1; 3932 } 3933 3934 shared_resource->mgmt_ch = mgmt_ch; 3935 shared_resource->io_outstanding = 0; 3936 TAILQ_INIT(&shared_resource->nomem_io); 3937 shared_resource->nomem_threshold = 0; 3938 shared_resource->shared_ch = ch->channel; 3939 shared_resource->ref = 1; 3940 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3941 } 3942 3943 ch->io_outstanding = 0; 3944 TAILQ_INIT(&ch->queued_resets); 3945 TAILQ_INIT(&ch->locked_ranges); 3946 ch->flags = 0; 3947 ch->shared_resource = shared_resource; 3948 3949 TAILQ_INIT(&ch->io_submitted); 3950 TAILQ_INIT(&ch->io_locked); 3951 TAILQ_INIT(&ch->io_accel_exec); 3952 TAILQ_INIT(&ch->io_memory_domain); 3953 3954 ch->stat = bdev_alloc_io_stat(false); 3955 if (ch->stat == NULL) { 3956 
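/* Stat allocation failed; unwind the I/O channel, accel channel and shared
 * resource reference taken above. */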
bdev_channel_destroy_resource(ch); 3957 return -1; 3958 } 3959 3960 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3961 3962 #ifdef SPDK_CONFIG_VTUNE 3963 { 3964 char *name; 3965 __itt_init_ittlib(NULL, 0); 3966 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3967 if (!name) { 3968 bdev_channel_destroy_resource(ch); 3969 return -1; 3970 } 3971 ch->handle = __itt_string_handle_create(name); 3972 free(name); 3973 ch->start_tsc = spdk_get_ticks(); 3974 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3975 ch->prev_stat = bdev_alloc_io_stat(false); 3976 if (ch->prev_stat == NULL) { 3977 bdev_channel_destroy_resource(ch); 3978 return -1; 3979 } 3980 } 3981 #endif 3982 3983 spdk_spin_lock(&bdev->internal.spinlock); 3984 bdev_enable_qos(bdev, ch); 3985 3986 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3987 struct lba_range *new_range; 3988 3989 new_range = calloc(1, sizeof(*new_range)); 3990 if (new_range == NULL) { 3991 spdk_spin_unlock(&bdev->internal.spinlock); 3992 bdev_channel_destroy_resource(ch); 3993 return -1; 3994 } 3995 new_range->length = range->length; 3996 new_range->offset = range->offset; 3997 new_range->locked_ctx = range->locked_ctx; 3998 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3999 } 4000 4001 spdk_spin_unlock(&bdev->internal.spinlock); 4002 4003 return 0; 4004 } 4005 4006 static int 4007 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4008 void *cb_ctx) 4009 { 4010 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4011 struct spdk_bdev_io *bdev_io; 4012 uint64_t buf_len; 4013 4014 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4015 if (bdev_io->internal.ch == bdev_ch) { 4016 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4017 spdk_iobuf_entry_abort(ch, entry, buf_len); 4018 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4019 } 4020 4021 return 0; 4022 } 4023 4024 /* 4025 * Abort I/O that are waiting on a data buffer. 4026 */ 4027 static void 4028 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4029 { 4030 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4031 bdev_abort_all_buf_io_cb, ch); 4032 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4033 bdev_abort_all_buf_io_cb, ch); 4034 } 4035 4036 /* 4037 * Abort I/O that are queued waiting for submission. These types of I/O are 4038 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4039 */ 4040 static void 4041 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4042 { 4043 struct spdk_bdev_io *bdev_io, *tmp; 4044 4045 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4046 if (bdev_io->internal.ch == ch) { 4047 TAILQ_REMOVE(queue, bdev_io, internal.link); 4048 /* 4049 * spdk_bdev_io_complete() assumes that the completed I/O had 4050 * been submitted to the bdev module. Since in this case it 4051 * hadn't, bump io_outstanding to account for the decrement 4052 * that spdk_bdev_io_complete() will do. 
4053 */ 4054 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4055 ch->io_outstanding++; 4056 ch->shared_resource->io_outstanding++; 4057 } 4058 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4059 } 4060 } 4061 } 4062 4063 static bool 4064 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4065 { 4066 struct spdk_bdev_io *bdev_io; 4067 4068 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4069 if (bdev_io == bio_to_abort) { 4070 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4071 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4072 return true; 4073 } 4074 } 4075 4076 return false; 4077 } 4078 4079 static int 4080 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4081 { 4082 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4083 uint64_t buf_len; 4084 4085 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4086 if (bdev_io == bio_to_abort) { 4087 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4088 spdk_iobuf_entry_abort(ch, entry, buf_len); 4089 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4090 return 1; 4091 } 4092 4093 return 0; 4094 } 4095 4096 static bool 4097 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4098 { 4099 int rc; 4100 4101 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4102 bdev_abort_buf_io_cb, bio_to_abort); 4103 if (rc == 1) { 4104 return true; 4105 } 4106 4107 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4108 bdev_abort_buf_io_cb, bio_to_abort); 4109 return rc == 1; 4110 } 4111 4112 static void 4113 bdev_qos_channel_destroy(void *cb_arg) 4114 { 4115 struct spdk_bdev_qos *qos = cb_arg; 4116 4117 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4118 spdk_poller_unregister(&qos->poller); 4119 4120 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4121 4122 free(qos); 4123 } 4124 4125 static int 4126 bdev_qos_destroy(struct spdk_bdev *bdev) 4127 { 4128 int i; 4129 4130 /* 4131 * Cleanly shutting down the QoS poller is tricky, because 4132 * during the asynchronous operation the user could open 4133 * a new descriptor and create a new channel, spawning 4134 * a new QoS poller. 4135 * 4136 * The strategy is to create a new QoS structure here and swap it 4137 * in. The shutdown path then continues to refer to the old one 4138 * until it completes and then releases it. 4139 */ 4140 struct spdk_bdev_qos *new_qos, *old_qos; 4141 4142 old_qos = bdev->internal.qos; 4143 4144 new_qos = calloc(1, sizeof(*new_qos)); 4145 if (!new_qos) { 4146 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4147 return -ENOMEM; 4148 } 4149 4150 /* Copy the old QoS data into the newly allocated structure */ 4151 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4152 4153 /* Zero out the key parts of the QoS structure */ 4154 new_qos->ch = NULL; 4155 new_qos->thread = NULL; 4156 new_qos->poller = NULL; 4157 TAILQ_INIT(&new_qos->queued); 4158 /* 4159 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4160 * It will be used later for the new QoS structure. 
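* This keeps the user-configured rate limits in effect, so QoS is enforced
* again as soon as a new channel is selected as the QoS channel.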
4161 */ 4162 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4163 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4164 new_qos->rate_limits[i].min_per_timeslice = 0; 4165 new_qos->rate_limits[i].max_per_timeslice = 0; 4166 } 4167 4168 bdev->internal.qos = new_qos; 4169 4170 if (old_qos->thread == NULL) { 4171 free(old_qos); 4172 } else { 4173 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4174 } 4175 4176 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4177 * been destroyed yet. The destruction path will end up waiting for the final 4178 * channel to be put before it releases resources. */ 4179 4180 return 0; 4181 } 4182 4183 void 4184 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4185 { 4186 total->bytes_read += add->bytes_read; 4187 total->num_read_ops += add->num_read_ops; 4188 total->bytes_written += add->bytes_written; 4189 total->num_write_ops += add->num_write_ops; 4190 total->bytes_unmapped += add->bytes_unmapped; 4191 total->num_unmap_ops += add->num_unmap_ops; 4192 total->bytes_copied += add->bytes_copied; 4193 total->num_copy_ops += add->num_copy_ops; 4194 total->read_latency_ticks += add->read_latency_ticks; 4195 total->write_latency_ticks += add->write_latency_ticks; 4196 total->unmap_latency_ticks += add->unmap_latency_ticks; 4197 total->copy_latency_ticks += add->copy_latency_ticks; 4198 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4199 total->max_read_latency_ticks = add->max_read_latency_ticks; 4200 } 4201 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4202 total->min_read_latency_ticks = add->min_read_latency_ticks; 4203 } 4204 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4205 total->max_write_latency_ticks = add->max_write_latency_ticks; 4206 } 4207 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4208 total->min_write_latency_ticks = add->min_write_latency_ticks; 4209 } 4210 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4211 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4212 } 4213 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4214 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4215 } 4216 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4217 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4218 } 4219 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4220 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4221 } 4222 } 4223 4224 static void 4225 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4226 { 4227 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4228 4229 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4230 memcpy(to_stat->io_error, from_stat->io_error, 4231 sizeof(struct spdk_bdev_io_error_stat)); 4232 } 4233 } 4234 4235 void 4236 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4237 { 4238 stat->max_read_latency_ticks = 0; 4239 stat->min_read_latency_ticks = UINT64_MAX; 4240 stat->max_write_latency_ticks = 0; 4241 stat->min_write_latency_ticks = UINT64_MAX; 4242 stat->max_unmap_latency_ticks = 0; 4243 stat->min_unmap_latency_ticks = UINT64_MAX; 4244 stat->max_copy_latency_ticks = 0; 4245 stat->min_copy_latency_ticks = UINT64_MAX; 4246 4247 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4248 return; 4249 } 
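/* SPDK_BDEV_RESET_STAT_ALL additionally clears the cumulative byte and
 * operation counters, the latency totals and, when allocated, the per-status
 * error counts below; other modes only reset the max/min trackers above. */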
4250 4251 stat->bytes_read = 0; 4252 stat->num_read_ops = 0; 4253 stat->bytes_written = 0; 4254 stat->num_write_ops = 0; 4255 stat->bytes_unmapped = 0; 4256 stat->num_unmap_ops = 0; 4257 stat->bytes_copied = 0; 4258 stat->num_copy_ops = 0; 4259 stat->read_latency_ticks = 0; 4260 stat->write_latency_ticks = 0; 4261 stat->unmap_latency_ticks = 0; 4262 stat->copy_latency_ticks = 0; 4263 4264 if (stat->io_error != NULL) { 4265 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4266 } 4267 } 4268 4269 struct spdk_bdev_io_stat * 4270 bdev_alloc_io_stat(bool io_error_stat) 4271 { 4272 struct spdk_bdev_io_stat *stat; 4273 4274 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4275 if (stat == NULL) { 4276 return NULL; 4277 } 4278 4279 if (io_error_stat) { 4280 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4281 if (stat->io_error == NULL) { 4282 free(stat); 4283 return NULL; 4284 } 4285 } else { 4286 stat->io_error = NULL; 4287 } 4288 4289 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4290 4291 return stat; 4292 } 4293 4294 void 4295 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4296 { 4297 if (stat != NULL) { 4298 free(stat->io_error); 4299 free(stat); 4300 } 4301 } 4302 4303 void 4304 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4305 { 4306 int i; 4307 4308 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4309 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4310 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4311 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4312 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4313 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4314 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4315 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4316 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4317 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4318 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4319 stat->min_read_latency_ticks != UINT64_MAX ? 4320 stat->min_read_latency_ticks : 0); 4321 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4322 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4323 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4324 stat->min_write_latency_ticks != UINT64_MAX ? 4325 stat->min_write_latency_ticks : 0); 4326 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4327 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4328 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4329 stat->min_unmap_latency_ticks != UINT64_MAX ? 4330 stat->min_unmap_latency_ticks : 0); 4331 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4332 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4333 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4334 stat->min_copy_latency_ticks != UINT64_MAX ? 
4335 stat->min_copy_latency_ticks : 0); 4336 4337 if (stat->io_error != NULL) { 4338 spdk_json_write_named_object_begin(w, "io_error"); 4339 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4340 if (stat->io_error->error_status[i] != 0) { 4341 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4342 stat->io_error->error_status[i]); 4343 } 4344 } 4345 spdk_json_write_object_end(w); 4346 } 4347 } 4348 4349 static void 4350 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4351 { 4352 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4353 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4354 4355 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4356 bdev_abort_all_buf_io(mgmt_ch, ch); 4357 } 4358 4359 static void 4360 bdev_channel_destroy(void *io_device, void *ctx_buf) 4361 { 4362 struct spdk_bdev_channel *ch = ctx_buf; 4363 4364 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4365 spdk_get_thread()); 4366 4367 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 4368 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4369 4370 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4371 spdk_spin_lock(&ch->bdev->internal.spinlock); 4372 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4373 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4374 4375 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4376 4377 bdev_channel_abort_queued_ios(ch); 4378 4379 if (ch->histogram) { 4380 spdk_histogram_data_free(ch->histogram); 4381 } 4382 4383 bdev_channel_destroy_resource(ch); 4384 } 4385 4386 /* 4387 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4388 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
4389 */ 4390 static int 4391 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4392 { 4393 struct spdk_bdev_name *tmp; 4394 4395 bdev_name->name = strdup(name); 4396 if (bdev_name->name == NULL) { 4397 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4398 return -ENOMEM; 4399 } 4400 4401 bdev_name->bdev = bdev; 4402 4403 spdk_spin_lock(&g_bdev_mgr.spinlock); 4404 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4405 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4406 4407 if (tmp != NULL) { 4408 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4409 free(bdev_name->name); 4410 return -EEXIST; 4411 } 4412 4413 return 0; 4414 } 4415 4416 static void 4417 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4418 { 4419 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4420 free(bdev_name->name); 4421 } 4422 4423 static void 4424 bdev_name_del(struct spdk_bdev_name *bdev_name) 4425 { 4426 spdk_spin_lock(&g_bdev_mgr.spinlock); 4427 bdev_name_del_unsafe(bdev_name); 4428 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4429 } 4430 4431 int 4432 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4433 { 4434 struct spdk_bdev_alias *tmp; 4435 int ret; 4436 4437 if (alias == NULL) { 4438 SPDK_ERRLOG("Empty alias passed\n"); 4439 return -EINVAL; 4440 } 4441 4442 tmp = calloc(1, sizeof(*tmp)); 4443 if (tmp == NULL) { 4444 SPDK_ERRLOG("Unable to allocate alias\n"); 4445 return -ENOMEM; 4446 } 4447 4448 ret = bdev_name_add(&tmp->alias, bdev, alias); 4449 if (ret != 0) { 4450 free(tmp); 4451 return ret; 4452 } 4453 4454 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4455 4456 return 0; 4457 } 4458 4459 static int 4460 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4461 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4462 { 4463 struct spdk_bdev_alias *tmp; 4464 4465 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4466 if (strcmp(alias, tmp->alias.name) == 0) { 4467 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4468 alias_del_fn(&tmp->alias); 4469 free(tmp); 4470 return 0; 4471 } 4472 } 4473 4474 return -ENOENT; 4475 } 4476 4477 int 4478 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4479 { 4480 int rc; 4481 4482 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4483 if (rc == -ENOENT) { 4484 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4485 } 4486 4487 return rc; 4488 } 4489 4490 void 4491 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4492 { 4493 struct spdk_bdev_alias *p, *tmp; 4494 4495 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4496 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4497 bdev_name_del(&p->alias); 4498 free(p); 4499 } 4500 } 4501 4502 struct spdk_io_channel * 4503 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4504 { 4505 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4506 } 4507 4508 void * 4509 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4510 { 4511 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4512 void *ctx = NULL; 4513 4514 if (bdev->fn_table->get_module_ctx) { 4515 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4516 } 4517 4518 return ctx; 4519 } 4520 4521 const char * 4522 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4523 { 4524 return bdev->module->name; 4525 } 4526 4527 const char * 4528 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4529 { 4530 return bdev->name; 4531 } 4532 4533 const char * 4534 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4535 { 4536 return bdev->product_name; 4537 } 4538 4539 
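/*
 * Illustrative usage sketch (not part of this file): the simple getters below
 * are typically used together when sizing and aligning data buffers for I/O.
 * The bdev name "Malloc0" and the elided descriptor, channel, callback and
 * error handling are assumptions for the example only.
 *
 *	struct spdk_bdev *bdev = spdk_bdev_get_by_name("Malloc0");
 *	uint32_t blocklen = spdk_bdev_get_block_size(bdev);
 *	size_t align = spdk_bdev_get_buf_align(bdev);
 *	void *buf = spdk_dma_zmalloc(blocklen, align, NULL);
 *	rc = spdk_bdev_read_blocks(desc, ch, buf, 0, 1, read_done_cb, NULL);
 */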
const struct spdk_bdev_aliases_list * 4540 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4541 { 4542 return &bdev->aliases; 4543 } 4544 4545 uint32_t 4546 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4547 { 4548 return bdev->blocklen; 4549 } 4550 4551 uint32_t 4552 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4553 { 4554 return bdev->write_unit_size; 4555 } 4556 4557 uint64_t 4558 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4559 { 4560 return bdev->blockcnt; 4561 } 4562 4563 const char * 4564 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4565 { 4566 return qos_rpc_type[type]; 4567 } 4568 4569 void 4570 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4571 { 4572 int i; 4573 4574 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4575 4576 spdk_spin_lock(&bdev->internal.spinlock); 4577 if (bdev->internal.qos) { 4578 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4579 if (bdev->internal.qos->rate_limits[i].limit != 4580 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4581 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4582 if (bdev_qos_is_iops_rate_limit(i) == false) { 4583 /* Change from Byte to Megabyte which is user visible. */ 4584 limits[i] = limits[i] / 1024 / 1024; 4585 } 4586 } 4587 } 4588 } 4589 spdk_spin_unlock(&bdev->internal.spinlock); 4590 } 4591 4592 size_t 4593 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4594 { 4595 return 1 << bdev->required_alignment; 4596 } 4597 4598 uint32_t 4599 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4600 { 4601 return bdev->optimal_io_boundary; 4602 } 4603 4604 bool 4605 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4606 { 4607 return bdev->write_cache; 4608 } 4609 4610 const struct spdk_uuid * 4611 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4612 { 4613 return &bdev->uuid; 4614 } 4615 4616 uint16_t 4617 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4618 { 4619 return bdev->acwu; 4620 } 4621 4622 uint32_t 4623 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4624 { 4625 return bdev->md_len; 4626 } 4627 4628 bool 4629 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4630 { 4631 return (bdev->md_len != 0) && bdev->md_interleave; 4632 } 4633 4634 bool 4635 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4636 { 4637 return (bdev->md_len != 0) && !bdev->md_interleave; 4638 } 4639 4640 bool 4641 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4642 { 4643 return bdev->zoned; 4644 } 4645 4646 uint32_t 4647 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4648 { 4649 if (spdk_bdev_is_md_interleaved(bdev)) { 4650 return bdev->blocklen - bdev->md_len; 4651 } else { 4652 return bdev->blocklen; 4653 } 4654 } 4655 4656 uint32_t 4657 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4658 { 4659 return bdev->phys_blocklen; 4660 } 4661 4662 static uint32_t 4663 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4664 { 4665 if (!spdk_bdev_is_md_interleaved(bdev)) { 4666 return bdev->blocklen + bdev->md_len; 4667 } else { 4668 return bdev->blocklen; 4669 } 4670 } 4671 4672 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4673 typedef enum spdk_dif_type spdk_dif_type_t; 4674 4675 spdk_dif_type_t 4676 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4677 { 4678 if (bdev->md_len != 0) { 4679 return bdev->dif_type; 4680 } else { 4681 return SPDK_DIF_DISABLE; 4682 } 4683 } 4684 4685 bool 4686 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4687 { 4688 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4689 return bdev->dif_is_head_of_md; 4690 } else { 4691 return false; 4692 } 4693 } 4694 4695 bool 4696 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4697 enum spdk_dif_check_type check_type) 4698 { 4699 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4700 return false; 4701 } 4702 4703 switch (check_type) { 4704 case SPDK_DIF_CHECK_TYPE_REFTAG: 4705 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4706 case SPDK_DIF_CHECK_TYPE_APPTAG: 4707 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4708 case SPDK_DIF_CHECK_TYPE_GUARD: 4709 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4710 default: 4711 return false; 4712 } 4713 } 4714 4715 static uint32_t 4716 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4717 { 4718 uint64_t aligned_length, max_write_blocks; 4719 4720 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4721 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4722 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4723 4724 return max_write_blocks; 4725 } 4726 4727 uint32_t 4728 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4729 { 4730 return bdev->max_copy; 4731 } 4732 4733 uint64_t 4734 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4735 { 4736 return bdev->internal.measured_queue_depth; 4737 } 4738 4739 uint64_t 4740 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4741 { 4742 return bdev->internal.period; 4743 } 4744 4745 uint64_t 4746 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4747 { 4748 return bdev->internal.weighted_io_time; 4749 } 4750 4751 uint64_t 4752 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4753 { 4754 return bdev->internal.io_time; 4755 } 4756 4757 static void bdev_update_qd_sampling_period(void *ctx); 4758 4759 static void 4760 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4761 { 4762 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4763 4764 if (bdev->internal.measured_queue_depth) { 4765 bdev->internal.io_time += bdev->internal.period; 4766 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4767 } 4768 4769 bdev->internal.qd_poll_in_progress = false; 4770 4771 bdev_update_qd_sampling_period(bdev); 4772 } 4773 4774 static void 4775 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4776 struct spdk_io_channel *io_ch, void *_ctx) 4777 { 4778 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4779 4780 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4781 spdk_bdev_for_each_channel_continue(i, 0); 4782 } 4783 4784 static int 4785 bdev_calculate_measured_queue_depth(void *ctx) 4786 { 4787 struct spdk_bdev *bdev = ctx; 4788 4789 bdev->internal.qd_poll_in_progress = true; 4790 bdev->internal.temporary_queue_depth = 0; 4791 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4792 return SPDK_POLLER_BUSY; 4793 } 4794 4795 static void 4796 bdev_update_qd_sampling_period(void *ctx) 4797 { 4798 
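	/* Apply a pending change to the queue-depth sampling period. The switch is
	 * deferred while a measurement is in flight (qd_poll_in_progress) and is
	 * retried from _calculate_measured_qd_cpl() once it completes; a new
	 * period of 0 stops sampling and closes the internal descriptor. */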
struct spdk_bdev *bdev = ctx; 4799 4800 if (bdev->internal.period == bdev->internal.new_period) { 4801 return; 4802 } 4803 4804 if (bdev->internal.qd_poll_in_progress) { 4805 return; 4806 } 4807 4808 bdev->internal.period = bdev->internal.new_period; 4809 4810 spdk_poller_unregister(&bdev->internal.qd_poller); 4811 if (bdev->internal.period != 0) { 4812 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4813 bdev, bdev->internal.period); 4814 } else { 4815 spdk_bdev_close(bdev->internal.qd_desc); 4816 bdev->internal.qd_desc = NULL; 4817 } 4818 } 4819 4820 static void 4821 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4822 { 4823 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4824 } 4825 4826 void 4827 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4828 { 4829 int rc; 4830 4831 if (bdev->internal.new_period == period) { 4832 return; 4833 } 4834 4835 bdev->internal.new_period = period; 4836 4837 if (bdev->internal.qd_desc != NULL) { 4838 assert(bdev->internal.period != 0); 4839 4840 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4841 bdev_update_qd_sampling_period, bdev); 4842 return; 4843 } 4844 4845 assert(bdev->internal.period == 0); 4846 4847 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4848 NULL, &bdev->internal.qd_desc); 4849 if (rc != 0) { 4850 return; 4851 } 4852 4853 bdev->internal.period = period; 4854 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4855 bdev, period); 4856 } 4857 4858 struct bdev_get_current_qd_ctx { 4859 uint64_t current_qd; 4860 spdk_bdev_get_current_qd_cb cb_fn; 4861 void *cb_arg; 4862 }; 4863 4864 static void 4865 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4866 { 4867 struct bdev_get_current_qd_ctx *ctx = _ctx; 4868 4869 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4870 4871 free(ctx); 4872 } 4873 4874 static void 4875 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4876 struct spdk_io_channel *io_ch, void *_ctx) 4877 { 4878 struct bdev_get_current_qd_ctx *ctx = _ctx; 4879 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4880 4881 ctx->current_qd += bdev_ch->io_outstanding; 4882 4883 spdk_bdev_for_each_channel_continue(i, 0); 4884 } 4885 4886 void 4887 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4888 void *cb_arg) 4889 { 4890 struct bdev_get_current_qd_ctx *ctx; 4891 4892 assert(cb_fn != NULL); 4893 4894 ctx = calloc(1, sizeof(*ctx)); 4895 if (ctx == NULL) { 4896 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4897 return; 4898 } 4899 4900 ctx->cb_fn = cb_fn; 4901 ctx->cb_arg = cb_arg; 4902 4903 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4904 } 4905 4906 static void 4907 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4908 { 4909 assert(desc->thread == spdk_get_thread()); 4910 4911 spdk_spin_lock(&desc->spinlock); 4912 desc->refs--; 4913 if (!desc->closed) { 4914 spdk_spin_unlock(&desc->spinlock); 4915 desc->callback.event_fn(type, 4916 desc->bdev, 4917 desc->callback.ctx); 4918 return; 4919 } else if (desc->refs == 0) { 4920 /* This descriptor was closed after this event_notify message was sent. 4921 * spdk_bdev_close() could not free the descriptor since this message was 4922 * in flight, so we free it now using bdev_desc_free(). 
4923 */ 4924 spdk_spin_unlock(&desc->spinlock); 4925 bdev_desc_free(desc); 4926 return; 4927 } 4928 spdk_spin_unlock(&desc->spinlock); 4929 } 4930 4931 static void 4932 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 4933 { 4934 spdk_spin_lock(&desc->spinlock); 4935 desc->refs++; 4936 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 4937 spdk_spin_unlock(&desc->spinlock); 4938 } 4939 4940 static void 4941 _resize_notify(void *ctx) 4942 { 4943 struct spdk_bdev_desc *desc = ctx; 4944 4945 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 4946 } 4947 4948 int 4949 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4950 { 4951 struct spdk_bdev_desc *desc; 4952 int ret; 4953 4954 if (size == bdev->blockcnt) { 4955 return 0; 4956 } 4957 4958 spdk_spin_lock(&bdev->internal.spinlock); 4959 4960 /* bdev has open descriptors */ 4961 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4962 bdev->blockcnt > size) { 4963 ret = -EBUSY; 4964 } else { 4965 bdev->blockcnt = size; 4966 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4967 event_notify(desc, _resize_notify); 4968 } 4969 ret = 0; 4970 } 4971 4972 spdk_spin_unlock(&bdev->internal.spinlock); 4973 4974 return ret; 4975 } 4976 4977 /* 4978 * Convert I/O offset and length from bytes to blocks. 4979 * 4980 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4981 */ 4982 static uint64_t 4983 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4984 uint64_t num_bytes, uint64_t *num_blocks) 4985 { 4986 uint32_t block_size = bdev->blocklen; 4987 uint8_t shift_cnt; 4988 4989 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 4990 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4991 shift_cnt = spdk_u32log2(block_size); 4992 *offset_blocks = offset_bytes >> shift_cnt; 4993 *num_blocks = num_bytes >> shift_cnt; 4994 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4995 (num_bytes - (*num_blocks << shift_cnt)); 4996 } else { 4997 *offset_blocks = offset_bytes / block_size; 4998 *num_blocks = num_bytes / block_size; 4999 return (offset_bytes % block_size) | (num_bytes % block_size); 5000 } 5001 } 5002 5003 static bool 5004 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5005 { 5006 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5007 * has been an overflow and hence the offset has been wrapped around */ 5008 if (offset_blocks + num_blocks < offset_blocks) { 5009 return false; 5010 } 5011 5012 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5013 if (offset_blocks + num_blocks > bdev->blockcnt) { 5014 return false; 5015 } 5016 5017 return true; 5018 } 5019 5020 static void 5021 bdev_seek_complete_cb(void *ctx) 5022 { 5023 struct spdk_bdev_io *bdev_io = ctx; 5024 5025 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5026 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5027 } 5028 5029 static int 5030 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5031 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5032 spdk_bdev_io_completion_cb cb, void *cb_arg) 5033 { 5034 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5035 struct spdk_bdev_io *bdev_io; 5036 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5037 5038 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == 
SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5039 5040 /* Check if offset_blocks is valid looking at the validity of one block */ 5041 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5042 return -EINVAL; 5043 } 5044 5045 bdev_io = bdev_channel_get_io(channel); 5046 if (!bdev_io) { 5047 return -ENOMEM; 5048 } 5049 5050 bdev_io->internal.ch = channel; 5051 bdev_io->internal.desc = desc; 5052 bdev_io->type = io_type; 5053 bdev_io->u.bdev.offset_blocks = offset_blocks; 5054 bdev_io->u.bdev.memory_domain = NULL; 5055 bdev_io->u.bdev.memory_domain_ctx = NULL; 5056 bdev_io->u.bdev.accel_sequence = NULL; 5057 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5058 5059 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5060 /* In case bdev doesn't support seek to next data/hole offset, 5061 * it is assumed that only data and no holes are present */ 5062 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5063 bdev_io->u.bdev.seek.offset = offset_blocks; 5064 } else { 5065 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5066 } 5067 5068 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5069 return 0; 5070 } 5071 5072 bdev_io_submit(bdev_io); 5073 return 0; 5074 } 5075 5076 int 5077 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5078 uint64_t offset_blocks, 5079 spdk_bdev_io_completion_cb cb, void *cb_arg) 5080 { 5081 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5082 } 5083 5084 int 5085 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5086 uint64_t offset_blocks, 5087 spdk_bdev_io_completion_cb cb, void *cb_arg) 5088 { 5089 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5090 } 5091 5092 uint64_t 5093 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 5094 { 5095 return bdev_io->u.bdev.seek.offset; 5096 } 5097 5098 static int 5099 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5100 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5101 spdk_bdev_io_completion_cb cb, void *cb_arg) 5102 { 5103 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5104 struct spdk_bdev_io *bdev_io; 5105 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5106 5107 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5108 return -EINVAL; 5109 } 5110 5111 bdev_io = bdev_channel_get_io(channel); 5112 if (!bdev_io) { 5113 return -ENOMEM; 5114 } 5115 5116 bdev_io->internal.ch = channel; 5117 bdev_io->internal.desc = desc; 5118 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5119 bdev_io->u.bdev.iovs = &bdev_io->iov; 5120 bdev_io->u.bdev.iovs[0].iov_base = buf; 5121 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5122 bdev_io->u.bdev.iovcnt = 1; 5123 bdev_io->u.bdev.md_buf = md_buf; 5124 bdev_io->u.bdev.num_blocks = num_blocks; 5125 bdev_io->u.bdev.offset_blocks = offset_blocks; 5126 bdev_io->u.bdev.memory_domain = NULL; 5127 bdev_io->u.bdev.memory_domain_ctx = NULL; 5128 bdev_io->u.bdev.accel_sequence = NULL; 5129 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5130 5131 bdev_io_submit(bdev_io); 5132 return 0; 5133 } 5134 5135 int 5136 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5137 void *buf, uint64_t offset, uint64_t nbytes, 5138 spdk_bdev_io_completion_cb cb, void *cb_arg) 5139 { 5140 uint64_t offset_blocks, num_blocks; 5141 5142 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5143 nbytes, &num_blocks) != 0) { 5144 return -EINVAL; 5145 } 5146 5147 
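	/* Illustrative example (values assumed): with a 512-byte block size,
	 * offset = 4096 and nbytes = 8192 convert to offset_blocks = 8 and
	 * num_blocks = 16; byte values that are not block-aligned were rejected
	 * above with -EINVAL. */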
return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5148 } 5149 5150 int 5151 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5152 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5153 spdk_bdev_io_completion_cb cb, void *cb_arg) 5154 { 5155 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5156 } 5157 5158 int 5159 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5160 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5161 spdk_bdev_io_completion_cb cb, void *cb_arg) 5162 { 5163 struct iovec iov = { 5164 .iov_base = buf, 5165 }; 5166 5167 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5168 return -EINVAL; 5169 } 5170 5171 if (md_buf && !_is_buf_allocated(&iov)) { 5172 return -EINVAL; 5173 } 5174 5175 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5176 cb, cb_arg); 5177 } 5178 5179 int 5180 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5181 struct iovec *iov, int iovcnt, 5182 uint64_t offset, uint64_t nbytes, 5183 spdk_bdev_io_completion_cb cb, void *cb_arg) 5184 { 5185 uint64_t offset_blocks, num_blocks; 5186 5187 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5188 nbytes, &num_blocks) != 0) { 5189 return -EINVAL; 5190 } 5191 5192 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5193 } 5194 5195 static int 5196 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5197 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5198 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5199 struct spdk_accel_sequence *seq, 5200 spdk_bdev_io_completion_cb cb, void *cb_arg) 5201 { 5202 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5203 struct spdk_bdev_io *bdev_io; 5204 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5205 5206 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5207 return -EINVAL; 5208 } 5209 5210 bdev_io = bdev_channel_get_io(channel); 5211 if (!bdev_io) { 5212 return -ENOMEM; 5213 } 5214 5215 bdev_io->internal.ch = channel; 5216 bdev_io->internal.desc = desc; 5217 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5218 bdev_io->u.bdev.iovs = iov; 5219 bdev_io->u.bdev.iovcnt = iovcnt; 5220 bdev_io->u.bdev.md_buf = md_buf; 5221 bdev_io->u.bdev.num_blocks = num_blocks; 5222 bdev_io->u.bdev.offset_blocks = offset_blocks; 5223 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5224 bdev_io->internal.memory_domain = domain; 5225 bdev_io->internal.memory_domain_ctx = domain_ctx; 5226 bdev_io->internal.accel_sequence = seq; 5227 bdev_io->u.bdev.memory_domain = domain; 5228 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5229 bdev_io->u.bdev.accel_sequence = seq; 5230 5231 _bdev_io_submit_ext(desc, bdev_io); 5232 5233 return 0; 5234 } 5235 5236 int 5237 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5238 struct iovec *iov, int iovcnt, 5239 uint64_t offset_blocks, uint64_t num_blocks, 5240 spdk_bdev_io_completion_cb cb, void *cb_arg) 5241 { 5242 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5243 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5244 } 5245 5246 int 5247 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5248 struct iovec *iov, int iovcnt, void *md_buf, 5249 uint64_t 
offset_blocks, uint64_t num_blocks, 5250 spdk_bdev_io_completion_cb cb, void *cb_arg) 5251 { 5252 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5253 return -EINVAL; 5254 } 5255 5256 if (md_buf && !_is_buf_allocated(iov)) { 5257 return -EINVAL; 5258 } 5259 5260 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5261 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5262 } 5263 5264 static inline bool 5265 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5266 { 5267 /* 5268 * We check if opts size is at least of size when we first introduced 5269 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5270 * are not checked internal. 5271 */ 5272 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5273 sizeof(opts->metadata) && 5274 opts->size <= sizeof(*opts) && 5275 /* When memory domain is used, the user must provide data buffers */ 5276 (!opts->memory_domain || (iov && iov[0].iov_base)); 5277 } 5278 5279 int 5280 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5281 struct iovec *iov, int iovcnt, 5282 uint64_t offset_blocks, uint64_t num_blocks, 5283 spdk_bdev_io_completion_cb cb, void *cb_arg, 5284 struct spdk_bdev_ext_io_opts *opts) 5285 { 5286 void *md = NULL; 5287 5288 if (opts) { 5289 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5290 return -EINVAL; 5291 } 5292 md = opts->metadata; 5293 } 5294 5295 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5296 return -EINVAL; 5297 } 5298 5299 if (md && !_is_buf_allocated(iov)) { 5300 return -EINVAL; 5301 } 5302 5303 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5304 num_blocks, 5305 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5306 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5307 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5308 cb, cb_arg); 5309 } 5310 5311 static int 5312 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5313 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5314 spdk_bdev_io_completion_cb cb, void *cb_arg) 5315 { 5316 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5317 struct spdk_bdev_io *bdev_io; 5318 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5319 5320 if (!desc->write) { 5321 return -EBADF; 5322 } 5323 5324 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5325 return -EINVAL; 5326 } 5327 5328 bdev_io = bdev_channel_get_io(channel); 5329 if (!bdev_io) { 5330 return -ENOMEM; 5331 } 5332 5333 bdev_io->internal.ch = channel; 5334 bdev_io->internal.desc = desc; 5335 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5336 bdev_io->u.bdev.iovs = &bdev_io->iov; 5337 bdev_io->u.bdev.iovs[0].iov_base = buf; 5338 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5339 bdev_io->u.bdev.iovcnt = 1; 5340 bdev_io->u.bdev.md_buf = md_buf; 5341 bdev_io->u.bdev.num_blocks = num_blocks; 5342 bdev_io->u.bdev.offset_blocks = offset_blocks; 5343 bdev_io->u.bdev.memory_domain = NULL; 5344 bdev_io->u.bdev.memory_domain_ctx = NULL; 5345 bdev_io->u.bdev.accel_sequence = NULL; 5346 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5347 5348 bdev_io_submit(bdev_io); 5349 return 0; 5350 } 5351 5352 int 5353 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5354 void *buf, uint64_t offset, uint64_t nbytes, 5355 spdk_bdev_io_completion_cb cb, void *cb_arg) 5356 { 5357 uint64_t offset_blocks, num_blocks; 5358 5359 if 
(bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5360 nbytes, &num_blocks) != 0) { 5361 return -EINVAL; 5362 } 5363 5364 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5365 } 5366 5367 int 5368 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5369 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5370 spdk_bdev_io_completion_cb cb, void *cb_arg) 5371 { 5372 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5373 cb, cb_arg); 5374 } 5375 5376 int 5377 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5378 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5379 spdk_bdev_io_completion_cb cb, void *cb_arg) 5380 { 5381 struct iovec iov = { 5382 .iov_base = buf, 5383 }; 5384 5385 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5386 return -EINVAL; 5387 } 5388 5389 if (md_buf && !_is_buf_allocated(&iov)) { 5390 return -EINVAL; 5391 } 5392 5393 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5394 cb, cb_arg); 5395 } 5396 5397 static int 5398 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5399 struct iovec *iov, int iovcnt, void *md_buf, 5400 uint64_t offset_blocks, uint64_t num_blocks, 5401 struct spdk_memory_domain *domain, void *domain_ctx, 5402 struct spdk_accel_sequence *seq, 5403 spdk_bdev_io_completion_cb cb, void *cb_arg) 5404 { 5405 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5406 struct spdk_bdev_io *bdev_io; 5407 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5408 5409 if (!desc->write) { 5410 return -EBADF; 5411 } 5412 5413 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5414 return -EINVAL; 5415 } 5416 5417 bdev_io = bdev_channel_get_io(channel); 5418 if (!bdev_io) { 5419 return -ENOMEM; 5420 } 5421 5422 bdev_io->internal.ch = channel; 5423 bdev_io->internal.desc = desc; 5424 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5425 bdev_io->u.bdev.iovs = iov; 5426 bdev_io->u.bdev.iovcnt = iovcnt; 5427 bdev_io->u.bdev.md_buf = md_buf; 5428 bdev_io->u.bdev.num_blocks = num_blocks; 5429 bdev_io->u.bdev.offset_blocks = offset_blocks; 5430 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5431 bdev_io->internal.memory_domain = domain; 5432 bdev_io->internal.memory_domain_ctx = domain_ctx; 5433 bdev_io->internal.accel_sequence = seq; 5434 bdev_io->u.bdev.memory_domain = domain; 5435 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5436 bdev_io->u.bdev.accel_sequence = seq; 5437 5438 _bdev_io_submit_ext(desc, bdev_io); 5439 5440 return 0; 5441 } 5442 5443 int 5444 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5445 struct iovec *iov, int iovcnt, 5446 uint64_t offset, uint64_t len, 5447 spdk_bdev_io_completion_cb cb, void *cb_arg) 5448 { 5449 uint64_t offset_blocks, num_blocks; 5450 5451 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5452 len, &num_blocks) != 0) { 5453 return -EINVAL; 5454 } 5455 5456 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5457 } 5458 5459 int 5460 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5461 struct iovec *iov, int iovcnt, 5462 uint64_t offset_blocks, uint64_t num_blocks, 5463 spdk_bdev_io_completion_cb cb, void *cb_arg) 5464 { 5465 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5466 
num_blocks, NULL, NULL, NULL, cb, cb_arg); 5467 } 5468 5469 int 5470 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5471 struct iovec *iov, int iovcnt, void *md_buf, 5472 uint64_t offset_blocks, uint64_t num_blocks, 5473 spdk_bdev_io_completion_cb cb, void *cb_arg) 5474 { 5475 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5476 return -EINVAL; 5477 } 5478 5479 if (md_buf && !_is_buf_allocated(iov)) { 5480 return -EINVAL; 5481 } 5482 5483 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5484 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5485 } 5486 5487 int 5488 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5489 struct iovec *iov, int iovcnt, 5490 uint64_t offset_blocks, uint64_t num_blocks, 5491 spdk_bdev_io_completion_cb cb, void *cb_arg, 5492 struct spdk_bdev_ext_io_opts *opts) 5493 { 5494 void *md = NULL; 5495 5496 if (opts) { 5497 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5498 return -EINVAL; 5499 } 5500 md = opts->metadata; 5501 } 5502 5503 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5504 return -EINVAL; 5505 } 5506 5507 if (md && !_is_buf_allocated(iov)) { 5508 return -EINVAL; 5509 } 5510 5511 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5512 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5513 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5514 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5515 cb, cb_arg); 5516 } 5517 5518 static void 5519 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5520 { 5521 struct spdk_bdev_io *parent_io = cb_arg; 5522 struct spdk_bdev *bdev = parent_io->bdev; 5523 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5524 int i, rc = 0; 5525 5526 if (!success) { 5527 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5528 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5529 spdk_bdev_free_io(bdev_io); 5530 return; 5531 } 5532 5533 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5534 rc = memcmp(read_buf, 5535 parent_io->u.bdev.iovs[i].iov_base, 5536 parent_io->u.bdev.iovs[i].iov_len); 5537 if (rc) { 5538 break; 5539 } 5540 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5541 } 5542 5543 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5544 rc = memcmp(bdev_io->u.bdev.md_buf, 5545 parent_io->u.bdev.md_buf, 5546 spdk_bdev_get_md_size(bdev)); 5547 } 5548 5549 spdk_bdev_free_io(bdev_io); 5550 5551 if (rc == 0) { 5552 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5553 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5554 } else { 5555 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5556 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5557 } 5558 } 5559 5560 static void 5561 bdev_compare_do_read(void *_bdev_io) 5562 { 5563 struct spdk_bdev_io *bdev_io = _bdev_io; 5564 int rc; 5565 5566 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5567 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5568 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5569 bdev_compare_do_read_done, bdev_io); 5570 5571 if (rc == -ENOMEM) { 5572 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5573 } else if (rc != 0) { 5574 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5575 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5576 } 5577 } 5578 5579 
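/*
 * bdev_compare_do_read()/bdev_compare_do_read_done() above provide a fallback
 * for backends without native SPDK_BDEV_IO_TYPE_COMPARE support: the LBA range
 * is read back into a buffer allocated by the bdev layer (note the NULL buf
 * passed to spdk_bdev_read_blocks()), memcmp()'d against the caller's iovecs
 * and separate metadata buffer (if any), and the parent I/O completes with
 * SPDK_BDEV_IO_STATUS_MISCOMPARE on a mismatch.
 *
 * Minimal caller-side sketch (desc, ch, buf and the callback are assumptions
 * for illustration only):
 *
 *	static void
 *	compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		if (!success) {
 *			SPDK_NOTICELOG("compare failed or data miscompared\n");
 *		}
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	rc = spdk_bdev_compare_blocks(desc, ch, buf, 0, 8, compare_done, NULL);
 */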
static int 5580 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5581 struct iovec *iov, int iovcnt, void *md_buf, 5582 uint64_t offset_blocks, uint64_t num_blocks, 5583 spdk_bdev_io_completion_cb cb, void *cb_arg) 5584 { 5585 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5586 struct spdk_bdev_io *bdev_io; 5587 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5588 5589 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5590 return -EINVAL; 5591 } 5592 5593 bdev_io = bdev_channel_get_io(channel); 5594 if (!bdev_io) { 5595 return -ENOMEM; 5596 } 5597 5598 bdev_io->internal.ch = channel; 5599 bdev_io->internal.desc = desc; 5600 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5601 bdev_io->u.bdev.iovs = iov; 5602 bdev_io->u.bdev.iovcnt = iovcnt; 5603 bdev_io->u.bdev.md_buf = md_buf; 5604 bdev_io->u.bdev.num_blocks = num_blocks; 5605 bdev_io->u.bdev.offset_blocks = offset_blocks; 5606 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5607 bdev_io->u.bdev.memory_domain = NULL; 5608 bdev_io->u.bdev.memory_domain_ctx = NULL; 5609 bdev_io->u.bdev.accel_sequence = NULL; 5610 5611 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5612 bdev_io_submit(bdev_io); 5613 return 0; 5614 } 5615 5616 bdev_compare_do_read(bdev_io); 5617 5618 return 0; 5619 } 5620 5621 int 5622 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5623 struct iovec *iov, int iovcnt, 5624 uint64_t offset_blocks, uint64_t num_blocks, 5625 spdk_bdev_io_completion_cb cb, void *cb_arg) 5626 { 5627 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5628 num_blocks, cb, cb_arg); 5629 } 5630 5631 int 5632 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5633 struct iovec *iov, int iovcnt, void *md_buf, 5634 uint64_t offset_blocks, uint64_t num_blocks, 5635 spdk_bdev_io_completion_cb cb, void *cb_arg) 5636 { 5637 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5638 return -EINVAL; 5639 } 5640 5641 if (md_buf && !_is_buf_allocated(iov)) { 5642 return -EINVAL; 5643 } 5644 5645 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5646 num_blocks, cb, cb_arg); 5647 } 5648 5649 static int 5650 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5651 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5652 spdk_bdev_io_completion_cb cb, void *cb_arg) 5653 { 5654 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5655 struct spdk_bdev_io *bdev_io; 5656 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5657 5658 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5659 return -EINVAL; 5660 } 5661 5662 bdev_io = bdev_channel_get_io(channel); 5663 if (!bdev_io) { 5664 return -ENOMEM; 5665 } 5666 5667 bdev_io->internal.ch = channel; 5668 bdev_io->internal.desc = desc; 5669 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5670 bdev_io->u.bdev.iovs = &bdev_io->iov; 5671 bdev_io->u.bdev.iovs[0].iov_base = buf; 5672 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5673 bdev_io->u.bdev.iovcnt = 1; 5674 bdev_io->u.bdev.md_buf = md_buf; 5675 bdev_io->u.bdev.num_blocks = num_blocks; 5676 bdev_io->u.bdev.offset_blocks = offset_blocks; 5677 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5678 bdev_io->u.bdev.memory_domain = NULL; 5679 bdev_io->u.bdev.memory_domain_ctx = NULL; 5680 bdev_io->u.bdev.accel_sequence = NULL; 5681 5682 if (bdev_io_type_supported(bdev, 
SPDK_BDEV_IO_TYPE_COMPARE)) { 5683 bdev_io_submit(bdev_io); 5684 return 0; 5685 } 5686 5687 bdev_compare_do_read(bdev_io); 5688 5689 return 0; 5690 } 5691 5692 int 5693 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5694 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5695 spdk_bdev_io_completion_cb cb, void *cb_arg) 5696 { 5697 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5698 cb, cb_arg); 5699 } 5700 5701 int 5702 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5703 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5704 spdk_bdev_io_completion_cb cb, void *cb_arg) 5705 { 5706 struct iovec iov = { 5707 .iov_base = buf, 5708 }; 5709 5710 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5711 return -EINVAL; 5712 } 5713 5714 if (md_buf && !_is_buf_allocated(&iov)) { 5715 return -EINVAL; 5716 } 5717 5718 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5719 cb, cb_arg); 5720 } 5721 5722 static void 5723 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 5724 { 5725 struct spdk_bdev_io *bdev_io = ctx; 5726 5727 if (unlock_status) { 5728 SPDK_ERRLOG("LBA range unlock failed\n"); 5729 } 5730 5731 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5732 false, bdev_io->internal.caller_ctx); 5733 } 5734 5735 static void 5736 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5737 { 5738 bdev_io->internal.status = status; 5739 5740 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5741 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5742 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5743 } 5744 5745 static void 5746 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5747 { 5748 struct spdk_bdev_io *parent_io = cb_arg; 5749 5750 if (!success) { 5751 SPDK_ERRLOG("Compare and write operation failed\n"); 5752 } 5753 5754 spdk_bdev_free_io(bdev_io); 5755 5756 bdev_comparev_and_writev_blocks_unlock(parent_io, 5757 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5758 } 5759 5760 static void 5761 bdev_compare_and_write_do_write(void *_bdev_io) 5762 { 5763 struct spdk_bdev_io *bdev_io = _bdev_io; 5764 int rc; 5765 5766 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5767 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5768 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5769 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5770 bdev_compare_and_write_do_write_done, bdev_io); 5771 5772 5773 if (rc == -ENOMEM) { 5774 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5775 } else if (rc != 0) { 5776 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5777 } 5778 } 5779 5780 static void 5781 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5782 { 5783 struct spdk_bdev_io *parent_io = cb_arg; 5784 5785 spdk_bdev_free_io(bdev_io); 5786 5787 if (!success) { 5788 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5789 return; 5790 } 5791 5792 bdev_compare_and_write_do_write(parent_io); 5793 } 5794 5795 static void 5796 bdev_compare_and_write_do_compare(void *_bdev_io) 5797 { 5798 struct spdk_bdev_io *bdev_io = _bdev_io; 5799 int rc; 5800 5801 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5802 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5803 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5804 bdev_compare_and_write_do_compare_done, bdev_io); 5805 5806 if (rc == -ENOMEM) { 5807 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5808 } else if (rc != 0) { 5809 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5810 } 5811 } 5812 5813 static void 5814 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 5815 { 5816 struct spdk_bdev_io *bdev_io = ctx; 5817 5818 if (status) { 5819 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5820 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5821 return; 5822 } 5823 5824 bdev_compare_and_write_do_compare(bdev_io); 5825 } 5826 5827 int 5828 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5829 struct iovec *compare_iov, int compare_iovcnt, 5830 struct iovec *write_iov, int write_iovcnt, 5831 uint64_t offset_blocks, uint64_t num_blocks, 5832 spdk_bdev_io_completion_cb cb, void *cb_arg) 5833 { 5834 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5835 struct spdk_bdev_io *bdev_io; 5836 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5837 5838 if (!desc->write) { 5839 return -EBADF; 5840 } 5841 5842 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5843 return -EINVAL; 5844 } 5845 5846 if (num_blocks > bdev->acwu) { 5847 return -EINVAL; 5848 } 5849 5850 bdev_io = bdev_channel_get_io(channel); 5851 if (!bdev_io) { 5852 return -ENOMEM; 5853 } 5854 5855 bdev_io->internal.ch = channel; 5856 bdev_io->internal.desc = desc; 5857 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5858 bdev_io->u.bdev.iovs = compare_iov; 5859 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5860 bdev_io->u.bdev.fused_iovs = write_iov; 5861 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5862 bdev_io->u.bdev.md_buf = NULL; 5863 bdev_io->u.bdev.num_blocks = num_blocks; 5864 bdev_io->u.bdev.offset_blocks = offset_blocks; 5865 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5866 
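        /* This API takes no ext-IO options, so no memory domain or accel sequence is
         * attached below. If the backing module does not support a native
         * COMPARE_AND_WRITE, the request is emulated further down by locking the LBA
         * range and issuing a compare followed by a write.
         */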
bdev_io->u.bdev.memory_domain = NULL; 5867 bdev_io->u.bdev.memory_domain_ctx = NULL; 5868 bdev_io->u.bdev.accel_sequence = NULL; 5869 5870 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5871 bdev_io_submit(bdev_io); 5872 return 0; 5873 } 5874 5875 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5876 bdev_comparev_and_writev_blocks_locked, bdev_io); 5877 } 5878 5879 int 5880 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5881 struct iovec *iov, int iovcnt, 5882 uint64_t offset_blocks, uint64_t num_blocks, 5883 bool populate, 5884 spdk_bdev_io_completion_cb cb, void *cb_arg) 5885 { 5886 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5887 struct spdk_bdev_io *bdev_io; 5888 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5889 5890 if (!desc->write) { 5891 return -EBADF; 5892 } 5893 5894 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5895 return -EINVAL; 5896 } 5897 5898 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5899 return -ENOTSUP; 5900 } 5901 5902 bdev_io = bdev_channel_get_io(channel); 5903 if (!bdev_io) { 5904 return -ENOMEM; 5905 } 5906 5907 bdev_io->internal.ch = channel; 5908 bdev_io->internal.desc = desc; 5909 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5910 bdev_io->u.bdev.num_blocks = num_blocks; 5911 bdev_io->u.bdev.offset_blocks = offset_blocks; 5912 bdev_io->u.bdev.iovs = iov; 5913 bdev_io->u.bdev.iovcnt = iovcnt; 5914 bdev_io->u.bdev.md_buf = NULL; 5915 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5916 bdev_io->u.bdev.zcopy.commit = 0; 5917 bdev_io->u.bdev.zcopy.start = 1; 5918 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5919 bdev_io->u.bdev.memory_domain = NULL; 5920 bdev_io->u.bdev.memory_domain_ctx = NULL; 5921 bdev_io->u.bdev.accel_sequence = NULL; 5922 5923 bdev_io_submit(bdev_io); 5924 5925 return 0; 5926 } 5927 5928 int 5929 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5930 spdk_bdev_io_completion_cb cb, void *cb_arg) 5931 { 5932 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5933 return -EINVAL; 5934 } 5935 5936 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0;
5937         bdev_io->u.bdev.zcopy.start = 0;
5938         bdev_io->internal.caller_ctx = cb_arg;
5939         bdev_io->internal.cb = cb;
5940         bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
5941 
5942         bdev_io_submit(bdev_io);
5943 
5944         return 0;
5945 }
5946 
5947 int
5948 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5949                        uint64_t offset, uint64_t len,
5950                        spdk_bdev_io_completion_cb cb, void *cb_arg)
5951 {
5952         uint64_t offset_blocks, num_blocks;
5953 
5954         if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5955                                  len, &num_blocks) != 0) {
5956                 return -EINVAL;
5957         }
5958 
5959         return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
5960 }
5961 
5962 int
5963 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5964                               uint64_t offset_blocks, uint64_t num_blocks,
5965                               spdk_bdev_io_completion_cb cb, void *cb_arg)
5966 {
5967         struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5968         struct spdk_bdev_io *bdev_io;
5969         struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5970 
5971         if (!desc->write) {
5972                 return -EBADF;
5973         }
5974 
5975         if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5976                 return -EINVAL;
5977         }
5978 
5979         if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
5980             !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
5981                 return -ENOTSUP;
5982         }
5983 
5984         bdev_io = bdev_channel_get_io(channel);
5985 
5986         if (!bdev_io) {
5987                 return -ENOMEM;
5988         }
5989 
5990         bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
5991         bdev_io->internal.ch = channel;
5992         bdev_io->internal.desc = desc;
5993         bdev_io->u.bdev.offset_blocks = offset_blocks;
5994         bdev_io->u.bdev.num_blocks = num_blocks;
5995         bdev_io_init(bdev_io, bdev, cb_arg, cb);
5996         bdev_io->u.bdev.memory_domain = NULL;
5997         bdev_io->u.bdev.memory_domain_ctx = NULL;
5998         bdev_io->u.bdev.accel_sequence = NULL;
5999 
6000         /* If the write_zeroes size is large and should be split, use the generic split
6001          * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not.
6002          *
6003          * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
6004          * or emulate it using a regular write request otherwise.
6005 */ 6006 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6007 bdev_io->internal.split) { 6008 bdev_io_submit(bdev_io); 6009 return 0; 6010 } 6011 6012 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6013 6014 return bdev_write_zero_buffer(bdev_io); 6015 } 6016 6017 int 6018 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6019 uint64_t offset, uint64_t nbytes, 6020 spdk_bdev_io_completion_cb cb, void *cb_arg) 6021 { 6022 uint64_t offset_blocks, num_blocks; 6023 6024 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6025 nbytes, &num_blocks) != 0) { 6026 return -EINVAL; 6027 } 6028 6029 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6030 } 6031 6032 int 6033 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6034 uint64_t offset_blocks, uint64_t num_blocks, 6035 spdk_bdev_io_completion_cb cb, void *cb_arg) 6036 { 6037 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6038 struct spdk_bdev_io *bdev_io; 6039 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6040 6041 if (!desc->write) { 6042 return -EBADF; 6043 } 6044 6045 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6046 return -EINVAL; 6047 } 6048 6049 if (num_blocks == 0) { 6050 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 6051 return -EINVAL; 6052 } 6053 6054 bdev_io = bdev_channel_get_io(channel); 6055 if (!bdev_io) { 6056 return -ENOMEM; 6057 } 6058 6059 bdev_io->internal.ch = channel; 6060 bdev_io->internal.desc = desc; 6061 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6062 6063 bdev_io->u.bdev.iovs = &bdev_io->iov; 6064 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6065 bdev_io->u.bdev.iovs[0].iov_len = 0; 6066 bdev_io->u.bdev.iovcnt = 1; 6067 6068 bdev_io->u.bdev.offset_blocks = offset_blocks; 6069 bdev_io->u.bdev.num_blocks = num_blocks; 6070 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6071 bdev_io->u.bdev.memory_domain = NULL; 6072 bdev_io->u.bdev.memory_domain_ctx = NULL; 6073 bdev_io->u.bdev.accel_sequence = NULL; 6074 6075 bdev_io_submit(bdev_io); 6076 return 0; 6077 } 6078 6079 int 6080 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6081 uint64_t offset, uint64_t length, 6082 spdk_bdev_io_completion_cb cb, void *cb_arg) 6083 { 6084 uint64_t offset_blocks, num_blocks; 6085 6086 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6087 length, &num_blocks) != 0) { 6088 return -EINVAL; 6089 } 6090 6091 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6092 } 6093 6094 int 6095 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6096 uint64_t offset_blocks, uint64_t num_blocks, 6097 spdk_bdev_io_completion_cb cb, void *cb_arg) 6098 { 6099 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6100 struct spdk_bdev_io *bdev_io; 6101 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6102 6103 if (!desc->write) { 6104 return -EBADF; 6105 } 6106 6107 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6108 return -EINVAL; 6109 } 6110 6111 bdev_io = bdev_channel_get_io(channel); 6112 if (!bdev_io) { 6113 return -ENOMEM; 6114 } 6115 6116 bdev_io->internal.ch = channel; 6117 bdev_io->internal.desc = desc; 6118 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6119 bdev_io->u.bdev.iovs = NULL; 6120 bdev_io->u.bdev.iovcnt = 0; 6121 bdev_io->u.bdev.offset_blocks = offset_blocks; 6122 bdev_io->u.bdev.num_blocks = num_blocks; 6123 
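        /* A flush carries no data buffer, so no iovec, memory domain or accel
         * sequence is attached; only the block range is passed down to the module.
         */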
bdev_io->u.bdev.memory_domain = NULL;
6124         bdev_io->u.bdev.memory_domain_ctx = NULL;
6125         bdev_io->u.bdev.accel_sequence = NULL;
6126         bdev_io_init(bdev_io, bdev, cb_arg, cb);
6127 
6128         bdev_io_submit(bdev_io);
6129         return 0;
6130 }
6131 
6132 static int bdev_reset_poll_for_outstanding_io(void *ctx);
6133 
6134 static void
6135 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
6136 {
6137         struct spdk_bdev_channel *ch = _ctx;
6138         struct spdk_bdev_io *bdev_io;
6139 
6140         bdev_io = TAILQ_FIRST(&ch->queued_resets);
6141 
6142         if (status == -EBUSY) {
6143                 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
6144                         bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
6145                                         ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
6146                 } else {
6147                         TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6148 
6149                         if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) {
6150                                 /* If outstanding IOs are still present and reset_io_drain_timeout
6151                                  * seconds have passed, start the reset. */
6152                                 bdev_io_submit_reset(bdev_io);
6153                         } else {
6154                                 /* We still have an in-progress memory domain pull/push or we are
6155                                  * executing an accel sequence. Since we cannot abort either of those
6156                                  * operations, fail the reset request. */
6157                                 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
6158                         }
6159                 }
6160         } else {
6161                 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6162                 SPDK_DEBUGLOG(bdev,
6163                               "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
6164                               ch->bdev->name);
6165                 /* Mark the completion status as SUCCESS and complete the reset. */
6166                 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
6167         }
6168 }
6169 
6170 static void
6171 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6172                                 struct spdk_io_channel *io_ch, void *_ctx)
6173 {
6174         struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch);
6175         int status = 0;
6176 
6177         if (cur_ch->io_outstanding > 0 ||
6178             !TAILQ_EMPTY(&cur_ch->io_memory_domain) ||
6179             !TAILQ_EMPTY(&cur_ch->io_accel_exec)) {
6180                 /* If a channel has outstanding IO, set status to -EBUSY. This will stop
6181                  * further iteration over the rest of the channels and pass a non-zero status
6182                  * to the callback function.
*/ 6183 status = -EBUSY; 6184 } 6185 spdk_bdev_for_each_channel_continue(i, status); 6186 } 6187 6188 static int 6189 bdev_reset_poll_for_outstanding_io(void *ctx) 6190 { 6191 struct spdk_bdev_channel *ch = ctx; 6192 struct spdk_bdev_io *bdev_io; 6193 6194 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6195 6196 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6197 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6198 bdev_reset_check_outstanding_io_done); 6199 6200 return SPDK_POLLER_BUSY; 6201 } 6202 6203 static void 6204 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6205 { 6206 struct spdk_bdev_channel *ch = _ctx; 6207 struct spdk_bdev_io *bdev_io; 6208 6209 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6210 6211 if (bdev->reset_io_drain_timeout == 0) { 6212 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6213 6214 bdev_io_submit_reset(bdev_io); 6215 return; 6216 } 6217 6218 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6219 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6220 6221 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6222 * submit the reset to the underlying module only if outstanding I/O 6223 * remain after reset_io_drain_timeout seconds have passed. */ 6224 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6225 bdev_reset_check_outstanding_io_done); 6226 } 6227 6228 static void 6229 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6230 struct spdk_io_channel *ch, void *_ctx) 6231 { 6232 struct spdk_bdev_channel *channel; 6233 struct spdk_bdev_mgmt_channel *mgmt_channel; 6234 struct spdk_bdev_shared_resource *shared_resource; 6235 bdev_io_tailq_t tmp_queued; 6236 6237 TAILQ_INIT(&tmp_queued); 6238 6239 channel = __io_ch_to_bdev_ch(ch); 6240 shared_resource = channel->shared_resource; 6241 mgmt_channel = shared_resource->mgmt_ch; 6242 6243 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6244 6245 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6246 /* The QoS object is always valid and readable while 6247 * the channel flag is set, so the lock here should not 6248 * be necessary. We're not in the fast path though, so 6249 * just take it anyway. */ 6250 spdk_spin_lock(&channel->bdev->internal.spinlock); 6251 if (channel->bdev->internal.qos->ch == channel) { 6252 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 6253 } 6254 spdk_spin_unlock(&channel->bdev->internal.spinlock); 6255 } 6256 6257 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6258 bdev_abort_all_buf_io(mgmt_channel, channel); 6259 bdev_abort_all_queued_io(&tmp_queued, channel); 6260 6261 spdk_bdev_for_each_channel_continue(i, 0); 6262 } 6263 6264 static void 6265 bdev_start_reset(void *ctx) 6266 { 6267 struct spdk_bdev_channel *ch = ctx; 6268 6269 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6270 bdev_reset_freeze_channel_done); 6271 } 6272 6273 static void 6274 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6275 { 6276 struct spdk_bdev *bdev = ch->bdev; 6277 6278 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6279 6280 spdk_spin_lock(&bdev->internal.spinlock); 6281 if (bdev->internal.reset_in_progress == NULL) { 6282 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6283 /* 6284 * Take a channel reference for the target bdev for the life of this 6285 * reset. 
This guards against the channel getting destroyed while 6286 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6287 * progress. We will release the reference when this reset is 6288 * completed. 6289 */ 6290 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6291 bdev_start_reset(ch); 6292 } 6293 spdk_spin_unlock(&bdev->internal.spinlock); 6294 } 6295 6296 int 6297 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6298 spdk_bdev_io_completion_cb cb, void *cb_arg) 6299 { 6300 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6301 struct spdk_bdev_io *bdev_io; 6302 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6303 6304 bdev_io = bdev_channel_get_io(channel); 6305 if (!bdev_io) { 6306 return -ENOMEM; 6307 } 6308 6309 bdev_io->internal.ch = channel; 6310 bdev_io->internal.desc = desc; 6311 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6312 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6313 bdev_io->u.reset.ch_ref = NULL; 6314 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6315 6316 spdk_spin_lock(&bdev->internal.spinlock); 6317 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6318 spdk_spin_unlock(&bdev->internal.spinlock); 6319 6320 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6321 internal.ch_link); 6322 6323 bdev_channel_start_reset(channel); 6324 6325 return 0; 6326 } 6327 6328 void 6329 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6330 struct spdk_bdev_io_stat *stat) 6331 { 6332 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6333 6334 bdev_get_io_stat(stat, channel->stat); 6335 } 6336 6337 static void 6338 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6339 { 6340 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6341 6342 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6343 bdev_iostat_ctx->cb_arg, 0); 6344 free(bdev_iostat_ctx); 6345 } 6346 6347 static void 6348 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6349 struct spdk_io_channel *ch, void *_ctx) 6350 { 6351 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6352 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6353 6354 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6355 spdk_bdev_for_each_channel_continue(i, 0); 6356 } 6357 6358 void 6359 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6360 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6361 { 6362 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6363 6364 assert(bdev != NULL); 6365 assert(stat != NULL); 6366 assert(cb != NULL); 6367 6368 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6369 if (bdev_iostat_ctx == NULL) { 6370 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6371 cb(bdev, stat, cb_arg, -ENOMEM); 6372 return; 6373 } 6374 6375 bdev_iostat_ctx->stat = stat; 6376 bdev_iostat_ctx->cb = cb; 6377 bdev_iostat_ctx->cb_arg = cb_arg; 6378 6379 /* Start with the statistics from previously deleted channels. */ 6380 spdk_spin_lock(&bdev->internal.spinlock); 6381 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6382 spdk_spin_unlock(&bdev->internal.spinlock); 6383 6384 /* Then iterate and add the statistics from each existing channel. 
*/ 6385 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6386 bdev_get_device_stat_done); 6387 } 6388 6389 struct bdev_iostat_reset_ctx { 6390 enum spdk_bdev_reset_stat_mode mode; 6391 bdev_reset_device_stat_cb cb; 6392 void *cb_arg; 6393 }; 6394 6395 static void 6396 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6397 { 6398 struct bdev_iostat_reset_ctx *ctx = _ctx; 6399 6400 ctx->cb(bdev, ctx->cb_arg, 0); 6401 6402 free(ctx); 6403 } 6404 6405 static void 6406 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6407 struct spdk_io_channel *ch, void *_ctx) 6408 { 6409 struct bdev_iostat_reset_ctx *ctx = _ctx; 6410 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6411 6412 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6413 6414 spdk_bdev_for_each_channel_continue(i, 0); 6415 } 6416 6417 void 6418 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6419 bdev_reset_device_stat_cb cb, void *cb_arg) 6420 { 6421 struct bdev_iostat_reset_ctx *ctx; 6422 6423 assert(bdev != NULL); 6424 assert(cb != NULL); 6425 6426 ctx = calloc(1, sizeof(*ctx)); 6427 if (ctx == NULL) { 6428 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6429 cb(bdev, cb_arg, -ENOMEM); 6430 return; 6431 } 6432 6433 ctx->mode = mode; 6434 ctx->cb = cb; 6435 ctx->cb_arg = cb_arg; 6436 6437 spdk_spin_lock(&bdev->internal.spinlock); 6438 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6439 spdk_spin_unlock(&bdev->internal.spinlock); 6440 6441 spdk_bdev_for_each_channel(bdev, 6442 bdev_reset_each_channel_stat, 6443 ctx, 6444 bdev_reset_device_stat_done); 6445 } 6446 6447 int 6448 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6449 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6450 spdk_bdev_io_completion_cb cb, void *cb_arg) 6451 { 6452 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6453 struct spdk_bdev_io *bdev_io; 6454 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6455 6456 if (!desc->write) { 6457 return -EBADF; 6458 } 6459 6460 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6461 return -ENOTSUP; 6462 } 6463 6464 bdev_io = bdev_channel_get_io(channel); 6465 if (!bdev_io) { 6466 return -ENOMEM; 6467 } 6468 6469 bdev_io->internal.ch = channel; 6470 bdev_io->internal.desc = desc; 6471 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6472 bdev_io->u.nvme_passthru.cmd = *cmd; 6473 bdev_io->u.nvme_passthru.buf = buf; 6474 bdev_io->u.nvme_passthru.nbytes = nbytes; 6475 bdev_io->u.nvme_passthru.md_buf = NULL; 6476 bdev_io->u.nvme_passthru.md_len = 0; 6477 6478 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6479 6480 bdev_io_submit(bdev_io); 6481 return 0; 6482 } 6483 6484 int 6485 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6486 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6487 spdk_bdev_io_completion_cb cb, void *cb_arg) 6488 { 6489 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6490 struct spdk_bdev_io *bdev_io; 6491 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6492 6493 if (!desc->write) { 6494 /* 6495 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6496 * to easily determine if the command is a read or write, but for now just 6497 * do not allow io_passthru with a read-only descriptor. 
6498 */ 6499 return -EBADF; 6500 } 6501 6502 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6503 return -ENOTSUP; 6504 } 6505 6506 bdev_io = bdev_channel_get_io(channel); 6507 if (!bdev_io) { 6508 return -ENOMEM; 6509 } 6510 6511 bdev_io->internal.ch = channel; 6512 bdev_io->internal.desc = desc; 6513 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6514 bdev_io->u.nvme_passthru.cmd = *cmd; 6515 bdev_io->u.nvme_passthru.buf = buf; 6516 bdev_io->u.nvme_passthru.nbytes = nbytes; 6517 bdev_io->u.nvme_passthru.md_buf = NULL; 6518 bdev_io->u.nvme_passthru.md_len = 0; 6519 6520 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6521 6522 bdev_io_submit(bdev_io); 6523 return 0; 6524 } 6525 6526 int 6527 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6528 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6529 spdk_bdev_io_completion_cb cb, void *cb_arg) 6530 { 6531 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6532 struct spdk_bdev_io *bdev_io; 6533 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6534 6535 if (!desc->write) { 6536 /* 6537 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6538 * to easily determine if the command is a read or write, but for now just 6539 * do not allow io_passthru with a read-only descriptor. 6540 */ 6541 return -EBADF; 6542 } 6543 6544 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6545 return -ENOTSUP; 6546 } 6547 6548 bdev_io = bdev_channel_get_io(channel); 6549 if (!bdev_io) { 6550 return -ENOMEM; 6551 } 6552 6553 bdev_io->internal.ch = channel; 6554 bdev_io->internal.desc = desc; 6555 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6556 bdev_io->u.nvme_passthru.cmd = *cmd; 6557 bdev_io->u.nvme_passthru.buf = buf; 6558 bdev_io->u.nvme_passthru.nbytes = nbytes; 6559 bdev_io->u.nvme_passthru.md_buf = md_buf; 6560 bdev_io->u.nvme_passthru.md_len = md_len; 6561 6562 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6563 6564 bdev_io_submit(bdev_io); 6565 return 0; 6566 } 6567 6568 static void bdev_abort_retry(void *ctx); 6569 static void bdev_abort(struct spdk_bdev_io *parent_io); 6570 6571 static void 6572 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6573 { 6574 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6575 struct spdk_bdev_io *parent_io = cb_arg; 6576 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6577 6578 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6579 6580 spdk_bdev_free_io(bdev_io); 6581 6582 if (!success) { 6583 /* Check if the target I/O completed in the meantime. */ 6584 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6585 if (tmp_io == bio_to_abort) { 6586 break; 6587 } 6588 } 6589 6590 /* If the target I/O still exists, set the parent to failed. 
*/ 6591 if (tmp_io != NULL) { 6592 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6593 } 6594 } 6595 6596 parent_io->u.bdev.split_outstanding--; 6597 if (parent_io->u.bdev.split_outstanding == 0) { 6598 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6599 bdev_abort_retry(parent_io); 6600 } else { 6601 bdev_io_complete(parent_io); 6602 } 6603 } 6604 } 6605 6606 static int 6607 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6608 struct spdk_bdev_io *bio_to_abort, 6609 spdk_bdev_io_completion_cb cb, void *cb_arg) 6610 { 6611 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6612 struct spdk_bdev_io *bdev_io; 6613 6614 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6615 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6616 /* TODO: Abort reset or abort request. */ 6617 return -ENOTSUP; 6618 } 6619 6620 bdev_io = bdev_channel_get_io(channel); 6621 if (bdev_io == NULL) { 6622 return -ENOMEM; 6623 } 6624 6625 bdev_io->internal.ch = channel; 6626 bdev_io->internal.desc = desc; 6627 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6628 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6629 6630 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6631 assert(bdev_io_should_split(bio_to_abort)); 6632 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6633 6634 /* Parent abort request is not submitted directly, but to manage its 6635 * execution add it to the submitted list here. 6636 */ 6637 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6638 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6639 6640 bdev_abort(bdev_io); 6641 6642 return 0; 6643 } 6644 6645 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6646 6647 /* Submit the abort request to the underlying bdev module. */ 6648 bdev_io_submit(bdev_io); 6649 6650 return 0; 6651 } 6652 6653 static bool 6654 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6655 { 6656 struct spdk_bdev_io *iter; 6657 6658 TAILQ_FOREACH(iter, tailq, internal.link) { 6659 if (iter == bdev_io) { 6660 return true; 6661 } 6662 } 6663 6664 return false; 6665 } 6666 6667 static uint32_t 6668 _bdev_abort(struct spdk_bdev_io *parent_io) 6669 { 6670 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6671 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6672 void *bio_cb_arg; 6673 struct spdk_bdev_io *bio_to_abort; 6674 uint32_t matched_ios; 6675 int rc; 6676 6677 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6678 6679 /* matched_ios is returned and will be kept by the caller. 6680 * 6681 * This function will be used for two cases, 1) the same cb_arg is used for 6682 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6683 * Incrementing split_outstanding directly here may confuse readers especially 6684 * for the 1st case. 6685 * 6686 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6687 * works as expected. 6688 */ 6689 matched_ios = 0; 6690 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6691 6692 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6693 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6694 continue; 6695 } 6696 6697 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6698 /* Any I/O which was submitted after this abort command should be excluded. 
*/ 6699 continue; 6700 } 6701 6702 /* We can't abort a request that's being pushed/pulled or executed by accel */ 6703 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 6704 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 6705 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6706 break; 6707 } 6708 6709 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6710 if (rc != 0) { 6711 if (rc == -ENOMEM) { 6712 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6713 } else { 6714 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6715 } 6716 break; 6717 } 6718 matched_ios++; 6719 } 6720 6721 return matched_ios; 6722 } 6723 6724 static void 6725 bdev_abort_retry(void *ctx) 6726 { 6727 struct spdk_bdev_io *parent_io = ctx; 6728 uint32_t matched_ios; 6729 6730 matched_ios = _bdev_abort(parent_io); 6731 6732 if (matched_ios == 0) { 6733 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6734 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6735 } else { 6736 /* For retry, the case that no target I/O was found is success 6737 * because it means target I/Os completed in the meantime. 6738 */ 6739 bdev_io_complete(parent_io); 6740 } 6741 return; 6742 } 6743 6744 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6745 parent_io->u.bdev.split_outstanding = matched_ios; 6746 } 6747 6748 static void 6749 bdev_abort(struct spdk_bdev_io *parent_io) 6750 { 6751 uint32_t matched_ios; 6752 6753 matched_ios = _bdev_abort(parent_io); 6754 6755 if (matched_ios == 0) { 6756 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6757 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6758 } else { 6759 /* The case the no target I/O was found is failure. */ 6760 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6761 bdev_io_complete(parent_io); 6762 } 6763 return; 6764 } 6765 6766 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6767 parent_io->u.bdev.split_outstanding = matched_ios; 6768 } 6769 6770 int 6771 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6772 void *bio_cb_arg, 6773 spdk_bdev_io_completion_cb cb, void *cb_arg) 6774 { 6775 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6776 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6777 struct spdk_bdev_io *bdev_io; 6778 6779 if (bio_cb_arg == NULL) { 6780 return -EINVAL; 6781 } 6782 6783 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6784 return -ENOTSUP; 6785 } 6786 6787 bdev_io = bdev_channel_get_io(channel); 6788 if (bdev_io == NULL) { 6789 return -ENOMEM; 6790 } 6791 6792 bdev_io->internal.ch = channel; 6793 bdev_io->internal.desc = desc; 6794 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6795 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6796 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6797 6798 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6799 6800 /* Parent abort request is not submitted directly, but to manage its execution, 6801 * add it to the submitted list here. 
6802 */ 6803 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6804 6805 bdev_abort(bdev_io); 6806 6807 return 0; 6808 } 6809 6810 int 6811 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6812 struct spdk_bdev_io_wait_entry *entry) 6813 { 6814 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6815 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6816 6817 if (bdev != entry->bdev) { 6818 SPDK_ERRLOG("bdevs do not match\n"); 6819 return -EINVAL; 6820 } 6821 6822 if (mgmt_ch->per_thread_cache_count > 0) { 6823 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6824 return -EINVAL; 6825 } 6826 6827 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6828 return 0; 6829 } 6830 6831 static inline void 6832 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6833 { 6834 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6835 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6836 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6837 uint32_t blocklen = bdev_io->bdev->blocklen; 6838 6839 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6840 switch (bdev_io->type) { 6841 case SPDK_BDEV_IO_TYPE_READ: 6842 io_stat->bytes_read += num_blocks * blocklen; 6843 io_stat->num_read_ops++; 6844 io_stat->read_latency_ticks += tsc_diff; 6845 if (io_stat->max_read_latency_ticks < tsc_diff) { 6846 io_stat->max_read_latency_ticks = tsc_diff; 6847 } 6848 if (io_stat->min_read_latency_ticks > tsc_diff) { 6849 io_stat->min_read_latency_ticks = tsc_diff; 6850 } 6851 break; 6852 case SPDK_BDEV_IO_TYPE_WRITE: 6853 io_stat->bytes_written += num_blocks * blocklen; 6854 io_stat->num_write_ops++; 6855 io_stat->write_latency_ticks += tsc_diff; 6856 if (io_stat->max_write_latency_ticks < tsc_diff) { 6857 io_stat->max_write_latency_ticks = tsc_diff; 6858 } 6859 if (io_stat->min_write_latency_ticks > tsc_diff) { 6860 io_stat->min_write_latency_ticks = tsc_diff; 6861 } 6862 break; 6863 case SPDK_BDEV_IO_TYPE_UNMAP: 6864 io_stat->bytes_unmapped += num_blocks * blocklen; 6865 io_stat->num_unmap_ops++; 6866 io_stat->unmap_latency_ticks += tsc_diff; 6867 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6868 io_stat->max_unmap_latency_ticks = tsc_diff; 6869 } 6870 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6871 io_stat->min_unmap_latency_ticks = tsc_diff; 6872 } 6873 break; 6874 case SPDK_BDEV_IO_TYPE_ZCOPY: 6875 /* Track the data in the start phase only */ 6876 if (bdev_io->u.bdev.zcopy.start) { 6877 if (bdev_io->u.bdev.zcopy.populate) { 6878 io_stat->bytes_read += num_blocks * blocklen; 6879 io_stat->num_read_ops++; 6880 io_stat->read_latency_ticks += tsc_diff; 6881 if (io_stat->max_read_latency_ticks < tsc_diff) { 6882 io_stat->max_read_latency_ticks = tsc_diff; 6883 } 6884 if (io_stat->min_read_latency_ticks > tsc_diff) { 6885 io_stat->min_read_latency_ticks = tsc_diff; 6886 } 6887 } else { 6888 io_stat->bytes_written += num_blocks * blocklen; 6889 io_stat->num_write_ops++; 6890 io_stat->write_latency_ticks += tsc_diff; 6891 if (io_stat->max_write_latency_ticks < tsc_diff) { 6892 io_stat->max_write_latency_ticks = tsc_diff; 6893 } 6894 if (io_stat->min_write_latency_ticks > tsc_diff) { 6895 io_stat->min_write_latency_ticks = tsc_diff; 6896 } 6897 } 6898 } 6899 break; 6900 case SPDK_BDEV_IO_TYPE_COPY: 6901 io_stat->bytes_copied += num_blocks * blocklen; 6902 io_stat->num_copy_ops++; 6903 bdev_io->internal.ch->stat->copy_latency_ticks += 
tsc_diff; 6904 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6905 io_stat->max_copy_latency_ticks = tsc_diff; 6906 } 6907 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6908 io_stat->min_copy_latency_ticks = tsc_diff; 6909 } 6910 break; 6911 default: 6912 break; 6913 } 6914 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6915 io_stat = bdev_io->bdev->internal.stat; 6916 assert(io_stat->io_error != NULL); 6917 6918 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6919 io_stat->io_error->error_status[-io_status - 1]++; 6920 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6921 } 6922 6923 #ifdef SPDK_CONFIG_VTUNE 6924 uint64_t now_tsc = spdk_get_ticks(); 6925 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6926 uint64_t data[5]; 6927 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6928 6929 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6930 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6931 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6932 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6933 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6934 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6935 6936 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6937 __itt_metadata_u64, 5, data); 6938 6939 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6940 bdev_io->internal.ch->start_tsc = now_tsc; 6941 } 6942 #endif 6943 } 6944 6945 static inline void 6946 _bdev_io_complete(void *ctx) 6947 { 6948 struct spdk_bdev_io *bdev_io = ctx; 6949 6950 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 6951 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 6952 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 6953 } 6954 6955 assert(bdev_io->internal.cb != NULL); 6956 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6957 6958 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6959 bdev_io->internal.caller_ctx); 6960 } 6961 6962 static inline void 6963 bdev_io_complete(void *ctx) 6964 { 6965 struct spdk_bdev_io *bdev_io = ctx; 6966 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6967 uint64_t tsc, tsc_diff; 6968 6969 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6970 /* 6971 * Defer completion to avoid potential infinite recursion if the 6972 * user's completion callback issues a new I/O. 6973 */ 6974 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6975 bdev_io_complete, bdev_io); 6976 return; 6977 } 6978 6979 tsc = spdk_get_ticks(); 6980 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6981 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6982 bdev_io->internal.caller_ctx); 6983 6984 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 6985 6986 if (bdev_io->internal.ch->histogram) { 6987 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 6988 } 6989 6990 bdev_io_update_io_stat(bdev_io, tsc_diff); 6991 _bdev_io_complete(bdev_io); 6992 } 6993 6994 /* The difference between this function and bdev_io_complete() is that this should be called to 6995 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 6996 * io_submitted list and don't have submit_tsc updated. 
6997 */ 6998 static inline void 6999 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7000 { 7001 /* Since the IO hasn't been submitted it's bound to be failed */ 7002 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7003 7004 /* At this point we don't know if the IO is completed from submission context or not, but, 7005 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7006 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7007 _bdev_io_complete, bdev_io); 7008 } 7009 7010 static void bdev_destroy_cb(void *io_device); 7011 7012 static void 7013 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7014 { 7015 struct spdk_bdev_io *bdev_io = _ctx; 7016 7017 if (bdev_io->u.reset.ch_ref != NULL) { 7018 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7019 bdev_io->u.reset.ch_ref = NULL; 7020 } 7021 7022 bdev_io_complete(bdev_io); 7023 7024 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7025 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7026 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7027 } 7028 } 7029 7030 static void 7031 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7032 struct spdk_io_channel *_ch, void *_ctx) 7033 { 7034 struct spdk_bdev_io *bdev_io = _ctx; 7035 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7036 struct spdk_bdev_io *queued_reset; 7037 7038 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7039 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7040 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7041 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7042 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7043 } 7044 7045 spdk_bdev_for_each_channel_continue(i, 0); 7046 } 7047 7048 static void 7049 bdev_io_complete_sequence_cb(void *ctx, int status) 7050 { 7051 struct spdk_bdev_io *bdev_io = ctx; 7052 7053 /* u.bdev.accel_sequence should have already been cleared at this point */ 7054 assert(bdev_io->u.bdev.accel_sequence == NULL); 7055 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7056 bdev_io->internal.accel_sequence = NULL; 7057 7058 if (spdk_unlikely(status != 0)) { 7059 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7060 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7061 } 7062 7063 bdev_io_complete(bdev_io); 7064 } 7065 7066 void 7067 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7068 { 7069 struct spdk_bdev *bdev = bdev_io->bdev; 7070 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7071 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7072 7073 if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) { 7074 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7075 spdk_bdev_get_module_name(bdev), 7076 bdev_io_status_get_string(bdev_io->internal.status)); 7077 assert(false); 7078 } 7079 bdev_io->internal.status = status; 7080 7081 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7082 bool unlock_channels = false; 7083 7084 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7085 SPDK_ERRLOG("NOMEM returned for reset\n"); 7086 } 7087 spdk_spin_lock(&bdev->internal.spinlock); 7088 if (bdev_io == bdev->internal.reset_in_progress) { 7089 bdev->internal.reset_in_progress = NULL; 7090 unlock_channels = true; 7091 } 7092 spdk_spin_unlock(&bdev->internal.spinlock); 7093 7094 if (unlock_channels) { 7095 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 
7096 bdev_reset_complete); 7097 return; 7098 } 7099 } else { 7100 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7101 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7102 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7103 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7104 return; 7105 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 7106 _bdev_io_push_bounce_data_buffer(bdev_io, 7107 _bdev_io_complete_push_bounce_done); 7108 /* bdev IO will be completed in the callback */ 7109 return; 7110 } 7111 } 7112 7113 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7114 return; 7115 } 7116 } 7117 7118 bdev_io_complete(bdev_io); 7119 } 7120 7121 void 7122 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7123 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7124 { 7125 enum spdk_bdev_io_status status; 7126 7127 if (sc == SPDK_SCSI_STATUS_GOOD) { 7128 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7129 } else { 7130 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7131 bdev_io->internal.error.scsi.sc = sc; 7132 bdev_io->internal.error.scsi.sk = sk; 7133 bdev_io->internal.error.scsi.asc = asc; 7134 bdev_io->internal.error.scsi.ascq = ascq; 7135 } 7136 7137 spdk_bdev_io_complete(bdev_io, status); 7138 } 7139 7140 void 7141 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7142 int *sc, int *sk, int *asc, int *ascq) 7143 { 7144 assert(sc != NULL); 7145 assert(sk != NULL); 7146 assert(asc != NULL); 7147 assert(ascq != NULL); 7148 7149 switch (bdev_io->internal.status) { 7150 case SPDK_BDEV_IO_STATUS_SUCCESS: 7151 *sc = SPDK_SCSI_STATUS_GOOD; 7152 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7153 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7154 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7155 break; 7156 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7157 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7158 break; 7159 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7160 *sc = bdev_io->internal.error.scsi.sc; 7161 *sk = bdev_io->internal.error.scsi.sk; 7162 *asc = bdev_io->internal.error.scsi.asc; 7163 *ascq = bdev_io->internal.error.scsi.ascq; 7164 break; 7165 default: 7166 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7167 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7168 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7169 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7170 break; 7171 } 7172 } 7173 7174 void 7175 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7176 { 7177 enum spdk_bdev_io_status status; 7178 7179 if (aio_result == 0) { 7180 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7181 } else { 7182 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7183 } 7184 7185 bdev_io->internal.error.aio_result = aio_result; 7186 7187 spdk_bdev_io_complete(bdev_io, status); 7188 } 7189 7190 void 7191 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7192 { 7193 assert(aio_result != NULL); 7194 7195 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7196 *aio_result = bdev_io->internal.error.aio_result; 7197 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7198 *aio_result = 0; 7199 } else { 7200 *aio_result = -EIO; 7201 } 7202 } 7203 7204 void 7205 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7206 { 7207 enum spdk_bdev_io_status status; 7208 7209 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 7210 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7211 } else if (sct == 
SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7212 status = SPDK_BDEV_IO_STATUS_ABORTED; 7213 } else { 7214 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7215 } 7216 7217 bdev_io->internal.error.nvme.cdw0 = cdw0; 7218 bdev_io->internal.error.nvme.sct = sct; 7219 bdev_io->internal.error.nvme.sc = sc; 7220 7221 spdk_bdev_io_complete(bdev_io, status); 7222 } 7223 7224 void 7225 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7226 { 7227 assert(sct != NULL); 7228 assert(sc != NULL); 7229 assert(cdw0 != NULL); 7230 7231 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7232 *sct = SPDK_NVME_SCT_GENERIC; 7233 *sc = SPDK_NVME_SC_SUCCESS; 7234 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7235 *cdw0 = 0; 7236 } else { 7237 *cdw0 = 1U; 7238 } 7239 return; 7240 } 7241 7242 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7243 *sct = bdev_io->internal.error.nvme.sct; 7244 *sc = bdev_io->internal.error.nvme.sc; 7245 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7246 *sct = SPDK_NVME_SCT_GENERIC; 7247 *sc = SPDK_NVME_SC_SUCCESS; 7248 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7249 *sct = SPDK_NVME_SCT_GENERIC; 7250 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7251 } else { 7252 *sct = SPDK_NVME_SCT_GENERIC; 7253 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7254 } 7255 7256 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7257 } 7258 7259 void 7260 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7261 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7262 { 7263 assert(first_sct != NULL); 7264 assert(first_sc != NULL); 7265 assert(second_sct != NULL); 7266 assert(second_sc != NULL); 7267 assert(cdw0 != NULL); 7268 7269 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7270 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7271 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7272 *first_sct = bdev_io->internal.error.nvme.sct; 7273 *first_sc = bdev_io->internal.error.nvme.sc; 7274 *second_sct = SPDK_NVME_SCT_GENERIC; 7275 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7276 } else { 7277 *first_sct = SPDK_NVME_SCT_GENERIC; 7278 *first_sc = SPDK_NVME_SC_SUCCESS; 7279 *second_sct = bdev_io->internal.error.nvme.sct; 7280 *second_sc = bdev_io->internal.error.nvme.sc; 7281 } 7282 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7283 *first_sct = SPDK_NVME_SCT_GENERIC; 7284 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7285 *second_sct = SPDK_NVME_SCT_GENERIC; 7286 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7287 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7288 *first_sct = SPDK_NVME_SCT_GENERIC; 7289 *first_sc = SPDK_NVME_SC_SUCCESS; 7290 *second_sct = SPDK_NVME_SCT_GENERIC; 7291 *second_sc = SPDK_NVME_SC_SUCCESS; 7292 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7293 *first_sct = SPDK_NVME_SCT_GENERIC; 7294 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7295 *second_sct = SPDK_NVME_SCT_GENERIC; 7296 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7297 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7298 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7299 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7300 *second_sct = SPDK_NVME_SCT_GENERIC; 7301 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7302 } else { 7303 *first_sct = SPDK_NVME_SCT_GENERIC; 7304 *first_sc 
= SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7305 *second_sct = SPDK_NVME_SCT_GENERIC; 7306 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7307 } 7308 7309 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7310 } 7311 7312 struct spdk_thread * 7313 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7314 { 7315 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7316 } 7317 7318 struct spdk_io_channel * 7319 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7320 { 7321 return bdev_io->internal.ch->channel; 7322 } 7323 7324 static int 7325 bdev_register(struct spdk_bdev *bdev) 7326 { 7327 char *bdev_name; 7328 char uuid[SPDK_UUID_STRING_LEN]; 7329 struct spdk_iobuf_opts iobuf_opts; 7330 int ret, i; 7331 7332 assert(bdev->module != NULL); 7333 7334 if (!bdev->name) { 7335 SPDK_ERRLOG("Bdev name is NULL\n"); 7336 return -EINVAL; 7337 } 7338 7339 if (!strlen(bdev->name)) { 7340 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7341 return -EINVAL; 7342 } 7343 7344 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7345 if (bdev->fn_table->accel_sequence_supported == NULL) { 7346 continue; 7347 } 7348 if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7349 (enum spdk_bdev_io_type)i)) { 7350 continue; 7351 } 7352 7353 if (spdk_bdev_get_memory_domains(bdev, NULL, 0) <= 0) { 7354 SPDK_ERRLOG("bdev supporting accel sequence is required to support " 7355 "memory domains\n"); 7356 return -EINVAL; 7357 } 7358 7359 if (spdk_bdev_is_md_separate(bdev)) { 7360 SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with " 7361 "accel sequence support\n"); 7362 return -EINVAL; 7363 } 7364 } 7365 7366 /* Users often register their own I/O devices using the bdev name. In 7367 * order to avoid conflicts, prepend bdev_. */ 7368 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7369 if (!bdev_name) { 7370 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7371 return -ENOMEM; 7372 } 7373 7374 bdev->internal.stat = bdev_alloc_io_stat(true); 7375 if (!bdev->internal.stat) { 7376 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7377 free(bdev_name); 7378 return -ENOMEM; 7379 } 7380 7381 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7382 bdev->internal.measured_queue_depth = UINT64_MAX; 7383 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7384 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7385 bdev->internal.qd_poller = NULL; 7386 bdev->internal.qos = NULL; 7387 7388 TAILQ_INIT(&bdev->internal.open_descs); 7389 TAILQ_INIT(&bdev->internal.locked_ranges); 7390 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7391 TAILQ_INIT(&bdev->aliases); 7392 7393 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7394 if (ret != 0) { 7395 bdev_free_io_stat(bdev->internal.stat); 7396 free(bdev_name); 7397 return ret; 7398 } 7399 7400 /* UUID may be specified by the user or defined by bdev itself. 7401 * Otherwise it will be generated here, so this field will never be empty. 
*/ 7402 if (spdk_uuid_is_null(&bdev->uuid)) { 7403 spdk_uuid_generate(&bdev->uuid); 7404 } 7405 7406 /* Add the UUID alias only if it's different than the name */ 7407 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7408 if (strcmp(bdev->name, uuid) != 0) { 7409 ret = spdk_bdev_alias_add(bdev, uuid); 7410 if (ret != 0) { 7411 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7412 bdev_name_del(&bdev->internal.bdev_name); 7413 bdev_free_io_stat(bdev->internal.stat); 7414 free(bdev_name); 7415 return ret; 7416 } 7417 } 7418 7419 if (spdk_bdev_get_buf_align(bdev) > 1) { 7420 if (bdev->split_on_optimal_io_boundary) { 7421 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 7422 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 7423 } else { 7424 bdev->split_on_optimal_io_boundary = true; 7425 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 7426 } 7427 } 7428 7429 /* If the user didn't specify a write unit size, set it to one. */ 7430 if (bdev->write_unit_size == 0) { 7431 bdev->write_unit_size = 1; 7432 } 7433 7434 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7435 if (bdev->acwu == 0) { 7436 bdev->acwu = bdev->write_unit_size; 7437 } 7438 7439 if (bdev->phys_blocklen == 0) { 7440 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7441 } 7442 7443 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7444 spdk_iobuf_get_opts(&iobuf_opts); 7445 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7446 } 7447 7448 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7449 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7450 } 7451 7452 bdev->internal.reset_in_progress = NULL; 7453 bdev->internal.qd_poll_in_progress = false; 7454 bdev->internal.period = 0; 7455 bdev->internal.new_period = 0; 7456 7457 spdk_io_device_register(__bdev_to_io_dev(bdev), 7458 bdev_channel_create, bdev_channel_destroy, 7459 sizeof(struct spdk_bdev_channel), 7460 bdev_name); 7461 7462 free(bdev_name); 7463 7464 spdk_spin_init(&bdev->internal.spinlock); 7465 7466 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7467 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7468 7469 return 0; 7470 } 7471 7472 static void 7473 bdev_destroy_cb(void *io_device) 7474 { 7475 int rc; 7476 struct spdk_bdev *bdev; 7477 spdk_bdev_unregister_cb cb_fn; 7478 void *cb_arg; 7479 7480 bdev = __bdev_from_io_dev(io_device); 7481 7482 if (bdev->internal.unregister_td != spdk_get_thread()) { 7483 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7484 return; 7485 } 7486 7487 cb_fn = bdev->internal.unregister_cb; 7488 cb_arg = bdev->internal.unregister_ctx; 7489 7490 spdk_spin_destroy(&bdev->internal.spinlock); 7491 free(bdev->internal.qos); 7492 bdev_free_io_stat(bdev->internal.stat); 7493 7494 rc = bdev->fn_table->destruct(bdev->ctxt); 7495 if (rc < 0) { 7496 SPDK_ERRLOG("destruct failed\n"); 7497 } 7498 if (rc <= 0 && cb_fn != NULL) { 7499 cb_fn(cb_arg, rc); 7500 } 7501 } 7502 7503 void 7504 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7505 { 7506 if (bdev->internal.unregister_cb != NULL) { 7507 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7508 } 7509 } 7510 7511 static void 7512 _remove_notify(void *arg) 7513 { 7514 struct spdk_bdev_desc *desc = arg; 7515 7516 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7517 } 7518 7519 /* returns: 0 - bdev removed and 
ready to be destructed. 7520 * -EBUSY - bdev can't be destructed yet. */ 7521 static int 7522 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7523 { 7524 struct spdk_bdev_desc *desc, *tmp; 7525 int rc = 0; 7526 char uuid[SPDK_UUID_STRING_LEN]; 7527 7528 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7529 assert(spdk_spin_held(&bdev->internal.spinlock)); 7530 7531 /* Notify each descriptor about hotremoval */ 7532 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7533 rc = -EBUSY; 7534 /* 7535 * Defer invocation of the event_cb to a separate message that will 7536 * run later on its thread. This ensures this context unwinds and 7537 * we don't recursively unregister this bdev again if the event_cb 7538 * immediately closes its descriptor. 7539 */ 7540 event_notify(desc, _remove_notify); 7541 } 7542 7543 /* If there are no descriptors, proceed removing the bdev */ 7544 if (rc == 0) { 7545 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7546 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7547 7548 /* Delete the name and the UUID alias */ 7549 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7550 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7551 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7552 7553 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7554 7555 if (bdev->internal.reset_in_progress != NULL) { 7556 /* If reset is in progress, let the completion callback for reset 7557 * unregister the bdev. 7558 */ 7559 rc = -EBUSY; 7560 } 7561 } 7562 7563 return rc; 7564 } 7565 7566 static void 7567 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7568 struct spdk_io_channel *io_ch, void *_ctx) 7569 { 7570 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7571 7572 bdev_channel_abort_queued_ios(bdev_ch); 7573 spdk_bdev_for_each_channel_continue(i, 0); 7574 } 7575 7576 static void 7577 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7578 { 7579 int rc; 7580 7581 spdk_spin_lock(&g_bdev_mgr.spinlock); 7582 spdk_spin_lock(&bdev->internal.spinlock); 7583 /* 7584 * Set the status to REMOVING after completing to abort channels. Otherwise, 7585 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7586 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7587 * may fail. 7588 */ 7589 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7590 rc = bdev_unregister_unsafe(bdev); 7591 spdk_spin_unlock(&bdev->internal.spinlock); 7592 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7593 7594 if (rc == 0) { 7595 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7596 } 7597 } 7598 7599 void 7600 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7601 { 7602 struct spdk_thread *thread; 7603 7604 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7605 7606 thread = spdk_get_thread(); 7607 if (!thread) { 7608 /* The user called this from a non-SPDK thread. 
*/ 7609 if (cb_fn != NULL) { 7610 cb_fn(cb_arg, -ENOTSUP); 7611 } 7612 return; 7613 } 7614 7615 spdk_spin_lock(&g_bdev_mgr.spinlock); 7616 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7617 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7618 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7619 if (cb_fn) { 7620 cb_fn(cb_arg, -EBUSY); 7621 } 7622 return; 7623 } 7624 7625 spdk_spin_lock(&bdev->internal.spinlock); 7626 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7627 bdev->internal.unregister_cb = cb_fn; 7628 bdev->internal.unregister_ctx = cb_arg; 7629 bdev->internal.unregister_td = thread; 7630 spdk_spin_unlock(&bdev->internal.spinlock); 7631 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7632 7633 spdk_bdev_set_qd_sampling_period(bdev, 0); 7634 7635 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7636 bdev_unregister); 7637 } 7638 7639 int 7640 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7641 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7642 { 7643 struct spdk_bdev_desc *desc; 7644 struct spdk_bdev *bdev; 7645 int rc; 7646 7647 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7648 if (rc != 0) { 7649 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7650 return rc; 7651 } 7652 7653 bdev = spdk_bdev_desc_get_bdev(desc); 7654 7655 if (bdev->module != module) { 7656 spdk_bdev_close(desc); 7657 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7658 bdev_name); 7659 return -ENODEV; 7660 } 7661 7662 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7663 7664 spdk_bdev_close(desc); 7665 7666 return 0; 7667 } 7668 7669 static int 7670 bdev_start_qos(struct spdk_bdev *bdev) 7671 { 7672 struct set_qos_limit_ctx *ctx; 7673 7674 /* Enable QoS */ 7675 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7676 ctx = calloc(1, sizeof(*ctx)); 7677 if (ctx == NULL) { 7678 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7679 return -ENOMEM; 7680 } 7681 ctx->bdev = bdev; 7682 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7683 } 7684 7685 return 0; 7686 } 7687 7688 static void 7689 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7690 struct spdk_bdev *bdev) 7691 { 7692 enum spdk_bdev_claim_type type; 7693 const char *typename, *modname; 7694 extern struct spdk_log_flag SPDK_LOG_bdev; 7695 7696 assert(spdk_spin_held(&bdev->internal.spinlock)); 7697 7698 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7699 return; 7700 } 7701 7702 type = bdev->internal.claim_type; 7703 typename = spdk_bdev_claim_get_name(type); 7704 7705 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7706 modname = bdev->internal.claim.v1.module->name; 7707 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7708 bdev->name, detail, typename, modname); 7709 return; 7710 } 7711 7712 if (claim_type_is_v2(type)) { 7713 struct spdk_bdev_module_claim *claim; 7714 7715 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7716 modname = claim->module->name; 7717 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7718 bdev->name, detail, typename, modname); 7719 } 7720 return; 7721 } 7722 7723 assert(false); 7724 } 7725 7726 static int 7727 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7728 { 7729 struct spdk_thread *thread; 7730 int rc = 0; 7731 7732 thread = spdk_get_thread(); 7733 if (!thread) { 7734 
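		/* Descriptors are bound to the SPDK thread that opens them; spdk_get_thread()
		 * returns NULL on a thread not managed by the SPDK threading library, so the
		 * open is rejected here instead of binding the descriptor to an unknown context.
		 */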
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7735 return -ENOTSUP; 7736 } 7737 7738 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7739 spdk_get_thread()); 7740 7741 desc->bdev = bdev; 7742 desc->thread = thread; 7743 desc->write = write; 7744 7745 spdk_spin_lock(&bdev->internal.spinlock); 7746 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7747 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7748 spdk_spin_unlock(&bdev->internal.spinlock); 7749 return -ENODEV; 7750 } 7751 7752 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7753 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7754 spdk_spin_unlock(&bdev->internal.spinlock); 7755 return -EPERM; 7756 } 7757 7758 rc = bdev_start_qos(bdev); 7759 if (rc != 0) { 7760 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7761 spdk_spin_unlock(&bdev->internal.spinlock); 7762 return rc; 7763 } 7764 7765 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7766 7767 spdk_spin_unlock(&bdev->internal.spinlock); 7768 7769 return 0; 7770 } 7771 7772 static int 7773 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7774 struct spdk_bdev_desc **_desc) 7775 { 7776 struct spdk_bdev_desc *desc; 7777 unsigned int i; 7778 7779 desc = calloc(1, sizeof(*desc)); 7780 if (desc == NULL) { 7781 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7782 return -ENOMEM; 7783 } 7784 7785 TAILQ_INIT(&desc->pending_media_events); 7786 TAILQ_INIT(&desc->free_media_events); 7787 7788 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7789 desc->callback.event_fn = event_cb; 7790 desc->callback.ctx = event_ctx; 7791 spdk_spin_init(&desc->spinlock); 7792 7793 if (bdev->media_events) { 7794 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7795 sizeof(*desc->media_events_buffer)); 7796 if (desc->media_events_buffer == NULL) { 7797 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7798 bdev_desc_free(desc); 7799 return -ENOMEM; 7800 } 7801 7802 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 7803 TAILQ_INSERT_TAIL(&desc->free_media_events, 7804 &desc->media_events_buffer[i], tailq); 7805 } 7806 } 7807 7808 if (bdev->fn_table->accel_sequence_supported != NULL) { 7809 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7810 desc->accel_sequence_supported[i] = 7811 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7812 (enum spdk_bdev_io_type)i); 7813 } 7814 } 7815 7816 *_desc = desc; 7817 7818 return 0; 7819 } 7820 7821 int 7822 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7823 void *event_ctx, struct spdk_bdev_desc **_desc) 7824 { 7825 struct spdk_bdev_desc *desc; 7826 struct spdk_bdev *bdev; 7827 int rc; 7828 7829 if (event_cb == NULL) { 7830 SPDK_ERRLOG("Missing event callback function\n"); 7831 return -EINVAL; 7832 } 7833 7834 spdk_spin_lock(&g_bdev_mgr.spinlock); 7835 7836 bdev = bdev_get_by_name(bdev_name); 7837 7838 if (bdev == NULL) { 7839 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7840 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7841 return -ENODEV; 7842 } 7843 7844 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7845 if (rc != 0) { 7846 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7847 return rc; 7848 } 7849 7850 rc = bdev_open(bdev, write, desc); 7851 if (rc != 0) { 7852 bdev_desc_free(desc); 7853 desc = NULL; 7854 } 7855 7856 *_desc = desc; 7857 7858 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7859 
7860 return rc; 7861 } 7862 7863 static void 7864 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7865 { 7866 int rc; 7867 7868 spdk_spin_lock(&bdev->internal.spinlock); 7869 spdk_spin_lock(&desc->spinlock); 7870 7871 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7872 7873 desc->closed = true; 7874 7875 if (desc->claim != NULL) { 7876 bdev_desc_release_claims(desc); 7877 } 7878 7879 if (0 == desc->refs) { 7880 spdk_spin_unlock(&desc->spinlock); 7881 bdev_desc_free(desc); 7882 } else { 7883 spdk_spin_unlock(&desc->spinlock); 7884 } 7885 7886 /* If no more descriptors, kill QoS channel */ 7887 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7888 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7889 bdev->name, spdk_get_thread()); 7890 7891 if (bdev_qos_destroy(bdev)) { 7892 /* There isn't anything we can do to recover here. Just let the 7893 * old QoS poller keep running. The QoS handling won't change 7894 * cores when the user allocates a new channel, but it won't break. */ 7895 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 7896 } 7897 } 7898 7899 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7900 rc = bdev_unregister_unsafe(bdev); 7901 spdk_spin_unlock(&bdev->internal.spinlock); 7902 7903 if (rc == 0) { 7904 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7905 } 7906 } else { 7907 spdk_spin_unlock(&bdev->internal.spinlock); 7908 } 7909 } 7910 7911 void 7912 spdk_bdev_close(struct spdk_bdev_desc *desc) 7913 { 7914 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7915 7916 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7917 spdk_get_thread()); 7918 7919 assert(desc->thread == spdk_get_thread()); 7920 7921 spdk_poller_unregister(&desc->io_timeout_poller); 7922 7923 spdk_spin_lock(&g_bdev_mgr.spinlock); 7924 7925 bdev_close(bdev, desc); 7926 7927 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7928 } 7929 7930 static void 7931 bdev_register_finished(void *arg) 7932 { 7933 struct spdk_bdev_desc *desc = arg; 7934 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7935 7936 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7937 7938 spdk_spin_lock(&g_bdev_mgr.spinlock); 7939 7940 bdev_close(bdev, desc); 7941 7942 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7943 } 7944 7945 int 7946 spdk_bdev_register(struct spdk_bdev *bdev) 7947 { 7948 struct spdk_bdev_desc *desc; 7949 struct spdk_thread *thread = spdk_get_thread(); 7950 int rc; 7951 7952 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 7953 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 7954 thread ? 
spdk_thread_get_name(thread) : "null"); 7955 return -EINVAL; 7956 } 7957 7958 rc = bdev_register(bdev); 7959 if (rc != 0) { 7960 return rc; 7961 } 7962 7963 /* A descriptor is opened to prevent bdev deletion during examination */ 7964 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7965 if (rc != 0) { 7966 spdk_bdev_unregister(bdev, NULL, NULL); 7967 return rc; 7968 } 7969 7970 rc = bdev_open(bdev, false, desc); 7971 if (rc != 0) { 7972 bdev_desc_free(desc); 7973 spdk_bdev_unregister(bdev, NULL, NULL); 7974 return rc; 7975 } 7976 7977 /* Examine configuration before initializing I/O */ 7978 bdev_examine(bdev); 7979 7980 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 7981 if (rc != 0) { 7982 bdev_close(bdev, desc); 7983 spdk_bdev_unregister(bdev, NULL, NULL); 7984 } 7985 7986 return rc; 7987 } 7988 7989 int 7990 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 7991 struct spdk_bdev_module *module) 7992 { 7993 spdk_spin_lock(&bdev->internal.spinlock); 7994 7995 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7996 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7997 spdk_spin_unlock(&bdev->internal.spinlock); 7998 return -EPERM; 7999 } 8000 8001 if (desc && !desc->write) { 8002 desc->write = true; 8003 } 8004 8005 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8006 bdev->internal.claim.v1.module = module; 8007 8008 spdk_spin_unlock(&bdev->internal.spinlock); 8009 return 0; 8010 } 8011 8012 void 8013 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8014 { 8015 spdk_spin_lock(&bdev->internal.spinlock); 8016 8017 assert(bdev->internal.claim.v1.module != NULL); 8018 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8019 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8020 bdev->internal.claim.v1.module = NULL; 8021 8022 spdk_spin_unlock(&bdev->internal.spinlock); 8023 } 8024 8025 /* 8026 * Start claims v2 8027 */ 8028 8029 const char * 8030 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8031 { 8032 switch (type) { 8033 case SPDK_BDEV_CLAIM_NONE: 8034 return "not_claimed"; 8035 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8036 return "exclusive_write"; 8037 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8038 return "read_many_write_one"; 8039 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8040 return "read_many_write_none"; 8041 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8042 return "read_many_write_many"; 8043 default: 8044 break; 8045 } 8046 return "invalid_claim"; 8047 } 8048 8049 static bool 8050 claim_type_is_v2(enum spdk_bdev_claim_type type) 8051 { 8052 switch (type) { 8053 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8054 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8055 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8056 return true; 8057 default: 8058 break; 8059 } 8060 return false; 8061 } 8062 8063 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
*/ 8064 static bool 8065 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8066 { 8067 switch (type) { 8068 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8069 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8070 return true; 8071 default: 8072 break; 8073 } 8074 return false; 8075 } 8076 8077 void 8078 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8079 { 8080 if (opts == NULL) { 8081 SPDK_ERRLOG("opts should not be NULL\n"); 8082 assert(opts != NULL); 8083 return; 8084 } 8085 if (size == 0) { 8086 SPDK_ERRLOG("size should not be zero\n"); 8087 assert(size != 0); 8088 return; 8089 } 8090 8091 memset(opts, 0, size); 8092 opts->opts_size = size; 8093 8094 #define FIELD_OK(field) \ 8095 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8096 8097 #define SET_FIELD(field, value) \ 8098 if (FIELD_OK(field)) { \ 8099 opts->field = value; \ 8100 } \ 8101 8102 SET_FIELD(shared_claim_key, 0); 8103 8104 #undef FIELD_OK 8105 #undef SET_FIELD 8106 } 8107 8108 static int 8109 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8110 { 8111 if (src->opts_size == 0) { 8112 SPDK_ERRLOG("size should not be zero\n"); 8113 return -1; 8114 } 8115 8116 memset(dst, 0, sizeof(*dst)); 8117 dst->opts_size = src->opts_size; 8118 8119 #define FIELD_OK(field) \ 8120 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8121 8122 #define SET_FIELD(field) \ 8123 if (FIELD_OK(field)) { \ 8124 dst->field = src->field; \ 8125 } \ 8126 8127 if (FIELD_OK(name)) { 8128 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8129 } 8130 8131 SET_FIELD(shared_claim_key); 8132 8133 /* You should not remove this statement, but need to update the assert statement 8134 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8135 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8136 8137 #undef FIELD_OK 8138 #undef SET_FIELD 8139 return 0; 8140 } 8141 8142 /* Returns 0 if a read-write-once claim can be taken. */ 8143 static int 8144 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8145 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8146 { 8147 struct spdk_bdev *bdev = desc->bdev; 8148 struct spdk_bdev_desc *open_desc; 8149 8150 assert(spdk_spin_held(&bdev->internal.spinlock)); 8151 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8152 8153 if (opts->shared_claim_key != 0) { 8154 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8155 bdev->name); 8156 return -EINVAL; 8157 } 8158 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8159 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8160 return -EPERM; 8161 } 8162 if (desc->claim != NULL) { 8163 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8164 bdev->name, desc->claim->module->name); 8165 return -EPERM; 8166 } 8167 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8168 if (desc != open_desc && open_desc->write) { 8169 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8170 "another descriptor is open for writing\n", 8171 bdev->name); 8172 return -EPERM; 8173 } 8174 } 8175 8176 return 0; 8177 } 8178 8179 /* Returns 0 if a read-only-many claim can be taken. 
 */
static int
claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
	assert(desc->claim == NULL);

	if (desc->write) {
		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
			    bdev->name);
		return -EINVAL;
	}
	if (opts->shared_claim_key != 0) {
		SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
		return -EINVAL;
	}
	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
					       "another descriptor is open for writing\n",
					       bdev->name);
				return -EPERM;
			}
		}
	}

	return 0;
}

/* Returns 0 if a read-write-many claim can be taken. */
static int
claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
	assert(desc->claim == NULL);

	if (opts->shared_claim_key == 0) {
		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
			    bdev->name);
		return -EINVAL;
	}
	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc == desc) {
				continue;
			}
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
					       "another descriptor is open for writing without a "
					       "claim\n", bdev->name);
				return -EPERM;
			}
		}
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
			return -EPERM;
		}
		break;
	default:
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		return -EBUSY;
	}

	return 0;
}

/* Updates desc and its bdev with a v2 claim.
*/ 8260 static int 8261 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8262 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8263 { 8264 struct spdk_bdev *bdev = desc->bdev; 8265 struct spdk_bdev_module_claim *claim; 8266 8267 assert(spdk_spin_held(&bdev->internal.spinlock)); 8268 assert(claim_type_is_v2(type)); 8269 assert(desc->claim == NULL); 8270 8271 claim = calloc(1, sizeof(*desc->claim)); 8272 if (claim == NULL) { 8273 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8274 return -ENOMEM; 8275 } 8276 claim->module = module; 8277 claim->desc = desc; 8278 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8279 memcpy(claim->name, opts->name, sizeof(claim->name)); 8280 desc->claim = claim; 8281 8282 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8283 bdev->internal.claim_type = type; 8284 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8285 bdev->internal.claim.v2.key = opts->shared_claim_key; 8286 } 8287 assert(type == bdev->internal.claim_type); 8288 8289 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8290 8291 if (!desc->write && claim_type_promotes_to_write(type)) { 8292 desc->write = true; 8293 } 8294 8295 return 0; 8296 } 8297 8298 int 8299 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8300 struct spdk_bdev_claim_opts *_opts, 8301 struct spdk_bdev_module *module) 8302 { 8303 struct spdk_bdev *bdev; 8304 struct spdk_bdev_claim_opts opts; 8305 int rc = 0; 8306 8307 if (desc == NULL) { 8308 SPDK_ERRLOG("descriptor must not be NULL\n"); 8309 return -EINVAL; 8310 } 8311 8312 bdev = desc->bdev; 8313 8314 if (_opts == NULL) { 8315 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8316 } else if (claim_opts_copy(_opts, &opts) != 0) { 8317 return -EINVAL; 8318 } 8319 8320 spdk_spin_lock(&bdev->internal.spinlock); 8321 8322 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8323 bdev->internal.claim_type != type) { 8324 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8325 spdk_spin_unlock(&bdev->internal.spinlock); 8326 return -EPERM; 8327 } 8328 8329 if (claim_type_is_v2(type) && desc->claim != NULL) { 8330 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8331 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8332 spdk_spin_unlock(&bdev->internal.spinlock); 8333 return -EPERM; 8334 } 8335 8336 switch (type) { 8337 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8338 spdk_spin_unlock(&bdev->internal.spinlock); 8339 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8340 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8341 rc = claim_verify_rwo(desc, type, &opts, module); 8342 break; 8343 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8344 rc = claim_verify_rom(desc, type, &opts, module); 8345 break; 8346 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8347 rc = claim_verify_rwm(desc, type, &opts, module); 8348 break; 8349 default: 8350 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8351 rc = -ENOTSUP; 8352 } 8353 8354 if (rc == 0) { 8355 rc = claim_bdev(desc, type, &opts, module); 8356 } 8357 8358 spdk_spin_unlock(&bdev->internal.spinlock); 8359 return rc; 8360 } 8361 8362 static void 8363 claim_reset(struct spdk_bdev *bdev) 8364 { 8365 assert(spdk_spin_held(&bdev->internal.spinlock)); 8366 assert(claim_type_is_v2(bdev->internal.claim_type)); 8367 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8368 8369 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8370 
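	/* Zeroing the claim union before resetting the type below ensures no stale v2
	 * key or claim-list state is left behind once the bdev reports itself unclaimed.
	 */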
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8371 } 8372 8373 static void 8374 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8375 { 8376 struct spdk_bdev *bdev = desc->bdev; 8377 8378 assert(spdk_spin_held(&bdev->internal.spinlock)); 8379 assert(claim_type_is_v2(bdev->internal.claim_type)); 8380 8381 if (bdev->internal.examine_in_progress == 0) { 8382 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8383 free(desc->claim); 8384 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8385 claim_reset(bdev); 8386 } 8387 } else { 8388 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8389 desc->claim->module = NULL; 8390 desc->claim->desc = NULL; 8391 } 8392 desc->claim = NULL; 8393 } 8394 8395 /* 8396 * End claims v2 8397 */ 8398 8399 struct spdk_bdev * 8400 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8401 { 8402 assert(desc != NULL); 8403 return desc->bdev; 8404 } 8405 8406 int 8407 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8408 { 8409 struct spdk_bdev *bdev, *tmp; 8410 struct spdk_bdev_desc *desc; 8411 int rc = 0; 8412 8413 assert(fn != NULL); 8414 8415 spdk_spin_lock(&g_bdev_mgr.spinlock); 8416 bdev = spdk_bdev_first(); 8417 while (bdev != NULL) { 8418 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8419 if (rc != 0) { 8420 break; 8421 } 8422 rc = bdev_open(bdev, false, desc); 8423 if (rc != 0) { 8424 bdev_desc_free(desc); 8425 if (rc == -ENODEV) { 8426 /* Ignore the error and move to the next bdev. */ 8427 rc = 0; 8428 bdev = spdk_bdev_next(bdev); 8429 continue; 8430 } 8431 break; 8432 } 8433 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8434 8435 rc = fn(ctx, bdev); 8436 8437 spdk_spin_lock(&g_bdev_mgr.spinlock); 8438 tmp = spdk_bdev_next(bdev); 8439 bdev_close(bdev, desc); 8440 if (rc != 0) { 8441 break; 8442 } 8443 bdev = tmp; 8444 } 8445 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8446 8447 return rc; 8448 } 8449 8450 int 8451 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8452 { 8453 struct spdk_bdev *bdev, *tmp; 8454 struct spdk_bdev_desc *desc; 8455 int rc = 0; 8456 8457 assert(fn != NULL); 8458 8459 spdk_spin_lock(&g_bdev_mgr.spinlock); 8460 bdev = spdk_bdev_first_leaf(); 8461 while (bdev != NULL) { 8462 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8463 if (rc != 0) { 8464 break; 8465 } 8466 rc = bdev_open(bdev, false, desc); 8467 if (rc != 0) { 8468 bdev_desc_free(desc); 8469 if (rc == -ENODEV) { 8470 /* Ignore the error and move to the next bdev. 
*/ 8471 rc = 0; 8472 bdev = spdk_bdev_next_leaf(bdev); 8473 continue; 8474 } 8475 break; 8476 } 8477 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8478 8479 rc = fn(ctx, bdev); 8480 8481 spdk_spin_lock(&g_bdev_mgr.spinlock); 8482 tmp = spdk_bdev_next_leaf(bdev); 8483 bdev_close(bdev, desc); 8484 if (rc != 0) { 8485 break; 8486 } 8487 bdev = tmp; 8488 } 8489 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8490 8491 return rc; 8492 } 8493 8494 void 8495 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8496 { 8497 struct iovec *iovs; 8498 int iovcnt; 8499 8500 if (bdev_io == NULL) { 8501 return; 8502 } 8503 8504 switch (bdev_io->type) { 8505 case SPDK_BDEV_IO_TYPE_READ: 8506 case SPDK_BDEV_IO_TYPE_WRITE: 8507 case SPDK_BDEV_IO_TYPE_ZCOPY: 8508 iovs = bdev_io->u.bdev.iovs; 8509 iovcnt = bdev_io->u.bdev.iovcnt; 8510 break; 8511 default: 8512 iovs = NULL; 8513 iovcnt = 0; 8514 break; 8515 } 8516 8517 if (iovp) { 8518 *iovp = iovs; 8519 } 8520 if (iovcntp) { 8521 *iovcntp = iovcnt; 8522 } 8523 } 8524 8525 void * 8526 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8527 { 8528 if (bdev_io == NULL) { 8529 return NULL; 8530 } 8531 8532 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8533 return NULL; 8534 } 8535 8536 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8537 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8538 return bdev_io->u.bdev.md_buf; 8539 } 8540 8541 return NULL; 8542 } 8543 8544 void * 8545 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 8546 { 8547 if (bdev_io == NULL) { 8548 assert(false); 8549 return NULL; 8550 } 8551 8552 return bdev_io->internal.caller_ctx; 8553 } 8554 8555 void 8556 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 8557 { 8558 8559 if (spdk_bdev_module_list_find(bdev_module->name)) { 8560 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 8561 assert(false); 8562 } 8563 8564 spdk_spin_init(&bdev_module->internal.spinlock); 8565 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 8566 8567 /* 8568 * Modules with examine callbacks must be initialized first, so they are 8569 * ready to handle examine callbacks from later modules that will 8570 * register physical bdevs. 
8571 */ 8572 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8573 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8574 } else { 8575 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8576 } 8577 } 8578 8579 struct spdk_bdev_module * 8580 spdk_bdev_module_list_find(const char *name) 8581 { 8582 struct spdk_bdev_module *bdev_module; 8583 8584 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8585 if (strcmp(name, bdev_module->name) == 0) { 8586 break; 8587 } 8588 } 8589 8590 return bdev_module; 8591 } 8592 8593 static int 8594 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 8595 { 8596 uint64_t num_blocks; 8597 void *md_buf = NULL; 8598 8599 num_blocks = bdev_io->u.bdev.num_blocks; 8600 8601 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8602 md_buf = (char *)g_bdev_mgr.zero_buffer + 8603 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8604 } 8605 8606 return bdev_write_blocks_with_md(bdev_io->internal.desc, 8607 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8608 g_bdev_mgr.zero_buffer, md_buf, 8609 bdev_io->u.bdev.offset_blocks, num_blocks, 8610 bdev_write_zero_buffer_done, bdev_io); 8611 } 8612 8613 static void 8614 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8615 { 8616 struct spdk_bdev_io *parent_io = cb_arg; 8617 8618 spdk_bdev_free_io(bdev_io); 8619 8620 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 8621 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 8622 } 8623 8624 static void 8625 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8626 { 8627 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8628 ctx->bdev->internal.qos_mod_in_progress = false; 8629 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8630 8631 if (ctx->cb_fn) { 8632 ctx->cb_fn(ctx->cb_arg, status); 8633 } 8634 free(ctx); 8635 } 8636 8637 static void 8638 bdev_disable_qos_done(void *cb_arg) 8639 { 8640 struct set_qos_limit_ctx *ctx = cb_arg; 8641 struct spdk_bdev *bdev = ctx->bdev; 8642 struct spdk_bdev_io *bdev_io; 8643 struct spdk_bdev_qos *qos; 8644 8645 spdk_spin_lock(&bdev->internal.spinlock); 8646 qos = bdev->internal.qos; 8647 bdev->internal.qos = NULL; 8648 spdk_spin_unlock(&bdev->internal.spinlock); 8649 8650 while (!TAILQ_EMPTY(&qos->queued)) { 8651 /* Send queued I/O back to their original thread for resubmission. */ 8652 bdev_io = TAILQ_FIRST(&qos->queued); 8653 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8654 8655 if (bdev_io->internal.io_submit_ch) { 8656 /* 8657 * Channel was changed when sending it to the QoS thread - change it back 8658 * before sending it back to the original thread. 
8659 */ 8660 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8661 bdev_io->internal.io_submit_ch = NULL; 8662 } 8663 8664 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8665 _bdev_io_submit, bdev_io); 8666 } 8667 8668 if (qos->thread != NULL) { 8669 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8670 spdk_poller_unregister(&qos->poller); 8671 } 8672 8673 free(qos); 8674 8675 bdev_set_qos_limit_done(ctx, 0); 8676 } 8677 8678 static void 8679 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8680 { 8681 struct set_qos_limit_ctx *ctx = _ctx; 8682 struct spdk_thread *thread; 8683 8684 spdk_spin_lock(&bdev->internal.spinlock); 8685 thread = bdev->internal.qos->thread; 8686 spdk_spin_unlock(&bdev->internal.spinlock); 8687 8688 if (thread != NULL) { 8689 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8690 } else { 8691 bdev_disable_qos_done(ctx); 8692 } 8693 } 8694 8695 static void 8696 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8697 struct spdk_io_channel *ch, void *_ctx) 8698 { 8699 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8700 8701 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8702 8703 spdk_bdev_for_each_channel_continue(i, 0); 8704 } 8705 8706 static void 8707 bdev_update_qos_rate_limit_msg(void *cb_arg) 8708 { 8709 struct set_qos_limit_ctx *ctx = cb_arg; 8710 struct spdk_bdev *bdev = ctx->bdev; 8711 8712 spdk_spin_lock(&bdev->internal.spinlock); 8713 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 8714 spdk_spin_unlock(&bdev->internal.spinlock); 8715 8716 bdev_set_qos_limit_done(ctx, 0); 8717 } 8718 8719 static void 8720 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8721 struct spdk_io_channel *ch, void *_ctx) 8722 { 8723 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8724 8725 spdk_spin_lock(&bdev->internal.spinlock); 8726 bdev_enable_qos(bdev, bdev_ch); 8727 spdk_spin_unlock(&bdev->internal.spinlock); 8728 spdk_bdev_for_each_channel_continue(i, 0); 8729 } 8730 8731 static void 8732 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 8733 { 8734 struct set_qos_limit_ctx *ctx = _ctx; 8735 8736 bdev_set_qos_limit_done(ctx, status); 8737 } 8738 8739 static void 8740 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 8741 { 8742 int i; 8743 8744 assert(bdev->internal.qos != NULL); 8745 8746 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8747 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8748 bdev->internal.qos->rate_limits[i].limit = limits[i]; 8749 8750 if (limits[i] == 0) { 8751 bdev->internal.qos->rate_limits[i].limit = 8752 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 8753 } 8754 } 8755 } 8756 } 8757 8758 void 8759 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 8760 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 8761 { 8762 struct set_qos_limit_ctx *ctx; 8763 uint32_t limit_set_complement; 8764 uint64_t min_limit_per_sec; 8765 int i; 8766 bool disable_rate_limit = true; 8767 8768 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8769 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8770 continue; 8771 } 8772 8773 if (limits[i] > 0) { 8774 disable_rate_limit = false; 8775 } 8776 8777 if (bdev_qos_is_iops_rate_limit(i) == true) { 8778 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 8779 } else { 8780 /* Change from megabyte to byte rate limit */ 8781 limits[i] = limits[i] * 1024 * 1024; 8782 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 
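		/* Byte limits were converted from MB/s to B/s just above; the check that
		 * follows rounds any remainder up so the configured limit ends up a whole
		 * multiple of min_limit_per_sec (the same rounding applies to IOPS limits).
		 */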
8783 } 8784 8785 limit_set_complement = limits[i] % min_limit_per_sec; 8786 if (limit_set_complement) { 8787 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 8788 limits[i], min_limit_per_sec); 8789 limits[i] += min_limit_per_sec - limit_set_complement; 8790 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 8791 } 8792 } 8793 8794 ctx = calloc(1, sizeof(*ctx)); 8795 if (ctx == NULL) { 8796 cb_fn(cb_arg, -ENOMEM); 8797 return; 8798 } 8799 8800 ctx->cb_fn = cb_fn; 8801 ctx->cb_arg = cb_arg; 8802 ctx->bdev = bdev; 8803 8804 spdk_spin_lock(&bdev->internal.spinlock); 8805 if (bdev->internal.qos_mod_in_progress) { 8806 spdk_spin_unlock(&bdev->internal.spinlock); 8807 free(ctx); 8808 cb_fn(cb_arg, -EAGAIN); 8809 return; 8810 } 8811 bdev->internal.qos_mod_in_progress = true; 8812 8813 if (disable_rate_limit == true && bdev->internal.qos) { 8814 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8815 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 8816 (bdev->internal.qos->rate_limits[i].limit > 0 && 8817 bdev->internal.qos->rate_limits[i].limit != 8818 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 8819 disable_rate_limit = false; 8820 break; 8821 } 8822 } 8823 } 8824 8825 if (disable_rate_limit == false) { 8826 if (bdev->internal.qos == NULL) { 8827 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 8828 if (!bdev->internal.qos) { 8829 spdk_spin_unlock(&bdev->internal.spinlock); 8830 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 8831 bdev_set_qos_limit_done(ctx, -ENOMEM); 8832 return; 8833 } 8834 } 8835 8836 if (bdev->internal.qos->thread == NULL) { 8837 /* Enabling */ 8838 bdev_set_qos_rate_limits(bdev, limits); 8839 8840 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 8841 bdev_enable_qos_done); 8842 } else { 8843 /* Updating */ 8844 bdev_set_qos_rate_limits(bdev, limits); 8845 8846 spdk_thread_send_msg(bdev->internal.qos->thread, 8847 bdev_update_qos_rate_limit_msg, ctx); 8848 } 8849 } else { 8850 if (bdev->internal.qos != NULL) { 8851 bdev_set_qos_rate_limits(bdev, limits); 8852 8853 /* Disabling */ 8854 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 8855 bdev_disable_qos_msg_done); 8856 } else { 8857 spdk_spin_unlock(&bdev->internal.spinlock); 8858 bdev_set_qos_limit_done(ctx, 0); 8859 return; 8860 } 8861 } 8862 8863 spdk_spin_unlock(&bdev->internal.spinlock); 8864 } 8865 8866 struct spdk_bdev_histogram_ctx { 8867 spdk_bdev_histogram_status_cb cb_fn; 8868 void *cb_arg; 8869 struct spdk_bdev *bdev; 8870 int status; 8871 }; 8872 8873 static void 8874 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8875 { 8876 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8877 8878 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8879 ctx->bdev->internal.histogram_in_progress = false; 8880 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8881 ctx->cb_fn(ctx->cb_arg, ctx->status); 8882 free(ctx); 8883 } 8884 8885 static void 8886 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8887 struct spdk_io_channel *_ch, void *_ctx) 8888 { 8889 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8890 8891 if (ch->histogram != NULL) { 8892 spdk_histogram_data_free(ch->histogram); 8893 ch->histogram = NULL; 8894 } 8895 spdk_bdev_for_each_channel_continue(i, 0); 8896 } 8897 8898 static void 8899 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8900 { 8901 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8902 8903 if (status 
!= 0) { 8904 ctx->status = status; 8905 ctx->bdev->internal.histogram_enabled = false; 8906 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 8907 bdev_histogram_disable_channel_cb); 8908 } else { 8909 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8910 ctx->bdev->internal.histogram_in_progress = false; 8911 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8912 ctx->cb_fn(ctx->cb_arg, ctx->status); 8913 free(ctx); 8914 } 8915 } 8916 8917 static void 8918 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8919 struct spdk_io_channel *_ch, void *_ctx) 8920 { 8921 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8922 int status = 0; 8923 8924 if (ch->histogram == NULL) { 8925 ch->histogram = spdk_histogram_data_alloc(); 8926 if (ch->histogram == NULL) { 8927 status = -ENOMEM; 8928 } 8929 } 8930 8931 spdk_bdev_for_each_channel_continue(i, status); 8932 } 8933 8934 void 8935 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 8936 void *cb_arg, bool enable) 8937 { 8938 struct spdk_bdev_histogram_ctx *ctx; 8939 8940 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 8941 if (ctx == NULL) { 8942 cb_fn(cb_arg, -ENOMEM); 8943 return; 8944 } 8945 8946 ctx->bdev = bdev; 8947 ctx->status = 0; 8948 ctx->cb_fn = cb_fn; 8949 ctx->cb_arg = cb_arg; 8950 8951 spdk_spin_lock(&bdev->internal.spinlock); 8952 if (bdev->internal.histogram_in_progress) { 8953 spdk_spin_unlock(&bdev->internal.spinlock); 8954 free(ctx); 8955 cb_fn(cb_arg, -EAGAIN); 8956 return; 8957 } 8958 8959 bdev->internal.histogram_in_progress = true; 8960 spdk_spin_unlock(&bdev->internal.spinlock); 8961 8962 bdev->internal.histogram_enabled = enable; 8963 8964 if (enable) { 8965 /* Allocate histogram for each channel */ 8966 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 8967 bdev_histogram_enable_channel_cb); 8968 } else { 8969 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 8970 bdev_histogram_disable_channel_cb); 8971 } 8972 } 8973 8974 struct spdk_bdev_histogram_data_ctx { 8975 spdk_bdev_histogram_data_cb cb_fn; 8976 void *cb_arg; 8977 struct spdk_bdev *bdev; 8978 /** merged histogram data from all channels */ 8979 struct spdk_histogram_data *histogram; 8980 }; 8981 8982 static void 8983 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8984 { 8985 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8986 8987 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 8988 free(ctx); 8989 } 8990 8991 static void 8992 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8993 struct spdk_io_channel *_ch, void *_ctx) 8994 { 8995 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8996 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8997 int status = 0; 8998 8999 if (ch->histogram == NULL) { 9000 status = -EFAULT; 9001 } else { 9002 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9003 } 9004 9005 spdk_bdev_for_each_channel_continue(i, status); 9006 } 9007 9008 void 9009 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9010 spdk_bdev_histogram_data_cb cb_fn, 9011 void *cb_arg) 9012 { 9013 struct spdk_bdev_histogram_data_ctx *ctx; 9014 9015 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9016 if (ctx == NULL) { 9017 cb_fn(cb_arg, -ENOMEM, NULL); 9018 return; 9019 } 9020 9021 ctx->bdev = bdev; 9022 ctx->cb_fn = cb_fn; 9023 ctx->cb_arg = cb_arg; 9024 9025 ctx->histogram = histogram; 
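	/* The caller-provided histogram is the merge target: bdev_histogram_get_channel()
	 * folds each channel's histogram into it before the completion callback is invoked.
	 */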
9026 9027 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9028 bdev_histogram_get_channel_cb); 9029 } 9030 9031 void 9032 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9033 void *cb_arg) 9034 { 9035 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9036 int status = 0; 9037 9038 assert(cb_fn != NULL); 9039 9040 if (bdev_ch->histogram == NULL) { 9041 status = -EFAULT; 9042 } 9043 cb_fn(cb_arg, status, bdev_ch->histogram); 9044 } 9045 9046 size_t 9047 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9048 size_t max_events) 9049 { 9050 struct media_event_entry *entry; 9051 size_t num_events = 0; 9052 9053 for (; num_events < max_events; ++num_events) { 9054 entry = TAILQ_FIRST(&desc->pending_media_events); 9055 if (entry == NULL) { 9056 break; 9057 } 9058 9059 events[num_events] = entry->event; 9060 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9061 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9062 } 9063 9064 return num_events; 9065 } 9066 9067 int 9068 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9069 size_t num_events) 9070 { 9071 struct spdk_bdev_desc *desc; 9072 struct media_event_entry *entry; 9073 size_t event_id; 9074 int rc = 0; 9075 9076 assert(bdev->media_events); 9077 9078 spdk_spin_lock(&bdev->internal.spinlock); 9079 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9080 if (desc->write) { 9081 break; 9082 } 9083 } 9084 9085 if (desc == NULL || desc->media_events_buffer == NULL) { 9086 rc = -ENODEV; 9087 goto out; 9088 } 9089 9090 for (event_id = 0; event_id < num_events; ++event_id) { 9091 entry = TAILQ_FIRST(&desc->free_media_events); 9092 if (entry == NULL) { 9093 break; 9094 } 9095 9096 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9097 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9098 entry->event = events[event_id]; 9099 } 9100 9101 rc = event_id; 9102 out: 9103 spdk_spin_unlock(&bdev->internal.spinlock); 9104 return rc; 9105 } 9106 9107 static void 9108 _media_management_notify(void *arg) 9109 { 9110 struct spdk_bdev_desc *desc = arg; 9111 9112 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9113 } 9114 9115 void 9116 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9117 { 9118 struct spdk_bdev_desc *desc; 9119 9120 spdk_spin_lock(&bdev->internal.spinlock); 9121 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9122 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9123 event_notify(desc, _media_management_notify); 9124 } 9125 } 9126 spdk_spin_unlock(&bdev->internal.spinlock); 9127 } 9128 9129 struct locked_lba_range_ctx { 9130 struct lba_range range; 9131 struct lba_range *current_range; 9132 struct lba_range *owner_range; 9133 struct spdk_poller *poller; 9134 lock_range_cb cb_fn; 9135 void *cb_arg; 9136 }; 9137 9138 static void 9139 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9140 { 9141 struct locked_lba_range_ctx *ctx = _ctx; 9142 9143 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9144 free(ctx); 9145 } 9146 9147 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9148 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9149 9150 static void 9151 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9152 { 9153 struct locked_lba_range_ctx *ctx = _ctx; 9154 9155 if (status == -ENOMEM) { 9156 /* One of the channels could not allocate a 
range object. 9157 * So we have to go back and clean up any ranges that were 9158 * allocated successfully before we return error status to 9159 * the caller. We can reuse the unlock function to do that 9160 * clean up. 9161 */ 9162 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9163 bdev_lock_error_cleanup_cb); 9164 return; 9165 } 9166 9167 /* All channels have locked this range and no I/O overlapping the range 9168 * are outstanding! Set the owner_ch for the range object for the 9169 * locking channel, so that this channel will know that it is allowed 9170 * to write to this range. 9171 */ 9172 if (ctx->owner_range != NULL) { 9173 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9174 } 9175 9176 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9177 9178 /* Don't free the ctx here. Its range is in the bdev's global list of 9179 * locked ranges still, and will be removed and freed when this range 9180 * is later unlocked. 9181 */ 9182 } 9183 9184 static int 9185 bdev_lock_lba_range_check_io(void *_i) 9186 { 9187 struct spdk_bdev_channel_iter *i = _i; 9188 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9189 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9190 struct locked_lba_range_ctx *ctx = i->ctx; 9191 struct lba_range *range = ctx->current_range; 9192 struct spdk_bdev_io *bdev_io; 9193 9194 spdk_poller_unregister(&ctx->poller); 9195 9196 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9197 * range. But we need to wait until any outstanding IO overlapping with this range 9198 * are completed. 9199 */ 9200 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9201 if (bdev_io_range_is_locked(bdev_io, range)) { 9202 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9203 return SPDK_POLLER_BUSY; 9204 } 9205 } 9206 9207 spdk_bdev_for_each_channel_continue(i, 0); 9208 return SPDK_POLLER_BUSY; 9209 } 9210 9211 static void 9212 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9213 struct spdk_io_channel *_ch, void *_ctx) 9214 { 9215 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9216 struct locked_lba_range_ctx *ctx = _ctx; 9217 struct lba_range *range; 9218 9219 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9220 if (range->length == ctx->range.length && 9221 range->offset == ctx->range.offset && 9222 range->locked_ctx == ctx->range.locked_ctx) { 9223 /* This range already exists on this channel, so don't add 9224 * it again. This can happen when a new channel is created 9225 * while the for_each_channel operation is in progress. 9226 * Do not check for outstanding I/O in that case, since the 9227 * range was locked before any I/O could be submitted to the 9228 * new channel. 9229 */ 9230 spdk_bdev_for_each_channel_continue(i, 0); 9231 return; 9232 } 9233 } 9234 9235 range = calloc(1, sizeof(*range)); 9236 if (range == NULL) { 9237 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9238 return; 9239 } 9240 9241 range->length = ctx->range.length; 9242 range->offset = ctx->range.offset; 9243 range->locked_ctx = ctx->range.locked_ctx; 9244 ctx->current_range = range; 9245 if (ctx->range.owner_ch == ch) { 9246 /* This is the range object for the channel that will hold 9247 * the lock. Store it in the ctx object so that we can easily 9248 * set its owner_ch after the lock is finally acquired. 
9249 */ 9250 ctx->owner_range = range; 9251 } 9252 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9253 bdev_lock_lba_range_check_io(i); 9254 } 9255 9256 static void 9257 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9258 { 9259 assert(spdk_get_thread() == ctx->range.owner_thread); 9260 assert(ctx->range.owner_ch == NULL || 9261 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 9262 9263 /* We will add a copy of this range to each channel now. */ 9264 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9265 bdev_lock_lba_range_cb); 9266 } 9267 9268 static bool 9269 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9270 { 9271 struct lba_range *r; 9272 9273 TAILQ_FOREACH(r, tailq, tailq) { 9274 if (bdev_lba_range_overlapped(range, r)) { 9275 return true; 9276 } 9277 } 9278 return false; 9279 } 9280 9281 static int 9282 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 9283 uint64_t offset, uint64_t length, 9284 lock_range_cb cb_fn, void *cb_arg) 9285 { 9286 struct locked_lba_range_ctx *ctx; 9287 9288 ctx = calloc(1, sizeof(*ctx)); 9289 if (ctx == NULL) { 9290 return -ENOMEM; 9291 } 9292 9293 ctx->range.offset = offset; 9294 ctx->range.length = length; 9295 ctx->range.owner_thread = spdk_get_thread(); 9296 ctx->range.owner_ch = ch; 9297 ctx->range.locked_ctx = cb_arg; 9298 ctx->range.bdev = bdev; 9299 ctx->cb_fn = cb_fn; 9300 ctx->cb_arg = cb_arg; 9301 9302 spdk_spin_lock(&bdev->internal.spinlock); 9303 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 9304 /* There is an active lock overlapping with this range. 9305 * Put it on the pending list until this range no 9306 * longer overlaps with another. 9307 */ 9308 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 9309 } else { 9310 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 9311 bdev_lock_lba_range_ctx(bdev, ctx); 9312 } 9313 spdk_spin_unlock(&bdev->internal.spinlock); 9314 return 0; 9315 } 9316 9317 static int 9318 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9319 uint64_t offset, uint64_t length, 9320 lock_range_cb cb_fn, void *cb_arg) 9321 { 9322 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9323 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9324 9325 if (cb_arg == NULL) { 9326 SPDK_ERRLOG("cb_arg must not be NULL\n"); 9327 return -EINVAL; 9328 } 9329 9330 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 9331 } 9332 9333 static void 9334 bdev_lock_lba_range_ctx_msg(void *_ctx) 9335 { 9336 struct locked_lba_range_ctx *ctx = _ctx; 9337 9338 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 9339 } 9340 9341 static void 9342 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9343 { 9344 struct locked_lba_range_ctx *ctx = _ctx; 9345 struct locked_lba_range_ctx *pending_ctx; 9346 struct lba_range *range, *tmp; 9347 9348 spdk_spin_lock(&bdev->internal.spinlock); 9349 /* Check if there are any pending locked ranges that overlap with this range 9350 * that was just unlocked. If there are, check that it doesn't overlap with any 9351 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 9352 * the lock process. 
9353  */
9354 	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
9355 		if (bdev_lba_range_overlapped(range, &ctx->range) &&
9356 		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
9357 			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
9358 			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
9359 			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
9360 			spdk_thread_send_msg(pending_ctx->range.owner_thread,
9361 					     bdev_lock_lba_range_ctx_msg, pending_ctx);
9362 		}
9363 	}
9364 	spdk_spin_unlock(&bdev->internal.spinlock);
9365 
9366 	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
9367 	free(ctx);
9368 }
9369 
9370 static void
9371 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9372 				  struct spdk_io_channel *_ch, void *_ctx)
9373 {
9374 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9375 	struct locked_lba_range_ctx *ctx = _ctx;
9376 	TAILQ_HEAD(, spdk_bdev_io) io_locked;
9377 	struct spdk_bdev_io *bdev_io;
9378 	struct lba_range *range;
9379 
9380 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
9381 		if (ctx->range.offset == range->offset &&
9382 		    ctx->range.length == range->length &&
9383 		    ctx->range.locked_ctx == range->locked_ctx) {
9384 			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
9385 			free(range);
9386 			break;
9387 		}
9388 	}
9389 
9390 	/* Note: we should almost always be able to assert that the range specified
9391 	 * was found. But there are some very rare corner cases where a new channel
9392 	 * gets created simultaneously with a range unlock, where this function
9393 	 * would execute on that new channel and wouldn't have the range.
9394 	 * We also use this to clean up range allocations when a later allocation
9395 	 * fails in the locking path.
9396 	 * So we can't actually assert() here.
9397 	 */
9398 
9399 	/* Swap the locked I/Os into a temporary list, and then try to submit them again.
9400 	 * We could hyper-optimize this to only resubmit locked I/Os that overlap
9401 	 * with the range that was just unlocked, but this isn't a performance path so
9402 	 * we go for simplicity here.
9403 	 */
9404 	TAILQ_INIT(&io_locked);
9405 	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
9406 	while (!TAILQ_EMPTY(&io_locked)) {
9407 		bdev_io = TAILQ_FIRST(&io_locked);
9408 		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
9409 		bdev_io_submit(bdev_io);
9410 	}
9411 
9412 	spdk_bdev_for_each_channel_continue(i, 0);
9413 }
9414 
9415 static int
9416 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length,
9417 		       lock_range_cb cb_fn, void *cb_arg)
9418 {
9419 	struct locked_lba_range_ctx *ctx;
9420 	struct lba_range *range;
9421 
9422 	spdk_spin_lock(&bdev->internal.spinlock);
9423 	/* To start the unlock process, we find the range in the bdev's locked_ranges
9424 	 * and remove it. This ensures new channels don't inherit the locked range.
9425 	 * Then we will send a message to each channel to remove the range from its
9426 	 * per-channel list.
9427 */ 9428 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 9429 if (range->offset == offset && range->length == length && 9430 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 9431 break; 9432 } 9433 } 9434 if (range == NULL) { 9435 assert(false); 9436 spdk_spin_unlock(&bdev->internal.spinlock); 9437 return -EINVAL; 9438 } 9439 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 9440 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9441 spdk_spin_unlock(&bdev->internal.spinlock); 9442 9443 ctx->cb_fn = cb_fn; 9444 ctx->cb_arg = cb_arg; 9445 9446 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9447 bdev_unlock_lba_range_cb); 9448 return 0; 9449 } 9450 9451 static int 9452 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9453 uint64_t offset, uint64_t length, 9454 lock_range_cb cb_fn, void *cb_arg) 9455 { 9456 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9457 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9458 struct lba_range *range; 9459 bool range_found = false; 9460 9461 /* Let's make sure the specified channel actually has a lock on 9462 * the specified range. Note that the range must match exactly. 9463 */ 9464 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9465 if (range->offset == offset && range->length == length && 9466 range->owner_ch == ch && range->locked_ctx == cb_arg) { 9467 range_found = true; 9468 break; 9469 } 9470 } 9471 9472 if (!range_found) { 9473 return -EINVAL; 9474 } 9475 9476 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 9477 } 9478 9479 struct bdev_quiesce_ctx { 9480 spdk_bdev_quiesce_cb cb_fn; 9481 void *cb_arg; 9482 }; 9483 9484 static void 9485 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 9486 { 9487 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9488 9489 if (quiesce_ctx->cb_fn != NULL) { 9490 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9491 } 9492 9493 free(quiesce_ctx); 9494 } 9495 9496 static void 9497 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 9498 { 9499 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9500 struct spdk_bdev_module *module = range->bdev->module; 9501 9502 if (status != 0) { 9503 if (quiesce_ctx->cb_fn != NULL) { 9504 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9505 } 9506 free(quiesce_ctx); 9507 return; 9508 } 9509 9510 spdk_spin_lock(&module->internal.spinlock); 9511 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 9512 spdk_spin_unlock(&module->internal.spinlock); 9513 9514 if (quiesce_ctx->cb_fn != NULL) { 9515 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9516 quiesce_ctx->cb_fn = NULL; 9517 quiesce_ctx->cb_arg = NULL; 9518 } 9519 /* quiesce_ctx will be freed on unquiesce */ 9520 } 9521 9522 static int 9523 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9524 uint64_t offset, uint64_t length, 9525 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 9526 bool unquiesce) 9527 { 9528 struct bdev_quiesce_ctx *quiesce_ctx; 9529 int rc; 9530 9531 if (module != bdev->module) { 9532 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 9533 return -EINVAL; 9534 } 9535 9536 if (!bdev_io_valid_blocks(bdev, offset, length)) { 9537 return -EINVAL; 9538 } 9539 9540 if (unquiesce) { 9541 struct lba_range *range; 9542 9543 /* Make sure the specified range is actually quiesced in the specified module and 9544 * then remove it from the list. Note that the range must match exactly. 
9545 */ 9546 spdk_spin_lock(&module->internal.spinlock); 9547 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 9548 if (range->bdev == bdev && range->offset == offset && range->length == length) { 9549 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 9550 break; 9551 } 9552 } 9553 spdk_spin_unlock(&module->internal.spinlock); 9554 9555 if (range == NULL) { 9556 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 9557 return -EINVAL; 9558 } 9559 9560 quiesce_ctx = range->locked_ctx; 9561 quiesce_ctx->cb_fn = cb_fn; 9562 quiesce_ctx->cb_arg = cb_arg; 9563 9564 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 9565 } else { 9566 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 9567 if (quiesce_ctx == NULL) { 9568 return -ENOMEM; 9569 } 9570 9571 quiesce_ctx->cb_fn = cb_fn; 9572 quiesce_ctx->cb_arg = cb_arg; 9573 9574 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 9575 if (rc != 0) { 9576 free(quiesce_ctx); 9577 } 9578 } 9579 9580 return rc; 9581 } 9582 9583 int 9584 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9585 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9586 { 9587 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 9588 } 9589 9590 int 9591 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9592 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9593 { 9594 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 9595 } 9596 9597 int 9598 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9599 uint64_t offset, uint64_t length, 9600 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9601 { 9602 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 9603 } 9604 9605 int 9606 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9607 uint64_t offset, uint64_t length, 9608 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9609 { 9610 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 9611 } 9612 9613 int 9614 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 9615 int array_size) 9616 { 9617 if (!bdev) { 9618 return -EINVAL; 9619 } 9620 9621 if (bdev->fn_table->get_memory_domains) { 9622 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 9623 } 9624 9625 return 0; 9626 } 9627 9628 struct spdk_bdev_for_each_io_ctx { 9629 void *ctx; 9630 spdk_bdev_io_fn fn; 9631 spdk_bdev_for_each_io_cb cb; 9632 }; 9633 9634 static void 9635 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9636 struct spdk_io_channel *io_ch, void *_ctx) 9637 { 9638 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 9639 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 9640 struct spdk_bdev_io *bdev_io; 9641 int rc = 0; 9642 9643 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 9644 rc = ctx->fn(ctx->ctx, bdev_io); 9645 if (rc != 0) { 9646 break; 9647 } 9648 } 9649 9650 spdk_bdev_for_each_channel_continue(i, rc); 9651 } 9652 9653 static void 9654 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 9655 { 9656 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 9657 9658 ctx->cb(ctx->ctx, status); 9659 9660 free(ctx); 9661 } 9662 9663 void 9664 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 9665 spdk_bdev_for_each_io_cb cb) 9666 { 9667 
struct spdk_bdev_for_each_io_ctx *ctx; 9668 9669 assert(fn != NULL && cb != NULL); 9670 9671 ctx = calloc(1, sizeof(*ctx)); 9672 if (ctx == NULL) { 9673 SPDK_ERRLOG("Failed to allocate context.\n"); 9674 cb(_ctx, -ENOMEM); 9675 return; 9676 } 9677 9678 ctx->ctx = _ctx; 9679 ctx->fn = fn; 9680 ctx->cb = cb; 9681 9682 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 9683 bdev_for_each_io_done); 9684 } 9685 9686 void 9687 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 9688 { 9689 spdk_for_each_channel_continue(iter->i, status); 9690 } 9691 9692 static struct spdk_bdev * 9693 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 9694 { 9695 void *io_device = spdk_io_channel_iter_get_io_device(i); 9696 9697 return __bdev_from_io_dev(io_device); 9698 } 9699 9700 static void 9701 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 9702 { 9703 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 9704 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 9705 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 9706 9707 iter->i = i; 9708 iter->fn(iter, bdev, ch, iter->ctx); 9709 } 9710 9711 static void 9712 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 9713 { 9714 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 9715 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 9716 9717 iter->i = i; 9718 iter->cpl(bdev, iter->ctx, status); 9719 9720 free(iter); 9721 } 9722 9723 void 9724 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 9725 void *ctx, spdk_bdev_for_each_channel_done cpl) 9726 { 9727 struct spdk_bdev_channel_iter *iter; 9728 9729 assert(bdev != NULL && fn != NULL && ctx != NULL); 9730 9731 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 9732 if (iter == NULL) { 9733 SPDK_ERRLOG("Unable to allocate iterator\n"); 9734 assert(false); 9735 return; 9736 } 9737 9738 iter->fn = fn; 9739 iter->cpl = cpl; 9740 iter->ctx = ctx; 9741 9742 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 9743 iter, bdev_each_channel_cpl); 9744 } 9745 9746 static void 9747 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9748 { 9749 struct spdk_bdev_io *parent_io = cb_arg; 9750 9751 spdk_bdev_free_io(bdev_io); 9752 9753 /* Check return status of write */ 9754 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9755 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9756 } 9757 9758 static void 9759 bdev_copy_do_write(void *_bdev_io) 9760 { 9761 struct spdk_bdev_io *bdev_io = _bdev_io; 9762 int rc; 9763 9764 /* Write blocks */ 9765 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 9766 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9767 bdev_io->u.bdev.iovs[0].iov_base, 9768 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 9769 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 9770 9771 if (rc == -ENOMEM) { 9772 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 9773 } else if (rc != 0) { 9774 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9775 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 9776 } 9777 } 9778 9779 static void 9780 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9781 { 9782 struct spdk_bdev_io *parent_io = cb_arg; 9783 9784 spdk_bdev_free_io(bdev_io); 9785 9786 /* Check return status of read */ 9787 if (!success) { 9788 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9789 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 9790 return; 9791 } 9792 9793 /* Do write */ 9794 bdev_copy_do_write(parent_io); 9795 } 9796 9797 static void 9798 bdev_copy_do_read(void *_bdev_io) 9799 { 9800 struct spdk_bdev_io *bdev_io = _bdev_io; 9801 int rc; 9802 9803 /* Read blocks */ 9804 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 9805 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9806 bdev_io->u.bdev.iovs[0].iov_base, 9807 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 9808 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 9809 9810 if (rc == -ENOMEM) { 9811 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 9812 } else if (rc != 0) { 9813 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9814 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 9815 } 9816 } 9817 9818 static void 9819 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 9820 { 9821 if (!success) { 9822 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9823 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 9824 return; 9825 } 9826 9827 bdev_copy_do_read(bdev_io); 9828 } 9829 9830 int 9831 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 9832 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 9833 spdk_bdev_io_completion_cb cb, void *cb_arg) 9834 { 9835 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9836 struct spdk_bdev_io *bdev_io; 9837 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 9838 9839 if (!desc->write) { 9840 return -EBADF; 9841 } 9842 9843 if (num_blocks == 0) { 9844 SPDK_ERRLOG("Can't copy 0 blocks\n"); 9845 return -EINVAL; 9846 } 9847 9848 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 9849 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 9850 SPDK_DEBUGLOG(bdev, 9851 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 9852 dst_offset_blocks, src_offset_blocks, num_blocks); 9853 return -EINVAL; 9854 } 9855 9856 bdev_io = bdev_channel_get_io(channel); 9857 if (!bdev_io) { 9858 return -ENOMEM; 9859 } 9860 9861 bdev_io->internal.ch = channel; 9862 bdev_io->internal.desc = desc; 9863 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 9864 9865 
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 9866 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 9867 bdev_io->u.bdev.num_blocks = num_blocks; 9868 bdev_io->u.bdev.memory_domain = NULL; 9869 bdev_io->u.bdev.memory_domain_ctx = NULL; 9870 bdev_io->u.bdev.iovs = NULL; 9871 bdev_io->u.bdev.iovcnt = 0; 9872 bdev_io->u.bdev.md_buf = NULL; 9873 bdev_io->u.bdev.accel_sequence = NULL; 9874 bdev_io_init(bdev_io, bdev, cb_arg, cb); 9875 9876 if (dst_offset_blocks == src_offset_blocks) { 9877 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 9878 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 9879 9880 return 0; 9881 } 9882 9883 9884 /* If the copy size is large and should be split, use the generic split logic 9885 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 9886 * 9887 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 9888 * emulate it using regular read and write requests otherwise. 9889 */ 9890 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 9891 bdev_io->internal.split) { 9892 bdev_io_submit(bdev_io); 9893 return 0; 9894 } 9895 9896 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 9897 9898 return 0; 9899 } 9900 9901 SPDK_LOG_REGISTER_COMPONENT(bdev) 9902 9903 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 9904 { 9905 struct spdk_trace_tpoint_opts opts[] = { 9906 { 9907 "BDEV_IO_START", TRACE_BDEV_IO_START, 9908 OWNER_BDEV, OBJECT_BDEV_IO, 1, 9909 { 9910 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9911 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 9912 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9913 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9914 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 9915 } 9916 }, 9917 { 9918 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 9919 OWNER_BDEV, OBJECT_BDEV_IO, 0, 9920 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9921 }, 9922 { 9923 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 9924 OWNER_BDEV, OBJECT_NONE, 1, 9925 { 9926 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 9927 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 9928 } 9929 }, 9930 { 9931 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 9932 OWNER_BDEV, OBJECT_NONE, 0, 9933 { 9934 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 9935 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 9936 } 9937 }, 9938 }; 9939 9940 9941 spdk_trace_register_owner(OWNER_BDEV, 'b'); 9942 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 9943 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 9944 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 9945 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 9946 } 9947
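
/* Usage sketch (illustrative only, not part of this file's implementation):
 * how the module that registered a bdev might use spdk_bdev_quiesce() and
 * spdk_bdev_unquiesce(), defined above, to pause and resume I/O around a
 * maintenance step. spdk_bdev_quiesce() only succeeds when the module argument
 * matches bdev->module. The names my_bdev_module, my_update_metadata() and the
 * callbacks below are hypothetical placeholders, not part of the SPDK API.
 *
 *	static void
 *	my_unquiesce_done(void *cb_arg, int status)
 *	{
 *		SPDK_NOTICELOG("I/O resumed, status %d\n", status);
 *	}
 *
 *	static void
 *	my_quiesce_done(void *cb_arg, int status)
 *	{
 *		struct spdk_bdev *bdev = cb_arg;
 *
 *		if (status != 0) {
 *			SPDK_ERRLOG("Failed to quiesce: %d\n", status);
 *			return;
 *		}
 *
 *		my_update_metadata(bdev);
 *
 *		spdk_bdev_unquiesce(bdev, &my_bdev_module, my_unquiesce_done, NULL);
 *	}
 *
 *	static int
 *	my_start_maintenance(struct spdk_bdev *bdev)
 *	{
 *		return spdk_bdev_quiesce(bdev, &my_bdev_module, my_quiesce_done, bdev);
 *	}
 *
 * spdk_bdev_quiesce_range()/spdk_bdev_unquiesce_range() work the same way but
 * pause I/O only for the given block range instead of the whole bdev.
 */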
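
/* Usage sketch (illustrative only, not part of this file's implementation):
 * one way a caller holding a descriptor opened for writing and an I/O channel
 * might issue spdk_bdev_copy_blocks(), defined above. If the backing bdev does
 * not support SPDK_BDEV_IO_TYPE_COPY, the request is emulated with the
 * read/write path shown earlier in this file. The completion callback must
 * release the I/O with spdk_bdev_free_io(). my_copy_done(), my_copy_example()
 * and the block offsets/counts are hypothetical.
 *
 *	static void
 *	my_copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		spdk_bdev_free_io(bdev_io);
 *
 *		if (!success) {
 *			SPDK_ERRLOG("copy failed\n");
 *		}
 *	}
 *
 *	static int
 *	my_copy_example(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
 *	{
 *		uint64_t dst_offset_blocks = 1024;
 *		uint64_t src_offset_blocks = 0;
 *		uint64_t num_blocks = 16;
 *
 *		return spdk_bdev_copy_blocks(desc, ch, dst_offset_blocks,
 *					     src_offset_blocks, num_blocks,
 *					     my_copy_done, NULL);
 *	}
 *
 * A return value of -ENOMEM means no spdk_bdev_io was available; callers
 * typically retry via spdk_bdev_queue_io_wait().
 */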