1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/accel.h" 12 #include "spdk/config.h" 13 #include "spdk/env.h" 14 #include "spdk/thread.h" 15 #include "spdk/likely.h" 16 #include "spdk/queue.h" 17 #include "spdk/nvme_spec.h" 18 #include "spdk/scsi_spec.h" 19 #include "spdk/notify.h" 20 #include "spdk/util.h" 21 #include "spdk/trace.h" 22 #include "spdk/dma.h" 23 24 #include "spdk/bdev_module.h" 25 #include "spdk/log.h" 26 #include "spdk/string.h" 27 28 #include "bdev_internal.h" 29 #include "spdk_internal/trace_defs.h" 30 #include "spdk_internal/assert.h" 31 32 #ifdef SPDK_CONFIG_VTUNE 33 #include "ittnotify.h" 34 #include "ittnotify_types.h" 35 int __itt_init_ittlib(const char *, __itt_group_id); 36 #endif 37 38 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 39 #define SPDK_BDEV_IO_CACHE_SIZE 256 40 #define SPDK_BDEV_AUTO_EXAMINE true 41 #define BUF_SMALL_CACHE_SIZE 128 42 #define BUF_LARGE_CACHE_SIZE 16 43 #define NOMEM_THRESHOLD_COUNT 8 44 45 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 46 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 47 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 48 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 49 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 50 #define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC (UINT64_MAX / (1024 * 1024)) 51 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 52 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 53 54 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 55 * when splitting into children requests at a time. 56 */ 57 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 58 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 59 60 /* The maximum number of children requests for a COPY command 61 * when splitting into children requests at a time. 
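 * For example, a copy that must be split into 64 child requests would be issued in
 * batches of at most 8 outstanding children at a time (numbers chosen only to
 * illustrate the constant defined below).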
62 */ 63 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 64 65 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 66 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 67 #ifdef DEBUG 68 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 69 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 70 #else 71 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 72 #endif 73 74 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 75 const char *detail, struct spdk_bdev *bdev); 76 77 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 78 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 79 }; 80 81 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 82 83 RB_HEAD(bdev_name_tree, spdk_bdev_name); 84 85 static int 86 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 87 { 88 return strcmp(name1->name, name2->name); 89 } 90 91 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 92 93 struct spdk_bdev_mgr { 94 struct spdk_mempool *bdev_io_pool; 95 96 void *zero_buffer; 97 98 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 99 100 struct spdk_bdev_list bdevs; 101 struct bdev_name_tree bdev_names; 102 103 bool init_complete; 104 bool module_init_complete; 105 106 struct spdk_spinlock spinlock; 107 108 TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens; 109 110 #ifdef SPDK_CONFIG_VTUNE 111 __itt_domain *domain; 112 #endif 113 }; 114 115 static struct spdk_bdev_mgr g_bdev_mgr = { 116 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 117 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 118 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 119 .init_complete = false, 120 .module_init_complete = false, 121 .async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens), 122 }; 123 124 static void 125 __attribute__((constructor)) 126 _bdev_init(void) 127 { 128 spdk_spin_init(&g_bdev_mgr.spinlock); 129 } 130 131 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status); 132 133 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 134 135 struct lba_range { 136 struct spdk_bdev *bdev; 137 uint64_t offset; 138 uint64_t length; 139 bool quiesce; 140 void *locked_ctx; 141 struct spdk_thread *owner_thread; 142 struct spdk_bdev_channel *owner_ch; 143 TAILQ_ENTRY(lba_range) tailq; 144 TAILQ_ENTRY(lba_range) tailq_module; 145 }; 146 147 static struct spdk_bdev_opts g_bdev_opts = { 148 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 149 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 150 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 151 .iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE, 152 .iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE, 153 }; 154 155 static spdk_bdev_init_cb g_init_cb_fn = NULL; 156 static void *g_init_cb_arg = NULL; 157 158 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 159 static void *g_fini_cb_arg = NULL; 160 static struct spdk_thread *g_fini_thread = NULL; 161 162 struct spdk_bdev_qos_limit { 163 /** IOs or bytes allowed per second (i.e., 1s). */ 164 uint64_t limit; 165 166 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 167 * For remaining bytes, allowed to run negative if an I/O is submitted when 168 * some bytes are remaining, but the I/O is bigger than that amount. The 169 * excess will be deducted from the next timeslice. 170 */ 171 int64_t remaining_this_timeslice; 172 173 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). 
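 * As an illustration based on the defaults above: a limit of SPDK_BDEV_QOS_MIN_IOS_PER_SEC
 * (1000 IO/s) spread over SPDK_BDEV_QOS_TIMESLICE_IN_USEC (1000 us) timeslices works out
 * to roughly one I/O per timeslice, which is consistent with SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE.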
 */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO.
	 * If the I/O is allowed to pass, the quota will be reduced correspondingly.
	 */
	bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to rewind the quota when the I/O was allowed to be sent by this
	 * limit but was queued due to one of the subsequent limits.
	 */
	void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one per supported limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here. This makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	struct spdk_poller *nomem_poller;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Accel channel */
	struct spdk_io_channel *accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
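	 * Note: this per-channel count is incremented and decremented together with
	 * shared_resource->io_outstanding (see bdev_io_increment_outstanding()); only the
	 * shared count is consulted for the NOMEM retry threshold.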
285 */ 286 uint64_t io_outstanding; 287 288 /* 289 * List of all submitted I/Os including I/O that are generated via splitting. 290 */ 291 bdev_io_tailq_t io_submitted; 292 293 /* 294 * List of spdk_bdev_io that are currently queued because they write to a locked 295 * LBA range. 296 */ 297 bdev_io_tailq_t io_locked; 298 299 /* List of I/Os with accel sequence being currently executed */ 300 bdev_io_tailq_t io_accel_exec; 301 302 /* List of I/Os doing memory domain pull/push */ 303 bdev_io_tailq_t io_memory_domain; 304 305 uint32_t flags; 306 307 struct spdk_histogram_data *histogram; 308 309 #ifdef SPDK_CONFIG_VTUNE 310 uint64_t start_tsc; 311 uint64_t interval_tsc; 312 __itt_string_handle *handle; 313 struct spdk_bdev_io_stat *prev_stat; 314 #endif 315 316 bdev_io_tailq_t queued_resets; 317 318 lba_range_tailq_t locked_ranges; 319 320 /** List of I/Os queued by QoS. */ 321 bdev_io_tailq_t qos_queued_io; 322 }; 323 324 struct media_event_entry { 325 struct spdk_bdev_media_event event; 326 TAILQ_ENTRY(media_event_entry) tailq; 327 }; 328 329 #define MEDIA_EVENT_POOL_SIZE 64 330 331 struct spdk_bdev_desc { 332 struct spdk_bdev *bdev; 333 struct spdk_thread *thread; 334 struct { 335 spdk_bdev_event_cb_t event_fn; 336 void *ctx; 337 } callback; 338 bool closed; 339 bool write; 340 bool memory_domains_supported; 341 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 342 struct spdk_spinlock spinlock; 343 uint32_t refs; 344 TAILQ_HEAD(, media_event_entry) pending_media_events; 345 TAILQ_HEAD(, media_event_entry) free_media_events; 346 struct media_event_entry *media_events_buffer; 347 TAILQ_ENTRY(spdk_bdev_desc) link; 348 349 uint64_t timeout_in_sec; 350 spdk_bdev_io_timeout_cb cb_fn; 351 void *cb_arg; 352 struct spdk_poller *io_timeout_poller; 353 struct spdk_bdev_module_claim *claim; 354 }; 355 356 struct spdk_bdev_iostat_ctx { 357 struct spdk_bdev_io_stat *stat; 358 spdk_bdev_get_device_stat_cb cb; 359 void *cb_arg; 360 }; 361 362 struct set_qos_limit_ctx { 363 void (*cb_fn)(void *cb_arg, int status); 364 void *cb_arg; 365 struct spdk_bdev *bdev; 366 }; 367 368 struct spdk_bdev_channel_iter { 369 spdk_bdev_for_each_channel_msg fn; 370 spdk_bdev_for_each_channel_done cpl; 371 struct spdk_io_channel_iter *i; 372 void *ctx; 373 }; 374 375 struct spdk_bdev_io_error_stat { 376 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 377 }; 378 379 enum bdev_io_retry_state { 380 BDEV_IO_RETRY_STATE_INVALID, 381 BDEV_IO_RETRY_STATE_PULL, 382 BDEV_IO_RETRY_STATE_PULL_MD, 383 BDEV_IO_RETRY_STATE_SUBMIT, 384 BDEV_IO_RETRY_STATE_PUSH, 385 BDEV_IO_RETRY_STATE_PUSH_MD, 386 }; 387 388 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 389 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 390 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 391 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 392 393 static inline void bdev_io_complete(void *ctx); 394 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 395 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io); 396 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io); 397 398 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 399 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io); 400 401 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 402 struct spdk_io_channel *ch, void 
*_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

#define bdev_get_ext_io_opt(opts, field, defval) \
	(((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \
	  sizeof((opts)->field) <= (opts)->size) ? (opts)->field : (defval))

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	/* Do not remove this statement. Always update it when adding a new field,
	 * and do not forget to add the corresponding SET_FIELD statement for that field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
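	 * As a sketch of the arithmetic below: with 4 reactor threads and the default
	 * bdev_io_cache_size of 256, the pool must be able to back at least
	 * (4 + 1) * 256 = 1280 cached bdev_ios (the thread count is chosen for illustration).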
491 */ 492 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 493 if (opts->bdev_io_pool_size < min_pool_size) { 494 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 495 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 496 spdk_thread_get_count()); 497 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 498 return -1; 499 } 500 501 #define SET_FIELD(field) \ 502 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 503 g_bdev_opts.field = opts->field; \ 504 } \ 505 506 SET_FIELD(bdev_io_pool_size); 507 SET_FIELD(bdev_io_cache_size); 508 SET_FIELD(bdev_auto_examine); 509 SET_FIELD(iobuf_small_cache_size); 510 SET_FIELD(iobuf_large_cache_size); 511 512 g_bdev_opts.opts_size = opts->opts_size; 513 514 #undef SET_FIELD 515 516 return 0; 517 } 518 519 static struct spdk_bdev * 520 bdev_get_by_name(const char *bdev_name) 521 { 522 struct spdk_bdev_name find; 523 struct spdk_bdev_name *res; 524 525 find.name = (char *)bdev_name; 526 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 527 if (res != NULL) { 528 return res->bdev; 529 } 530 531 return NULL; 532 } 533 534 struct spdk_bdev * 535 spdk_bdev_get_by_name(const char *bdev_name) 536 { 537 struct spdk_bdev *bdev; 538 539 spdk_spin_lock(&g_bdev_mgr.spinlock); 540 bdev = bdev_get_by_name(bdev_name); 541 spdk_spin_unlock(&g_bdev_mgr.spinlock); 542 543 return bdev; 544 } 545 546 struct bdev_io_status_string { 547 enum spdk_bdev_io_status status; 548 const char *str; 549 }; 550 551 static const struct bdev_io_status_string bdev_io_status_strings[] = { 552 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 553 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 554 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 555 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 556 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 557 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 558 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 559 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 560 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 561 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 562 }; 563 564 static const char * 565 bdev_io_status_get_string(enum spdk_bdev_io_status status) 566 { 567 uint32_t i; 568 569 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 570 if (bdev_io_status_strings[i].status == status) { 571 return bdev_io_status_strings[i].str; 572 } 573 } 574 575 return "reserved"; 576 } 577 578 struct spdk_bdev_wait_for_examine_ctx { 579 struct spdk_poller *poller; 580 spdk_bdev_wait_for_examine_cb cb_fn; 581 void *cb_arg; 582 }; 583 584 static bool bdev_module_all_actions_completed(void); 585 586 static int 587 bdev_wait_for_examine_cb(void *arg) 588 { 589 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 590 591 if (!bdev_module_all_actions_completed()) { 592 return SPDK_POLLER_IDLE; 593 } 594 595 spdk_poller_unregister(&ctx->poller); 596 ctx->cb_fn(ctx->cb_arg); 597 free(ctx); 598 599 return SPDK_POLLER_BUSY; 600 } 601 602 int 603 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 604 { 605 struct spdk_bdev_wait_for_examine_ctx *ctx; 606 607 ctx = calloc(1, sizeof(*ctx)); 608 if (ctx == NULL) { 609 return -ENOMEM; 610 } 611 ctx->cb_fn = cb_fn; 612 ctx->cb_arg = cb_arg; 613 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 614 615 return 0; 616 } 617 618 struct spdk_bdev_examine_item { 619 char *name; 620 
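	/* The name above is matched against both bdev->name and any registered aliases;
	 * see bdev_examine_allowlist_check() and bdev_in_examine_allowlist(). */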
TAILQ_ENTRY(spdk_bdev_examine_item) link; 621 }; 622 623 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 624 625 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 626 g_bdev_examine_allowlist); 627 628 static inline bool 629 bdev_examine_allowlist_check(const char *name) 630 { 631 struct spdk_bdev_examine_item *item; 632 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 633 if (strcmp(name, item->name) == 0) { 634 return true; 635 } 636 } 637 return false; 638 } 639 640 static inline void 641 bdev_examine_allowlist_free(void) 642 { 643 struct spdk_bdev_examine_item *item; 644 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 645 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 646 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 647 free(item->name); 648 free(item); 649 } 650 } 651 652 static inline bool 653 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 654 { 655 struct spdk_bdev_alias *tmp; 656 if (bdev_examine_allowlist_check(bdev->name)) { 657 return true; 658 } 659 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 660 if (bdev_examine_allowlist_check(tmp->alias.name)) { 661 return true; 662 } 663 } 664 return false; 665 } 666 667 static inline bool 668 bdev_ok_to_examine(struct spdk_bdev *bdev) 669 { 670 if (g_bdev_opts.bdev_auto_examine) { 671 return true; 672 } else { 673 return bdev_in_examine_allowlist(bdev); 674 } 675 } 676 677 static void 678 bdev_examine(struct spdk_bdev *bdev) 679 { 680 struct spdk_bdev_module *module; 681 struct spdk_bdev_module_claim *claim, *tmpclaim; 682 uint32_t action; 683 684 if (!bdev_ok_to_examine(bdev)) { 685 return; 686 } 687 688 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 689 if (module->examine_config) { 690 spdk_spin_lock(&module->internal.spinlock); 691 action = module->internal.action_in_progress; 692 module->internal.action_in_progress++; 693 spdk_spin_unlock(&module->internal.spinlock); 694 module->examine_config(bdev); 695 if (action != module->internal.action_in_progress) { 696 SPDK_ERRLOG("examine_config for module %s did not call " 697 "spdk_bdev_module_examine_done()\n", module->name); 698 } 699 } 700 } 701 702 spdk_spin_lock(&bdev->internal.spinlock); 703 704 switch (bdev->internal.claim_type) { 705 case SPDK_BDEV_CLAIM_NONE: 706 /* Examine by all bdev modules */ 707 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 708 if (module->examine_disk) { 709 spdk_spin_lock(&module->internal.spinlock); 710 module->internal.action_in_progress++; 711 spdk_spin_unlock(&module->internal.spinlock); 712 spdk_spin_unlock(&bdev->internal.spinlock); 713 module->examine_disk(bdev); 714 spdk_spin_lock(&bdev->internal.spinlock); 715 } 716 } 717 break; 718 case SPDK_BDEV_CLAIM_EXCL_WRITE: 719 /* Examine by the one bdev module with a v1 claim */ 720 module = bdev->internal.claim.v1.module; 721 if (module->examine_disk) { 722 spdk_spin_lock(&module->internal.spinlock); 723 module->internal.action_in_progress++; 724 spdk_spin_unlock(&module->internal.spinlock); 725 spdk_spin_unlock(&bdev->internal.spinlock); 726 module->examine_disk(bdev); 727 return; 728 } 729 break; 730 default: 731 /* Examine by all bdev modules with a v2 claim */ 732 assert(claim_type_is_v2(bdev->internal.claim_type)); 733 /* 734 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 735 * list, perhaps accessing freed memory. Without protection, this could happen 736 * while the lock is dropped during the examine callback. 
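	 * The examine_in_progress counter incremented below provides that protection: claims
	 * released during examine_disk() are only unlinked and freed once the counter drops
	 * back to zero.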
737 */ 738 bdev->internal.examine_in_progress++; 739 740 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 741 module = claim->module; 742 743 if (module == NULL) { 744 /* This is a vestigial claim, held by examine_count */ 745 continue; 746 } 747 748 if (module->examine_disk == NULL) { 749 continue; 750 } 751 752 spdk_spin_lock(&module->internal.spinlock); 753 module->internal.action_in_progress++; 754 spdk_spin_unlock(&module->internal.spinlock); 755 756 /* Call examine_disk without holding internal.spinlock. */ 757 spdk_spin_unlock(&bdev->internal.spinlock); 758 module->examine_disk(bdev); 759 spdk_spin_lock(&bdev->internal.spinlock); 760 } 761 762 assert(bdev->internal.examine_in_progress > 0); 763 bdev->internal.examine_in_progress--; 764 if (bdev->internal.examine_in_progress == 0) { 765 /* Remove any claims that were released during examine_disk */ 766 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 767 if (claim->desc != NULL) { 768 continue; 769 } 770 771 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 772 free(claim); 773 } 774 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 775 claim_reset(bdev); 776 } 777 } 778 } 779 780 spdk_spin_unlock(&bdev->internal.spinlock); 781 } 782 783 int 784 spdk_bdev_examine(const char *name) 785 { 786 struct spdk_bdev *bdev; 787 struct spdk_bdev_examine_item *item; 788 struct spdk_thread *thread = spdk_get_thread(); 789 790 if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) { 791 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 792 thread ? spdk_thread_get_name(thread) : "null"); 793 return -EINVAL; 794 } 795 796 if (g_bdev_opts.bdev_auto_examine) { 797 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 798 return -EINVAL; 799 } 800 801 if (bdev_examine_allowlist_check(name)) { 802 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 803 return -EEXIST; 804 } 805 806 item = calloc(1, sizeof(*item)); 807 if (!item) { 808 return -ENOMEM; 809 } 810 item->name = strdup(name); 811 if (!item->name) { 812 free(item); 813 return -ENOMEM; 814 } 815 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 816 817 bdev = spdk_bdev_get_by_name(name); 818 if (bdev) { 819 bdev_examine(bdev); 820 } 821 return 0; 822 } 823 824 static inline void 825 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 826 { 827 struct spdk_bdev_examine_item *item; 828 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 829 spdk_json_write_object_begin(w); 830 spdk_json_write_named_string(w, "method", "bdev_examine"); 831 spdk_json_write_named_object_begin(w, "params"); 832 spdk_json_write_named_string(w, "name", item->name); 833 spdk_json_write_object_end(w); 834 spdk_json_write_object_end(w); 835 } 836 } 837 838 struct spdk_bdev * 839 spdk_bdev_first(void) 840 { 841 struct spdk_bdev *bdev; 842 843 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 844 if (bdev) { 845 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 846 } 847 848 return bdev; 849 } 850 851 struct spdk_bdev * 852 spdk_bdev_next(struct spdk_bdev *prev) 853 { 854 struct spdk_bdev *bdev; 855 856 bdev = TAILQ_NEXT(prev, internal.link); 857 if (bdev) { 858 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 859 } 860 861 return bdev; 862 } 863 864 static struct spdk_bdev * 865 _bdev_next_leaf(struct spdk_bdev *bdev) 866 { 867 while (bdev != NULL) { 868 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 869 return bdev; 870 } else { 871 bdev = 
TAILQ_NEXT(bdev, internal.link); 872 } 873 } 874 875 return bdev; 876 } 877 878 struct spdk_bdev * 879 spdk_bdev_first_leaf(void) 880 { 881 struct spdk_bdev *bdev; 882 883 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 884 885 if (bdev) { 886 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 887 } 888 889 return bdev; 890 } 891 892 struct spdk_bdev * 893 spdk_bdev_next_leaf(struct spdk_bdev *prev) 894 { 895 struct spdk_bdev *bdev; 896 897 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 898 899 if (bdev) { 900 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 901 } 902 903 return bdev; 904 } 905 906 static inline bool 907 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 908 { 909 return bdev_io->internal.memory_domain; 910 } 911 912 static inline bool 913 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 914 { 915 return bdev_io->internal.has_accel_sequence; 916 } 917 918 static inline void 919 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource, 920 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 921 { 922 /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io. 923 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth 924 * channels we will instead wait for half to complete. 925 */ 926 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 927 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 928 929 assert(state != BDEV_IO_RETRY_STATE_INVALID); 930 bdev_io->internal.retry_state = state; 931 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 932 } 933 934 static inline void 935 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource, 936 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 937 { 938 /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while 939 * the queue isn't empty, so we don't need to update the nomem_threshold here */ 940 assert(!TAILQ_EMPTY(&shared_resource->nomem_io)); 941 942 assert(state != BDEV_IO_RETRY_STATE_INVALID); 943 bdev_io->internal.retry_state = state; 944 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 945 } 946 947 void 948 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 949 { 950 struct iovec *iovs; 951 952 if (bdev_io->u.bdev.iovs == NULL) { 953 bdev_io->u.bdev.iovs = &bdev_io->iov; 954 bdev_io->u.bdev.iovcnt = 1; 955 } 956 957 iovs = bdev_io->u.bdev.iovs; 958 959 assert(iovs != NULL); 960 assert(bdev_io->u.bdev.iovcnt >= 1); 961 962 iovs[0].iov_base = buf; 963 iovs[0].iov_len = len; 964 } 965 966 void 967 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 968 { 969 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 970 bdev_io->u.bdev.md_buf = md_buf; 971 } 972 973 static bool 974 _is_buf_allocated(const struct iovec *iovs) 975 { 976 if (iovs == NULL) { 977 return false; 978 } 979 980 return iovs[0].iov_base != NULL; 981 } 982 983 static bool 984 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 985 { 986 int i; 987 uintptr_t iov_base; 988 989 if (spdk_likely(alignment == 1)) { 990 return true; 991 } 992 993 for (i = 0; i < iovcnt; i++) { 994 iov_base = (uintptr_t)iovs[i].iov_base; 995 if ((iov_base & (alignment - 1)) != 0) { 996 return false; 997 } 998 } 999 1000 return true; 1001 } 1002 1003 static inline bool 1004 
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 1005 { 1006 if (!bdev_io->internal.accel_sequence) { 1007 return false; 1008 } 1009 1010 /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if 1011 * bdev module didn't support accel sequences */ 1012 return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split; 1013 } 1014 1015 static inline void 1016 bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch, 1017 struct spdk_bdev_shared_resource *shared_resource) 1018 { 1019 bdev_ch->io_outstanding++; 1020 shared_resource->io_outstanding++; 1021 } 1022 1023 static inline void 1024 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1025 struct spdk_bdev_shared_resource *shared_resource) 1026 { 1027 assert(bdev_ch->io_outstanding > 0); 1028 assert(shared_resource->io_outstanding > 0); 1029 bdev_ch->io_outstanding--; 1030 shared_resource->io_outstanding--; 1031 } 1032 1033 static void 1034 bdev_io_submit_sequence_cb(void *ctx, int status) 1035 { 1036 struct spdk_bdev_io *bdev_io = ctx; 1037 1038 bdev_io->u.bdev.accel_sequence = NULL; 1039 bdev_io->internal.accel_sequence = NULL; 1040 1041 if (spdk_unlikely(status != 0)) { 1042 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 1043 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1044 bdev_io_complete_unsubmitted(bdev_io); 1045 return; 1046 } 1047 1048 bdev_io_submit(bdev_io); 1049 } 1050 1051 static void 1052 bdev_io_exec_sequence_cb(void *ctx, int status) 1053 { 1054 struct spdk_bdev_io *bdev_io = ctx; 1055 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1056 1057 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1058 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1059 1060 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1061 bdev_ch_retry_io(ch); 1062 } 1063 1064 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1065 } 1066 1067 static void 1068 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status)) 1069 { 1070 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1071 1072 assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1073 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1074 1075 /* Since the operations are appended during submission, they're in the opposite order than 1076 * how we want to execute them for reads (i.e. we need to execute the most recently added 1077 * operation first), so reverse the sequence before executing it. 
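	 * (For example, if operations A and then B were appended while building the request,
	 * a read must execute B first and then A on the data returned from the device.)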
1078 */ 1079 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1080 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1081 } 1082 1083 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1084 bdev_io_increment_outstanding(ch, ch->shared_resource); 1085 bdev_io->internal.data_transfer_cpl = cb_fn; 1086 1087 spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, 1088 bdev_io_exec_sequence_cb, bdev_io); 1089 } 1090 1091 static void 1092 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1093 { 1094 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1095 void *buf; 1096 1097 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1098 buf = bdev_io->internal.buf; 1099 bdev_io->internal.buf = NULL; 1100 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1101 bdev_io->internal.get_aux_buf_cb = NULL; 1102 } else { 1103 assert(bdev_io->internal.get_buf_cb != NULL); 1104 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1105 bdev_io->internal.get_buf_cb = NULL; 1106 } 1107 } 1108 1109 static void 1110 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1111 { 1112 struct spdk_bdev_io *bdev_io = ctx; 1113 1114 if (rc) { 1115 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1116 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1117 } 1118 bdev_io_get_buf_complete(bdev_io, !rc); 1119 } 1120 1121 static void 1122 bdev_io_pull_md_buf_done(void *ctx, int status) 1123 { 1124 struct spdk_bdev_io *bdev_io = ctx; 1125 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1126 1127 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1128 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1129 1130 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1131 bdev_ch_retry_io(ch); 1132 } 1133 1134 assert(bdev_io->internal.data_transfer_cpl); 1135 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1136 } 1137 1138 static void 1139 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io) 1140 { 1141 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1142 int rc = 0; 1143 1144 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1145 if (bdev_io_use_memory_domain(bdev_io)) { 1146 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1147 bdev_io_increment_outstanding(ch, ch->shared_resource); 1148 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1149 bdev_io->internal.memory_domain_ctx, 1150 &bdev_io->internal.orig_md_iov, 1, 1151 &bdev_io->internal.bounce_md_iov, 1, 1152 bdev_io_pull_md_buf_done, bdev_io); 1153 if (rc == 0) { 1154 /* Continue to submit IO in completion callback */ 1155 return; 1156 } 1157 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1158 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1159 if (rc != -ENOMEM) { 1160 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1161 spdk_memory_domain_get_dma_device_id( 1162 bdev_io->internal.memory_domain), rc); 1163 } 1164 } else { 1165 memcpy(bdev_io->internal.bounce_md_iov.iov_base, 1166 bdev_io->internal.orig_md_iov.iov_base, 1167 bdev_io->internal.orig_md_iov.iov_len); 1168 } 1169 } 1170 1171 if (spdk_unlikely(rc == -ENOMEM)) { 1172 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD); 1173 } else { 1174 assert(bdev_io->internal.data_transfer_cpl); 1175 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1176 } 1177 } 1178 1179 static void 1180 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1181 { 1182 /* save 
original md_buf */ 1183 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1184 bdev_io->internal.orig_md_iov.iov_len = len; 1185 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 1186 bdev_io->internal.bounce_md_iov.iov_len = len; 1187 /* set bounce md_buf */ 1188 bdev_io->u.bdev.md_buf = md_buf; 1189 1190 bdev_io_pull_md_buf(bdev_io); 1191 } 1192 1193 static void 1194 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1195 { 1196 struct spdk_bdev *bdev = bdev_io->bdev; 1197 uint64_t md_len; 1198 void *buf; 1199 1200 if (spdk_bdev_is_md_separate(bdev)) { 1201 assert(!bdev_io_use_accel_sequence(bdev_io)); 1202 1203 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1204 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1205 1206 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1207 1208 if (bdev_io->u.bdev.md_buf != NULL) { 1209 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1210 return; 1211 } else { 1212 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1213 } 1214 } 1215 1216 bdev_io_get_buf_complete(bdev_io, true); 1217 } 1218 1219 static inline void 1220 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc) 1221 { 1222 if (rc) { 1223 SPDK_ERRLOG("Failed to get data buffer\n"); 1224 assert(bdev_io->internal.data_transfer_cpl); 1225 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1226 return; 1227 } 1228 1229 _bdev_io_set_md_buf(bdev_io); 1230 } 1231 1232 static void 1233 bdev_io_pull_data_done_and_track(void *ctx, int status) 1234 { 1235 struct spdk_bdev_io *bdev_io = ctx; 1236 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1237 1238 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1239 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1240 1241 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1242 bdev_ch_retry_io(ch); 1243 } 1244 1245 bdev_io_pull_data_done(bdev_io, status); 1246 } 1247 1248 static void 1249 bdev_io_pull_data(struct spdk_bdev_io *bdev_io) 1250 { 1251 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1252 int rc = 0; 1253 1254 /* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a 1255 * sequence, append a copy operation making accel change the src/dst buffers of the previous 1256 * operation */ 1257 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) || 1258 (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) { 1259 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1260 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1261 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1262 NULL, NULL, 1263 bdev_io->internal.orig_iovs, 1264 bdev_io->internal.orig_iovcnt, 1265 bdev_io->internal.memory_domain, 1266 bdev_io->internal.memory_domain_ctx, 1267 0, NULL, NULL); 1268 } else { 1269 /* We need to reverse the src/dst for reads */ 1270 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1271 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1272 bdev_io->internal.orig_iovs, 1273 bdev_io->internal.orig_iovcnt, 1274 bdev_io->internal.memory_domain, 1275 bdev_io->internal.memory_domain_ctx, 1276 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1277 NULL, NULL, 0, NULL, NULL); 1278 } 1279 1280 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 1281 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1282 bdev_io->internal.accel_sequence); 1283 } 1284 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1285 /* if this is write path, 
copy data from original buffer to bounce buffer */ 1286 if (bdev_io_use_memory_domain(bdev_io)) { 1287 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1288 bdev_io_increment_outstanding(ch, ch->shared_resource); 1289 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1290 bdev_io->internal.memory_domain_ctx, 1291 bdev_io->internal.orig_iovs, 1292 (uint32_t) bdev_io->internal.orig_iovcnt, 1293 bdev_io->u.bdev.iovs, 1, 1294 bdev_io_pull_data_done_and_track, 1295 bdev_io); 1296 if (rc == 0) { 1297 /* Continue to submit IO in completion callback */ 1298 return; 1299 } 1300 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1301 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1302 if (rc != -ENOMEM) { 1303 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1304 spdk_memory_domain_get_dma_device_id( 1305 bdev_io->internal.memory_domain)); 1306 } 1307 } else { 1308 assert(bdev_io->u.bdev.iovcnt == 1); 1309 spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base, 1310 bdev_io->u.bdev.iovs[0].iov_len, 1311 bdev_io->internal.orig_iovs, 1312 bdev_io->internal.orig_iovcnt); 1313 } 1314 } 1315 1316 if (spdk_unlikely(rc == -ENOMEM)) { 1317 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1318 } else { 1319 bdev_io_pull_data_done(bdev_io, rc); 1320 } 1321 } 1322 1323 static void 1324 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1325 bdev_copy_bounce_buffer_cpl cpl_cb) 1326 { 1327 struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource; 1328 1329 bdev_io->internal.data_transfer_cpl = cpl_cb; 1330 /* save original iovec */ 1331 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 1332 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1333 /* set bounce iov */ 1334 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 1335 bdev_io->u.bdev.iovcnt = 1; 1336 /* set bounce buffer for this operation */ 1337 bdev_io->u.bdev.iovs[0].iov_base = buf; 1338 bdev_io->u.bdev.iovs[0].iov_len = len; 1339 1340 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1341 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1342 } else { 1343 bdev_io_pull_data(bdev_io); 1344 } 1345 } 1346 1347 static void 1348 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1349 { 1350 struct spdk_bdev *bdev = bdev_io->bdev; 1351 bool buf_allocated; 1352 uint64_t alignment; 1353 void *aligned_buf; 1354 1355 bdev_io->internal.buf = buf; 1356 1357 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1358 bdev_io_get_buf_complete(bdev_io, true); 1359 return; 1360 } 1361 1362 alignment = spdk_bdev_get_buf_align(bdev); 1363 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1364 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1365 1366 if (buf_allocated) { 1367 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1368 /* Continue in completion callback */ 1369 return; 1370 } else { 1371 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1372 } 1373 1374 _bdev_io_set_md_buf(bdev_io); 1375 } 1376 1377 static inline uint64_t 1378 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1379 { 1380 struct spdk_bdev *bdev = bdev_io->bdev; 1381 uint64_t md_len, alignment; 1382 1383 md_len = spdk_bdev_is_md_separate(bdev) ? 
bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1384 1385 /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */ 1386 alignment = spdk_bdev_get_buf_align(bdev) - 1; 1387 1388 return len + alignment + md_len; 1389 } 1390 1391 static void 1392 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1393 { 1394 struct spdk_bdev_mgmt_channel *ch; 1395 1396 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1397 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1398 } 1399 1400 static void 1401 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1402 { 1403 assert(bdev_io->internal.buf != NULL); 1404 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1405 bdev_io->internal.buf = NULL; 1406 } 1407 1408 void 1409 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1410 { 1411 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1412 1413 assert(buf != NULL); 1414 _bdev_io_put_buf(bdev_io, buf, len); 1415 } 1416 1417 static inline void 1418 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, 1419 struct spdk_bdev_io *bdev_io) 1420 { 1421 /* After a request is submitted to a bdev module, the ownership of an accel sequence 1422 * associated with that bdev_io is transferred to the bdev module. So, clear the internal 1423 * sequence pointer to make sure we won't touch it anymore. */ 1424 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || 1425 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { 1426 assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1427 bdev_io->internal.accel_sequence = NULL; 1428 } 1429 1430 bdev->fn_table->submit_request(ioch, bdev_io); 1431 } 1432 1433 static inline void 1434 bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io) 1435 { 1436 struct spdk_bdev *bdev = bdev_io->bdev; 1437 1438 bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource); 1439 bdev_io->internal.error.nvme.cdw0 = 0; 1440 bdev_io->num_retries++; 1441 bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1442 } 1443 1444 static void 1445 bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource) 1446 { 1447 struct spdk_bdev_io *bdev_io; 1448 1449 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1450 /* 1451 * Allow some more I/O to complete before retrying the nomem_io queue. 1452 * Some drivers (such as nvme) cannot immediately take a new I/O in 1453 * the context of a completion, because the resources for the I/O are 1454 * not released until control returns to the bdev poller. Also, we 1455 * may require several small I/O to complete before a larger I/O 1456 * (that requires splitting) can be submitted. 
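		 * As an illustration of the threshold set in bdev_queue_nomem_io_head(): with 100
		 * I/Os outstanding when NOMEM was returned, the threshold is
		 * max(100 / 2, 100 - 8) = 92, so at least 8 completions must occur before
		 * retrying; with only 10 outstanding it is max(5, 2) = 5, i.e. half of them
		 * must complete first.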
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);

		switch (bdev_io->internal.retry_state) {
		case BDEV_IO_RETRY_STATE_SUBMIT:
			bdev_ch_resubmit_io(shared_resource, bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL:
			bdev_io_pull_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL_MD:
			bdev_io_pull_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH:
			bdev_io_push_bounce_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH_MD:
			bdev_io_push_bounce_md_buf(bdev_io);
			break;
		default:
			assert(0 && "invalid retry state");
			break;
		}

		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore.  Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
}

static int
bdev_no_mem_poller(void *ctx)
{
	struct spdk_bdev_shared_resource *shared_resource = ctx;

	spdk_poller_unregister(&shared_resource->nomem_poller);

	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_shared_ch_retry_io(shared_resource);
	}
	/* the retry cb may re-register the poller so double check */
	if (!TAILQ_EMPTY(&shared_resource->nomem_io) &&
	    shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) {
		/* No IOs were submitted, try again */
		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
					       SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
	}

	return SPDK_POLLER_BUSY;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);

		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
			/* Special case: there are nomem IOs queued but no outstanding IOs whose
			 * completions could trigger a retry of the queued IOs.  Completions of any
			 * newly submitted IOs would trigger the retry, but if no new IOs are
			 * submitted (e.g. qd == 1), this poller makes sure the queued IOs are
			 * eventually retried. */
			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
						       SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
		}
		/* If the bdev module completed an I/O that has an accel sequence with NOMEM status, the
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted.
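		 * (When the I/O is later resubmitted, bdev_submit_request() hands the sequence
		 * back to the module and clears the internal pointer again.)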
*/ 1546 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1547 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1548 assert(bdev_io->internal.accel_sequence == NULL); 1549 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1550 } 1551 1552 return true; 1553 } 1554 1555 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1556 bdev_ch_retry_io(bdev_ch); 1557 } 1558 1559 return false; 1560 } 1561 1562 static void 1563 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1564 { 1565 struct spdk_bdev_io *bdev_io = ctx; 1566 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1567 1568 if (rc) { 1569 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1570 } 1571 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1572 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 1573 */ 1574 bdev_io_put_buf(bdev_io); 1575 1576 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1577 bdev_ch_retry_io(ch); 1578 } 1579 1580 /* Continue with IO completion flow */ 1581 bdev_io_complete(bdev_io); 1582 } 1583 1584 static void 1585 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1586 { 1587 struct spdk_bdev_io *bdev_io = ctx; 1588 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1589 1590 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1591 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1592 1593 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1594 bdev_ch_retry_io(ch); 1595 } 1596 1597 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1598 } 1599 1600 static inline void 1601 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1602 { 1603 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1604 int rc = 0; 1605 1606 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1607 /* do the same for metadata buffer */ 1608 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1609 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1610 1611 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1612 if (bdev_io_use_memory_domain(bdev_io)) { 1613 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1614 bdev_io_increment_outstanding(ch, ch->shared_resource); 1615 /* If memory domain is used then we need to call async push function */ 1616 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1617 bdev_io->internal.memory_domain_ctx, 1618 &bdev_io->internal.orig_md_iov, 1619 (uint32_t)bdev_io->internal.orig_iovcnt, 1620 &bdev_io->internal.bounce_md_iov, 1, 1621 bdev_io_push_bounce_md_buf_done, 1622 bdev_io); 1623 if (rc == 0) { 1624 /* Continue IO completion in async callback */ 1625 return; 1626 } 1627 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1628 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1629 if (rc != -ENOMEM) { 1630 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1631 spdk_memory_domain_get_dma_device_id( 1632 bdev_io->internal.memory_domain)); 1633 } 1634 } else { 1635 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1636 bdev_io->internal.orig_md_iov.iov_len); 1637 } 1638 } 1639 } 1640 1641 if (spdk_unlikely(rc == -ENOMEM)) { 1642 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1643 } else { 1644 assert(bdev_io->internal.data_transfer_cpl); 1645 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1646 } 1647 } 1648 1649 static inline void 1650 bdev_io_push_bounce_data_done(struct 
spdk_bdev_io *bdev_io, int rc) 1651 { 1652 assert(bdev_io->internal.data_transfer_cpl); 1653 if (rc) { 1654 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1655 return; 1656 } 1657 1658 /* set original buffer for this io */ 1659 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1660 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1661 /* disable bouncing buffer for this io */ 1662 bdev_io->internal.orig_iovcnt = 0; 1663 bdev_io->internal.orig_iovs = NULL; 1664 1665 bdev_io_push_bounce_md_buf(bdev_io); 1666 } 1667 1668 static void 1669 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1670 { 1671 struct spdk_bdev_io *bdev_io = ctx; 1672 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1673 1674 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1675 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1676 1677 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1678 bdev_ch_retry_io(ch); 1679 } 1680 1681 bdev_io_push_bounce_data_done(bdev_io, status); 1682 } 1683 1684 static inline void 1685 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1686 { 1687 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1688 int rc = 0; 1689 1690 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1691 assert(!bdev_io_use_accel_sequence(bdev_io)); 1692 1693 /* if this is read path, copy data from bounce buffer to original buffer */ 1694 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1695 if (bdev_io_use_memory_domain(bdev_io)) { 1696 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1697 bdev_io_increment_outstanding(ch, ch->shared_resource); 1698 /* If memory domain is used then we need to call async push function */ 1699 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1700 bdev_io->internal.memory_domain_ctx, 1701 bdev_io->internal.orig_iovs, 1702 (uint32_t)bdev_io->internal.orig_iovcnt, 1703 &bdev_io->internal.bounce_iov, 1, 1704 bdev_io_push_bounce_data_done_and_track, 1705 bdev_io); 1706 if (rc == 0) { 1707 /* Continue IO completion in async callback */ 1708 return; 1709 } 1710 1711 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1712 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1713 if (rc != -ENOMEM) { 1714 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1715 spdk_memory_domain_get_dma_device_id( 1716 bdev_io->internal.memory_domain)); 1717 } 1718 } else { 1719 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1720 bdev_io->internal.orig_iovcnt, 1721 bdev_io->internal.bounce_iov.iov_base, 1722 bdev_io->internal.bounce_iov.iov_len); 1723 } 1724 } 1725 1726 if (spdk_unlikely(rc == -ENOMEM)) { 1727 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1728 } else { 1729 bdev_io_push_bounce_data_done(bdev_io, rc); 1730 } 1731 } 1732 1733 static inline void 1734 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1735 { 1736 bdev_io->internal.data_transfer_cpl = cpl_cb; 1737 bdev_io_push_bounce_data(bdev_io); 1738 } 1739 1740 static void 1741 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1742 { 1743 struct spdk_bdev_io *bdev_io; 1744 1745 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1746 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1747 } 1748 1749 static void 1750 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1751 { 1752 struct spdk_bdev_mgmt_channel *mgmt_ch; 1753 uint64_t max_len; 1754 void *buf; 1755 1756 
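	/* Buffers come from the per-thread iobuf channel looked up below, so this must run
	 * on the thread that owns the bdev_io; the assert that follows checks exactly that. */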
assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1757 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1758 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1759 1760 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1761 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1762 bdev_io_get_buf_complete(bdev_io, false); 1763 return; 1764 } 1765 1766 bdev_io->internal.buf_len = len; 1767 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1768 bdev_io_get_iobuf_cb); 1769 if (buf != NULL) { 1770 _bdev_io_set_buf(bdev_io, buf, len); 1771 } 1772 } 1773 1774 void 1775 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1776 { 1777 struct spdk_bdev *bdev = bdev_io->bdev; 1778 uint64_t alignment; 1779 1780 assert(cb != NULL); 1781 bdev_io->internal.get_buf_cb = cb; 1782 1783 alignment = spdk_bdev_get_buf_align(bdev); 1784 1785 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1786 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1787 /* Buffer already present and aligned */ 1788 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1789 return; 1790 } 1791 1792 bdev_io_get_buf(bdev_io, len); 1793 } 1794 1795 static void 1796 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1797 bool success) 1798 { 1799 if (!success) { 1800 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1801 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1802 bdev_io_complete_unsubmitted(bdev_io); 1803 return; 1804 } 1805 1806 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1807 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1808 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1809 return; 1810 } 1811 /* For reads we'll execute the sequence after the data is read, so, for now, only 1812 * clear out accel_sequence pointer and submit the IO */ 1813 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1814 bdev_io->u.bdev.accel_sequence = NULL; 1815 } 1816 1817 bdev_io_submit(bdev_io); 1818 } 1819 1820 static void 1821 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1822 uint64_t len) 1823 { 1824 assert(cb != NULL); 1825 bdev_io->internal.get_buf_cb = cb; 1826 1827 bdev_io_get_buf(bdev_io, len); 1828 } 1829 1830 void 1831 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1832 { 1833 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1834 1835 assert(cb != NULL); 1836 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1837 bdev_io->internal.get_aux_buf_cb = cb; 1838 bdev_io_get_buf(bdev_io, len); 1839 } 1840 1841 static int 1842 bdev_module_get_max_ctx_size(void) 1843 { 1844 struct spdk_bdev_module *bdev_module; 1845 int max_bdev_module_size = 0; 1846 1847 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1848 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1849 max_bdev_module_size = bdev_module->get_ctx_size(); 1850 } 1851 } 1852 1853 return max_bdev_module_size; 1854 } 1855 1856 static void 1857 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1858 { 1859 if (!bdev->internal.histogram_enabled) { 1860 return; 1861 } 1862 1863 spdk_json_write_object_begin(w); 1864 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 1865 1866 spdk_json_write_named_object_begin(w, "params"); 1867 
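	/* The emitted RPC has, e.g., the following shape (values shown for illustration):
	 *   { "method": "bdev_enable_histogram",
	 *     "params": { "name": "Nvme0n1", "enable": true } }
	 */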
spdk_json_write_named_string(w, "name", bdev->name); 1868 1869 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1870 spdk_json_write_object_end(w); 1871 1872 spdk_json_write_object_end(w); 1873 } 1874 1875 static void 1876 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1877 { 1878 int i; 1879 struct spdk_bdev_qos *qos = bdev->internal.qos; 1880 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1881 1882 if (!qos) { 1883 return; 1884 } 1885 1886 spdk_bdev_get_qos_rate_limits(bdev, limits); 1887 1888 spdk_json_write_object_begin(w); 1889 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1890 1891 spdk_json_write_named_object_begin(w, "params"); 1892 spdk_json_write_named_string(w, "name", bdev->name); 1893 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1894 if (limits[i] > 0) { 1895 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1896 } 1897 } 1898 spdk_json_write_object_end(w); 1899 1900 spdk_json_write_object_end(w); 1901 } 1902 1903 void 1904 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1905 { 1906 struct spdk_bdev_module *bdev_module; 1907 struct spdk_bdev *bdev; 1908 1909 assert(w != NULL); 1910 1911 spdk_json_write_array_begin(w); 1912 1913 spdk_json_write_object_begin(w); 1914 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1915 spdk_json_write_named_object_begin(w, "params"); 1916 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1917 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1918 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1919 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 1920 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 1921 spdk_json_write_object_end(w); 1922 spdk_json_write_object_end(w); 1923 1924 bdev_examine_allowlist_config_json(w); 1925 1926 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1927 if (bdev_module->config_json) { 1928 bdev_module->config_json(w); 1929 } 1930 } 1931 1932 spdk_spin_lock(&g_bdev_mgr.spinlock); 1933 1934 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1935 if (bdev->fn_table->write_config_json) { 1936 bdev->fn_table->write_config_json(bdev, w); 1937 } 1938 1939 bdev_qos_config_json(bdev, w); 1940 bdev_enable_histogram_config_json(bdev, w); 1941 } 1942 1943 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1944 1945 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1946 spdk_json_write_object_begin(w); 1947 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1948 spdk_json_write_object_end(w); 1949 1950 spdk_json_write_array_end(w); 1951 } 1952 1953 static void 1954 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1955 { 1956 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1957 struct spdk_bdev_io *bdev_io; 1958 1959 spdk_iobuf_channel_fini(&ch->iobuf); 1960 1961 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1962 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1963 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1964 ch->per_thread_cache_count--; 1965 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1966 } 1967 1968 assert(ch->per_thread_cache_count == 0); 1969 } 1970 1971 static int 1972 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1973 { 1974 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1975 struct 
spdk_bdev_io *bdev_io;
	uint32_t i;
	int rc;

	rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev",
				     g_bdev_opts.iobuf_small_cache_size,
				     g_bdev_opts.iobuf_large_cache_size);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc));
		return -1;
	}

	STAILQ_INIT(&ch->per_thread_cache);
	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;

	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		if (bdev_io == NULL) {
			SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
			assert(false);
			bdev_mgmt_channel_destroy(io_device, ctx_buf);
			return -1;
		}
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static bool
bdev_module_all_actions_completed(void)
{
	struct spdk_bdev_module *m;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return false;
		}
	}
	return true;
}

static void
bdev_module_action_complete(void)
{
	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	if (!bdev_module_all_actions_completed()) {
		return;
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
2074 */ 2075 bdev_init_complete(0); 2076 } 2077 2078 static void 2079 bdev_module_action_done(struct spdk_bdev_module *module) 2080 { 2081 spdk_spin_lock(&module->internal.spinlock); 2082 assert(module->internal.action_in_progress > 0); 2083 module->internal.action_in_progress--; 2084 spdk_spin_unlock(&module->internal.spinlock); 2085 bdev_module_action_complete(); 2086 } 2087 2088 void 2089 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2090 { 2091 assert(module->async_init); 2092 bdev_module_action_done(module); 2093 } 2094 2095 void 2096 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2097 { 2098 bdev_module_action_done(module); 2099 } 2100 2101 /** The last initialized bdev module */ 2102 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2103 2104 static void 2105 bdev_init_failed(void *cb_arg) 2106 { 2107 struct spdk_bdev_module *module = cb_arg; 2108 2109 spdk_spin_lock(&module->internal.spinlock); 2110 assert(module->internal.action_in_progress > 0); 2111 module->internal.action_in_progress--; 2112 spdk_spin_unlock(&module->internal.spinlock); 2113 bdev_init_complete(-1); 2114 } 2115 2116 static int 2117 bdev_modules_init(void) 2118 { 2119 struct spdk_bdev_module *module; 2120 int rc = 0; 2121 2122 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2123 g_resume_bdev_module = module; 2124 if (module->async_init) { 2125 spdk_spin_lock(&module->internal.spinlock); 2126 module->internal.action_in_progress = 1; 2127 spdk_spin_unlock(&module->internal.spinlock); 2128 } 2129 rc = module->module_init(); 2130 if (rc != 0) { 2131 /* Bump action_in_progress to prevent other modules from completion of modules_init 2132 * Send message to defer application shutdown until resources are cleaned up */ 2133 spdk_spin_lock(&module->internal.spinlock); 2134 module->internal.action_in_progress = 1; 2135 spdk_spin_unlock(&module->internal.spinlock); 2136 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2137 return rc; 2138 } 2139 } 2140 2141 g_resume_bdev_module = NULL; 2142 return 0; 2143 } 2144 2145 void 2146 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2147 { 2148 int rc = 0; 2149 char mempool_name[32]; 2150 2151 assert(cb_fn != NULL); 2152 2153 g_init_cb_fn = cb_fn; 2154 g_init_cb_arg = cb_arg; 2155 2156 spdk_notify_type_register("bdev_register"); 2157 spdk_notify_type_register("bdev_unregister"); 2158 2159 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2160 2161 rc = spdk_iobuf_register_module("bdev"); 2162 if (rc != 0) { 2163 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2164 bdev_init_complete(-1); 2165 return; 2166 } 2167 2168 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2169 g_bdev_opts.bdev_io_pool_size, 2170 sizeof(struct spdk_bdev_io) + 2171 bdev_module_get_max_ctx_size(), 2172 0, 2173 SPDK_ENV_SOCKET_ID_ANY); 2174 2175 if (g_bdev_mgr.bdev_io_pool == NULL) { 2176 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2177 bdev_init_complete(-1); 2178 return; 2179 } 2180 2181 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2182 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2183 if (!g_bdev_mgr.zero_buffer) { 2184 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2185 bdev_init_complete(-1); 2186 return; 2187 } 2188 2189 #ifdef SPDK_CONFIG_VTUNE 2190 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2191 #endif 2192 2193 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2194 
bdev_mgmt_channel_destroy, 2195 sizeof(struct spdk_bdev_mgmt_channel), 2196 "bdev_mgr"); 2197 2198 rc = bdev_modules_init(); 2199 g_bdev_mgr.module_init_complete = true; 2200 if (rc != 0) { 2201 SPDK_ERRLOG("bdev modules init failed\n"); 2202 return; 2203 } 2204 2205 bdev_module_action_complete(); 2206 } 2207 2208 static void 2209 bdev_mgr_unregister_cb(void *io_device) 2210 { 2211 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2212 2213 if (g_bdev_mgr.bdev_io_pool) { 2214 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2215 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2216 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2217 g_bdev_opts.bdev_io_pool_size); 2218 } 2219 2220 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2221 } 2222 2223 spdk_free(g_bdev_mgr.zero_buffer); 2224 2225 bdev_examine_allowlist_free(); 2226 2227 cb_fn(g_fini_cb_arg); 2228 g_fini_cb_fn = NULL; 2229 g_fini_cb_arg = NULL; 2230 g_bdev_mgr.init_complete = false; 2231 g_bdev_mgr.module_init_complete = false; 2232 } 2233 2234 static void 2235 bdev_module_fini_iter(void *arg) 2236 { 2237 struct spdk_bdev_module *bdev_module; 2238 2239 /* FIXME: Handling initialization failures is broken now, 2240 * so we won't even try cleaning up after successfully 2241 * initialized modules. if module_init_complete is false, 2242 * just call spdk_bdev_mgr_unregister_cb 2243 */ 2244 if (!g_bdev_mgr.module_init_complete) { 2245 bdev_mgr_unregister_cb(NULL); 2246 return; 2247 } 2248 2249 /* Start iterating from the last touched module */ 2250 if (!g_resume_bdev_module) { 2251 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2252 } else { 2253 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2254 internal.tailq); 2255 } 2256 2257 while (bdev_module) { 2258 if (bdev_module->async_fini) { 2259 /* Save our place so we can resume later. We must 2260 * save the variable here, before calling module_fini() 2261 * below, because in some cases the module may immediately 2262 * call spdk_bdev_module_fini_done() and re-enter 2263 * this function to continue iterating. */ 2264 g_resume_bdev_module = bdev_module; 2265 } 2266 2267 if (bdev_module->module_fini) { 2268 bdev_module->module_fini(); 2269 } 2270 2271 if (bdev_module->async_fini) { 2272 return; 2273 } 2274 2275 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2276 internal.tailq); 2277 } 2278 2279 g_resume_bdev_module = NULL; 2280 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2281 } 2282 2283 void 2284 spdk_bdev_module_fini_done(void) 2285 { 2286 if (spdk_get_thread() != g_fini_thread) { 2287 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2288 } else { 2289 bdev_module_fini_iter(NULL); 2290 } 2291 } 2292 2293 static void 2294 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2295 { 2296 struct spdk_bdev *bdev = cb_arg; 2297 2298 if (bdeverrno && bdev) { 2299 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2300 bdev->name); 2301 2302 /* 2303 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2304 * bdev; try to continue by manually removing this bdev from the list and continue 2305 * with the next bdev in the list. 
2306 */ 2307 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2308 } 2309 2310 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2311 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2312 /* 2313 * Bdev module finish need to be deferred as we might be in the middle of some context 2314 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2315 * after returning. 2316 */ 2317 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2318 return; 2319 } 2320 2321 /* 2322 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2323 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2324 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2325 * base bdevs. 2326 * 2327 * Also, walk the list in the reverse order. 2328 */ 2329 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2330 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2331 spdk_spin_lock(&bdev->internal.spinlock); 2332 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2333 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2334 spdk_spin_unlock(&bdev->internal.spinlock); 2335 continue; 2336 } 2337 spdk_spin_unlock(&bdev->internal.spinlock); 2338 2339 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2340 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2341 return; 2342 } 2343 2344 /* 2345 * If any bdev fails to unclaim underlying bdev properly, we may face the 2346 * case of bdev list consisting of claimed bdevs only (if claims are managed 2347 * correctly, this would mean there's a loop in the claims graph which is 2348 * clearly impossible). Warn and unregister last bdev on the list then. 2349 */ 2350 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2351 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2352 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2353 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2354 return; 2355 } 2356 } 2357 2358 static void 2359 bdev_module_fini_start_iter(void *arg) 2360 { 2361 struct spdk_bdev_module *bdev_module; 2362 2363 if (!g_resume_bdev_module) { 2364 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2365 } else { 2366 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2367 } 2368 2369 while (bdev_module) { 2370 if (bdev_module->async_fini_start) { 2371 /* Save our place so we can resume later. We must 2372 * save the variable here, before calling fini_start() 2373 * below, because in some cases the module may immediately 2374 * call spdk_bdev_module_fini_start_done() and re-enter 2375 * this function to continue iterating. 
*/ 2376 g_resume_bdev_module = bdev_module; 2377 } 2378 2379 if (bdev_module->fini_start) { 2380 bdev_module->fini_start(); 2381 } 2382 2383 if (bdev_module->async_fini_start) { 2384 return; 2385 } 2386 2387 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2388 } 2389 2390 g_resume_bdev_module = NULL; 2391 2392 bdev_finish_unregister_bdevs_iter(NULL, 0); 2393 } 2394 2395 void 2396 spdk_bdev_module_fini_start_done(void) 2397 { 2398 if (spdk_get_thread() != g_fini_thread) { 2399 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2400 } else { 2401 bdev_module_fini_start_iter(NULL); 2402 } 2403 } 2404 2405 static void 2406 bdev_finish_wait_for_examine_done(void *cb_arg) 2407 { 2408 bdev_module_fini_start_iter(NULL); 2409 } 2410 2411 static void bdev_open_async_fini(void); 2412 2413 void 2414 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2415 { 2416 int rc; 2417 2418 assert(cb_fn != NULL); 2419 2420 g_fini_thread = spdk_get_thread(); 2421 2422 g_fini_cb_fn = cb_fn; 2423 g_fini_cb_arg = cb_arg; 2424 2425 bdev_open_async_fini(); 2426 2427 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2428 if (rc != 0) { 2429 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2430 bdev_finish_wait_for_examine_done(NULL); 2431 } 2432 } 2433 2434 struct spdk_bdev_io * 2435 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2436 { 2437 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2438 struct spdk_bdev_io *bdev_io; 2439 2440 if (ch->per_thread_cache_count > 0) { 2441 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2442 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2443 ch->per_thread_cache_count--; 2444 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2445 /* 2446 * Don't try to look for bdev_ios in the global pool if there are 2447 * waiters on bdev_ios - we don't want this caller to jump the line. 2448 */ 2449 bdev_io = NULL; 2450 } else { 2451 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2452 } 2453 2454 return bdev_io; 2455 } 2456 2457 void 2458 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2459 { 2460 struct spdk_bdev_mgmt_channel *ch; 2461 2462 assert(bdev_io != NULL); 2463 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2464 2465 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2466 2467 if (bdev_io->internal.buf != NULL) { 2468 bdev_io_put_buf(bdev_io); 2469 } 2470 2471 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2472 ch->per_thread_cache_count++; 2473 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2474 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2475 struct spdk_bdev_io_wait_entry *entry; 2476 2477 entry = TAILQ_FIRST(&ch->io_wait_queue); 2478 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2479 entry->cb_fn(entry->cb_arg); 2480 } 2481 } else { 2482 /* We should never have a full cache with entries on the io wait queue. 
		 */
		assert(TAILQ_EMPTY(&ch->io_wait_queue));
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}
}

static bool
bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
{
	assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);

	switch (limit) {
	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
		return true;
	case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
	case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
	case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
		return false;
	case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
	default:
		return false;
	}
}

static bool
bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return true;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		if (bdev_io->u.bdev.zcopy.start) {
			return true;
		} else {
			return false;
		}
	default:
		return false;
	}
}

static bool
bdev_is_read_io(struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		/* Bit 1 (0x2) set for read operation */
		if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
			return true;
		} else {
			return false;
		}
	case SPDK_BDEV_IO_TYPE_READ:
		return true;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Populate to read from disk */
		if (bdev_io->u.bdev.zcopy.populate) {
			return true;
		} else {
			return false;
		}
	default:
		return false;
	}
}

static uint64_t
bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return bdev_io->u.nvme_passthru.nbytes;
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (bdev_io->u.bdev.zcopy.start) {
			return bdev_io->u.bdev.num_blocks * bdev->blocklen;
		} else {
			return 0;
		}
	default:
		return 0;
	}
}

static inline bool
bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta)
{
	int64_t remaining_this_timeslice;

	if (!limit->max_per_timeslice) {
		/* The QoS is disabled */
		return false;
	}

	remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta,
						      __ATOMIC_RELAXED);
	if (remaining_this_timeslice + (int64_t)delta > 0) {
		/* There was still a quota for this delta -> the IO shouldn't be queued
		 *
		 * We allow a slight quota overrun here so an IO bigger than the per-timeslice
		 * quota can be allowed once in a while. Such an overrun is then taken into
		 * account in the QoS poller, where the next timeslice quota is calculated.
		 */
		return false;
	}

	/* There was no quota for this delta -> the IO should be queued.
	 * The remaining_this_timeslice must be rewound so it reflects the real
	 * amount of IOs or bytes allowed.
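	 * For example, with the counter already at or below zero, a 4096-byte I/O would
	 * drive it to -4096 even though that I/O is queued and will be charged again when
	 * it is resubmitted, hence the add-back below.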
2601 */ 2602 __atomic_add_fetch( 2603 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2604 return true; 2605 } 2606 2607 static inline void 2608 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2609 { 2610 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2611 } 2612 2613 static bool 2614 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2615 { 2616 return bdev_qos_rw_queue_io(limit, io, 1); 2617 } 2618 2619 static void 2620 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2621 { 2622 bdev_qos_rw_rewind_io(limit, io, 1); 2623 } 2624 2625 static bool 2626 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2627 { 2628 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2629 } 2630 2631 static void 2632 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2633 { 2634 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2635 } 2636 2637 static bool 2638 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2639 { 2640 if (bdev_is_read_io(io) == false) { 2641 return false; 2642 } 2643 2644 return bdev_qos_rw_bps_queue(limit, io); 2645 } 2646 2647 static void 2648 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2649 { 2650 if (bdev_is_read_io(io) != false) { 2651 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2652 } 2653 } 2654 2655 static bool 2656 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2657 { 2658 if (bdev_is_read_io(io) == true) { 2659 return false; 2660 } 2661 2662 return bdev_qos_rw_bps_queue(limit, io); 2663 } 2664 2665 static void 2666 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2667 { 2668 if (bdev_is_read_io(io) != true) { 2669 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2670 } 2671 } 2672 2673 static void 2674 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2675 { 2676 int i; 2677 2678 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2679 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2680 qos->rate_limits[i].queue_io = NULL; 2681 continue; 2682 } 2683 2684 switch (i) { 2685 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2686 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2687 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2688 break; 2689 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2690 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2691 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2692 break; 2693 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2694 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2695 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2696 break; 2697 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2698 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2699 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2700 break; 2701 default: 2702 break; 2703 } 2704 } 2705 } 2706 2707 static void 2708 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2709 struct spdk_bdev_io *bdev_io, 2710 enum spdk_bdev_io_status status) 2711 { 2712 bdev_io->internal.in_submit_request = true; 2713 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2714 spdk_bdev_io_complete(bdev_io, status); 2715 bdev_io->internal.in_submit_request = false; 2716 
} 2717 2718 static inline void 2719 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2720 { 2721 struct spdk_bdev *bdev = bdev_io->bdev; 2722 struct spdk_io_channel *ch = bdev_ch->channel; 2723 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2724 2725 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2726 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2727 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2728 2729 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2730 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2731 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2732 SPDK_BDEV_IO_STATUS_SUCCESS); 2733 return; 2734 } 2735 } 2736 2737 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2738 bdev_io->bdev->split_on_write_unit && 2739 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2740 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2741 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2742 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2743 return; 2744 } 2745 2746 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2747 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2748 bdev_io->internal.in_submit_request = true; 2749 bdev_submit_request(bdev, ch, bdev_io); 2750 bdev_io->internal.in_submit_request = false; 2751 } else { 2752 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2753 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2754 /* Special case when we have nomem IOs and no outstanding IOs which completions 2755 * could trigger retry of queued IOs */ 2756 bdev_shared_ch_retry_io(shared_resource); 2757 } 2758 } 2759 } 2760 2761 static bool 2762 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2763 { 2764 int i; 2765 2766 if (bdev_qos_io_to_limit(bdev_io) == true) { 2767 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2768 if (!qos->rate_limits[i].queue_io) { 2769 continue; 2770 } 2771 2772 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2773 bdev_io) == true) { 2774 for (i -= 1; i >= 0 ; i--) { 2775 if (!qos->rate_limits[i].queue_io) { 2776 continue; 2777 } 2778 2779 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2780 } 2781 return true; 2782 } 2783 } 2784 } 2785 2786 return false; 2787 } 2788 2789 static int 2790 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2791 { 2792 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2793 int submitted_ios = 0; 2794 2795 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2796 if (!bdev_qos_queue_io(qos, bdev_io)) { 2797 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 2798 bdev_io_do_submit(ch, bdev_io); 2799 2800 submitted_ios++; 2801 } 2802 } 2803 2804 return submitted_ios; 2805 } 2806 2807 static void 2808 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2809 { 2810 int rc; 2811 2812 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2813 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2814 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2815 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2816 &bdev_io->internal.waitq_entry); 2817 if (rc != 0) { 2818 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2819 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 2820 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2821 } 2822 } 2823 2824 static bool 2825 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2826 { 2827 uint32_t io_boundary; 2828 struct spdk_bdev *bdev = bdev_io->bdev; 2829 uint32_t max_segment_size = bdev->max_segment_size; 2830 uint32_t max_size = bdev->max_rw_size; 2831 int max_segs = bdev->max_num_segments; 2832 2833 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2834 io_boundary = bdev->write_unit_size; 2835 } else if (bdev->split_on_optimal_io_boundary) { 2836 io_boundary = bdev->optimal_io_boundary; 2837 } else { 2838 io_boundary = 0; 2839 } 2840 2841 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 2842 return false; 2843 } 2844 2845 if (io_boundary) { 2846 uint64_t start_stripe, end_stripe; 2847 2848 start_stripe = bdev_io->u.bdev.offset_blocks; 2849 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2850 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2851 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2852 start_stripe >>= spdk_u32log2(io_boundary); 2853 end_stripe >>= spdk_u32log2(io_boundary); 2854 } else { 2855 start_stripe /= io_boundary; 2856 end_stripe /= io_boundary; 2857 } 2858 2859 if (start_stripe != end_stripe) { 2860 return true; 2861 } 2862 } 2863 2864 if (max_segs) { 2865 if (bdev_io->u.bdev.iovcnt > max_segs) { 2866 return true; 2867 } 2868 } 2869 2870 if (max_segment_size) { 2871 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2872 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 2873 return true; 2874 } 2875 } 2876 } 2877 2878 if (max_size) { 2879 if (bdev_io->u.bdev.num_blocks > max_size) { 2880 return true; 2881 } 2882 } 2883 2884 return false; 2885 } 2886 2887 static bool 2888 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2889 { 2890 uint32_t num_unmap_segments; 2891 2892 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2893 return false; 2894 } 2895 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2896 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2897 return true; 2898 } 2899 2900 return false; 2901 } 2902 2903 static bool 2904 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2905 { 2906 if (!bdev_io->bdev->max_write_zeroes) { 2907 return false; 2908 } 2909 2910 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2911 return true; 2912 } 2913 2914 return false; 2915 } 2916 2917 static bool 2918 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2919 { 2920 if (bdev_io->bdev->max_copy != 0 && 2921 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2922 return true; 2923 } 2924 2925 return false; 2926 } 2927 2928 static bool 2929 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2930 { 2931 switch (bdev_io->type) { 2932 case SPDK_BDEV_IO_TYPE_READ: 2933 case SPDK_BDEV_IO_TYPE_WRITE: 2934 return bdev_rw_should_split(bdev_io); 2935 case SPDK_BDEV_IO_TYPE_UNMAP: 2936 return bdev_unmap_should_split(bdev_io); 2937 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2938 return bdev_write_zeroes_should_split(bdev_io); 2939 case SPDK_BDEV_IO_TYPE_COPY: 2940 return bdev_copy_should_split(bdev_io); 2941 default: 2942 return false; 2943 } 2944 } 2945 2946 static uint32_t 2947 _to_next_boundary(uint64_t offset, uint32_t boundary) 2948 { 2949 return (boundary - (offset % boundary)); 2950 } 2951 2952 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2953 2954 static void _bdev_rw_split(void *_bdev_io); 2955 2956 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2957 2958 static void 2959 _bdev_unmap_split(void *_bdev_io) 2960 { 2961 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2962 } 2963 2964 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2965 2966 static void 2967 _bdev_write_zeroes_split(void *_bdev_io) 2968 { 2969 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2970 } 2971 2972 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2973 2974 static void 2975 _bdev_copy_split(void *_bdev_io) 2976 { 2977 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2978 } 2979 2980 static int 2981 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2982 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2983 { 2984 int rc; 2985 uint64_t current_offset, current_remaining, current_src_offset; 2986 spdk_bdev_io_wait_cb io_wait_fn; 2987 2988 current_offset = *offset; 2989 current_remaining = *remaining; 2990 2991 bdev_io->u.bdev.split_outstanding++; 2992 2993 io_wait_fn = _bdev_rw_split; 2994 switch (bdev_io->type) { 2995 case SPDK_BDEV_IO_TYPE_READ: 2996 assert(bdev_io->u.bdev.accel_sequence == NULL); 2997 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2998 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2999 iov, iovcnt, md_buf, current_offset, 3000 num_blocks, bdev_io->internal.memory_domain, 3001 bdev_io->internal.memory_domain_ctx, NULL, 3002 bdev_io->u.bdev.dif_check_flags, 3003 bdev_io_split_done, bdev_io); 3004 break; 3005 case SPDK_BDEV_IO_TYPE_WRITE: 3006 assert(bdev_io->u.bdev.accel_sequence == NULL); 3007 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3008 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3009 iov, iovcnt, md_buf, current_offset, 3010 num_blocks, bdev_io->internal.memory_domain, 3011 bdev_io->internal.memory_domain_ctx, NULL, 3012 bdev_io->u.bdev.dif_check_flags, 3013 bdev_io_split_done, bdev_io); 3014 break; 3015 case SPDK_BDEV_IO_TYPE_UNMAP: 3016 io_wait_fn = _bdev_unmap_split; 3017 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3018 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3019 current_offset, num_blocks, 3020 bdev_io_split_done, bdev_io); 3021 break; 3022 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3023 io_wait_fn = _bdev_write_zeroes_split; 3024 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3025 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3026 current_offset, num_blocks, 3027 bdev_io_split_done, bdev_io); 3028 break; 3029 case SPDK_BDEV_IO_TYPE_COPY: 3030 io_wait_fn = _bdev_copy_split; 3031 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3032 (current_offset - bdev_io->u.bdev.offset_blocks); 3033 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3034 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3035 current_offset, current_src_offset, num_blocks, 3036 bdev_io_split_done, bdev_io); 3037 break; 3038 default: 3039 assert(false); 3040 rc = -EINVAL; 3041 break; 3042 } 3043 3044 if (rc == 0) { 3045 current_offset += num_blocks; 3046 current_remaining -= num_blocks; 3047 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 3048 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 3049 *offset = current_offset; 3050 *remaining = current_remaining; 3051 } else { 3052 bdev_io->u.bdev.split_outstanding--; 3053 if (rc == -ENOMEM) { 3054 if 
(bdev_io->u.bdev.split_outstanding == 0) { 3055 /* No I/O is outstanding. Hence we should wait here. */ 3056 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3057 } 3058 } else { 3059 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3060 if (bdev_io->u.bdev.split_outstanding == 0) { 3061 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 3062 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 3063 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3064 } 3065 } 3066 } 3067 3068 return rc; 3069 } 3070 3071 static void 3072 _bdev_rw_split(void *_bdev_io) 3073 { 3074 struct iovec *parent_iov, *iov; 3075 struct spdk_bdev_io *bdev_io = _bdev_io; 3076 struct spdk_bdev *bdev = bdev_io->bdev; 3077 uint64_t parent_offset, current_offset, remaining; 3078 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3079 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3080 uint32_t iovcnt, iov_len, child_iovsize; 3081 uint32_t blocklen = bdev->blocklen; 3082 uint32_t io_boundary; 3083 uint32_t max_segment_size = bdev->max_segment_size; 3084 uint32_t max_child_iovcnt = bdev->max_num_segments; 3085 uint32_t max_size = bdev->max_rw_size; 3086 void *md_buf = NULL; 3087 int rc; 3088 3089 max_size = max_size ? max_size : UINT32_MAX; 3090 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3091 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3092 SPDK_BDEV_IO_NUM_CHILD_IOV; 3093 3094 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3095 io_boundary = bdev->write_unit_size; 3096 } else if (bdev->split_on_optimal_io_boundary) { 3097 io_boundary = bdev->optimal_io_boundary; 3098 } else { 3099 io_boundary = UINT32_MAX; 3100 } 3101 3102 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3103 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 3104 parent_offset = bdev_io->u.bdev.offset_blocks; 3105 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3106 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3107 3108 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3109 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3110 if (parent_iov_offset < parent_iov->iov_len) { 3111 break; 3112 } 3113 parent_iov_offset -= parent_iov->iov_len; 3114 } 3115 3116 child_iovcnt = 0; 3117 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3118 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3119 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3120 to_next_boundary = spdk_min(remaining, to_next_boundary); 3121 to_next_boundary = spdk_min(max_size, to_next_boundary); 3122 to_next_boundary_bytes = to_next_boundary * blocklen; 3123 3124 iov = &bdev_io->child_iov[child_iovcnt]; 3125 iovcnt = 0; 3126 3127 if (bdev_io->u.bdev.md_buf) { 3128 md_buf = (char *)bdev_io->u.bdev.md_buf + 3129 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3130 } 3131 3132 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3133 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3134 iovcnt < child_iovsize) { 3135 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3136 iov_len = parent_iov->iov_len - parent_iov_offset; 3137 3138 iov_len = spdk_min(iov_len, max_segment_size); 3139 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3140 to_next_boundary_bytes -= iov_len; 3141 3142 
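			/* Append the carved-out segment to the child iovec, then either advance
			 * within the current parent element or move on to the next one once it
			 * has been fully consumed.
			 */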
			bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
			bdev_io->child_iov[child_iovcnt].iov_len = iov_len;

			if (iov_len < parent_iov->iov_len - parent_iov_offset) {
				parent_iov_offset += iov_len;
			} else {
				parent_iovpos++;
				parent_iov_offset = 0;
			}
			child_iovcnt++;
			iovcnt++;
		}

		if (to_next_boundary_bytes > 0) {
			/* We had to stop this child I/O early because we ran out of
			 * child_iov space or were limited by max_num_segments.
			 * Ensure the iovs are aligned with the block size and
			 * then adjust to_next_boundary before starting the
			 * child I/O.
			 */
			assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
			       iovcnt == child_iovsize);
			to_last_block_bytes = to_next_boundary_bytes % blocklen;
			if (to_last_block_bytes != 0) {
				uint32_t child_iovpos = child_iovcnt - 1;
				/* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV
				 * so the loop will naturally end
				 */

				to_last_block_bytes = blocklen - to_last_block_bytes;
				to_next_boundary_bytes += to_last_block_bytes;
				while (to_last_block_bytes > 0 && iovcnt > 0) {
					iov_len = spdk_min(to_last_block_bytes,
							   bdev_io->child_iov[child_iovpos].iov_len);
					bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
					if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
						child_iovpos--;
						if (--iovcnt == 0) {
							/* If the child IO is less than a block size just return.
							 * If the first child IO of any split round is less than
							 * a block size, the split fails with an error.
							 */
							if (bdev_io->u.bdev.split_outstanding == 0) {
								SPDK_ERRLOG("The first child io was less than a block size\n");
								bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
								spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
								TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
								bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
							}

							return;
						}
					}

					to_last_block_bytes -= iov_len;

					if (parent_iov_offset == 0) {
						parent_iovpos--;
						parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
					}
					parent_iov_offset -= iov_len;
				}

				assert(to_last_block_bytes == 0);
			}
			to_next_boundary -= to_next_boundary_bytes / blocklen;
		}

		rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
					  &current_offset, &remaining);
		if (spdk_unlikely(rc)) {
			return;
		}
	}
}

static void
bdev_unmap_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
	uint32_t num_children_reqs = 0;
	int rc;

	offset = bdev_io->u.bdev.split_current_offset_blocks;
	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
	max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;

	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
		unmap_blocks = spdk_min(remaining, max_unmap_blocks);

		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
					  &offset, &remaining);
		if (spdk_likely(rc == 0)) {
			num_children_reqs++;
		} else {
			return;
		}
	}
}

static void
bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset, write_zeroes_blocks, remaining;
	uint32_t
num_children_reqs = 0; 3247 int rc; 3248 3249 offset = bdev_io->u.bdev.split_current_offset_blocks; 3250 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3251 3252 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3253 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3254 3255 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3256 &offset, &remaining); 3257 if (spdk_likely(rc == 0)) { 3258 num_children_reqs++; 3259 } else { 3260 return; 3261 } 3262 } 3263 } 3264 3265 static void 3266 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3267 { 3268 uint64_t offset, copy_blocks, remaining; 3269 uint32_t num_children_reqs = 0; 3270 int rc; 3271 3272 offset = bdev_io->u.bdev.split_current_offset_blocks; 3273 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3274 3275 assert(bdev_io->bdev->max_copy != 0); 3276 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3277 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3278 3279 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3280 &offset, &remaining); 3281 if (spdk_likely(rc == 0)) { 3282 num_children_reqs++; 3283 } else { 3284 return; 3285 } 3286 } 3287 } 3288 3289 static void 3290 parent_bdev_io_complete(void *ctx, int rc) 3291 { 3292 struct spdk_bdev_io *parent_io = ctx; 3293 3294 if (rc) { 3295 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3296 } 3297 3298 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3299 parent_io->internal.caller_ctx); 3300 } 3301 3302 static void 3303 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3304 { 3305 struct spdk_bdev_io *bdev_io = ctx; 3306 3307 /* u.bdev.accel_sequence should have already been cleared at this point */ 3308 assert(bdev_io->u.bdev.accel_sequence == NULL); 3309 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3310 bdev_io->internal.accel_sequence = NULL; 3311 3312 if (spdk_unlikely(status != 0)) { 3313 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3314 } 3315 3316 parent_bdev_io_complete(bdev_io, status); 3317 } 3318 3319 static void 3320 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3321 { 3322 struct spdk_bdev_io *parent_io = cb_arg; 3323 3324 spdk_bdev_free_io(bdev_io); 3325 3326 if (!success) { 3327 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3328 /* If any child I/O failed, stop further splitting process. */ 3329 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3330 parent_io->u.bdev.split_remaining_num_blocks = 0; 3331 } 3332 parent_io->u.bdev.split_outstanding--; 3333 if (parent_io->u.bdev.split_outstanding != 0) { 3334 return; 3335 } 3336 3337 /* 3338 * Parent I/O finishes when all blocks are consumed. 
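	 * (split_outstanding is already zero here, so no child of this round is still in flight.)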
3339 */ 3340 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3341 assert(parent_io->internal.cb != bdev_io_split_done); 3342 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3343 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3344 3345 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3346 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3347 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3348 return; 3349 } else if (parent_io->internal.orig_iovcnt != 0 && 3350 !bdev_io_use_accel_sequence(bdev_io)) { 3351 /* bdev IO will be completed in the callback */ 3352 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3353 return; 3354 } 3355 } 3356 3357 parent_bdev_io_complete(parent_io, 0); 3358 return; 3359 } 3360 3361 /* 3362 * Continue with the splitting process. This function will complete the parent I/O if the 3363 * splitting is done. 3364 */ 3365 switch (parent_io->type) { 3366 case SPDK_BDEV_IO_TYPE_READ: 3367 case SPDK_BDEV_IO_TYPE_WRITE: 3368 _bdev_rw_split(parent_io); 3369 break; 3370 case SPDK_BDEV_IO_TYPE_UNMAP: 3371 bdev_unmap_split(parent_io); 3372 break; 3373 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3374 bdev_write_zeroes_split(parent_io); 3375 break; 3376 case SPDK_BDEV_IO_TYPE_COPY: 3377 bdev_copy_split(parent_io); 3378 break; 3379 default: 3380 assert(false); 3381 break; 3382 } 3383 } 3384 3385 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3386 bool success); 3387 3388 static void 3389 bdev_io_split(struct spdk_bdev_io *bdev_io) 3390 { 3391 assert(bdev_io_should_split(bdev_io)); 3392 3393 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3394 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3395 bdev_io->u.bdev.split_outstanding = 0; 3396 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3397 3398 switch (bdev_io->type) { 3399 case SPDK_BDEV_IO_TYPE_READ: 3400 case SPDK_BDEV_IO_TYPE_WRITE: 3401 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3402 _bdev_rw_split(bdev_io); 3403 } else { 3404 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3405 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3406 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3407 } 3408 break; 3409 case SPDK_BDEV_IO_TYPE_UNMAP: 3410 bdev_unmap_split(bdev_io); 3411 break; 3412 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3413 bdev_write_zeroes_split(bdev_io); 3414 break; 3415 case SPDK_BDEV_IO_TYPE_COPY: 3416 bdev_copy_split(bdev_io); 3417 break; 3418 default: 3419 assert(false); 3420 break; 3421 } 3422 } 3423 3424 static void 3425 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3426 { 3427 if (!success) { 3428 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3429 return; 3430 } 3431 3432 _bdev_rw_split(bdev_io); 3433 } 3434 3435 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3436 * be inlined, at least on some compilers. 
3437 */ 3438 static inline void 3439 _bdev_io_submit(void *ctx) 3440 { 3441 struct spdk_bdev_io *bdev_io = ctx; 3442 struct spdk_bdev *bdev = bdev_io->bdev; 3443 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3444 3445 if (spdk_likely(bdev_ch->flags == 0)) { 3446 bdev_io_do_submit(bdev_ch, bdev_io); 3447 return; 3448 } 3449 3450 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3451 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3452 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3453 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3454 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3455 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3456 } else { 3457 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3458 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3459 } 3460 } else { 3461 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3462 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3463 } 3464 } 3465 3466 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3467 3468 bool 3469 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3470 { 3471 if (range1->length == 0 || range2->length == 0) { 3472 return false; 3473 } 3474 3475 if (range1->offset + range1->length <= range2->offset) { 3476 return false; 3477 } 3478 3479 if (range2->offset + range2->length <= range1->offset) { 3480 return false; 3481 } 3482 3483 return true; 3484 } 3485 3486 static bool 3487 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3488 { 3489 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3490 struct lba_range r; 3491 3492 switch (bdev_io->type) { 3493 case SPDK_BDEV_IO_TYPE_NVME_IO: 3494 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3495 /* Don't try to decode the NVMe command - just assume worst-case and that 3496 * it overlaps a locked range. 3497 */ 3498 return true; 3499 case SPDK_BDEV_IO_TYPE_READ: 3500 if (!range->quiesce) { 3501 return false; 3502 } 3503 /* fallthrough */ 3504 case SPDK_BDEV_IO_TYPE_WRITE: 3505 case SPDK_BDEV_IO_TYPE_UNMAP: 3506 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3507 case SPDK_BDEV_IO_TYPE_ZCOPY: 3508 case SPDK_BDEV_IO_TYPE_COPY: 3509 r.offset = bdev_io->u.bdev.offset_blocks; 3510 r.length = bdev_io->u.bdev.num_blocks; 3511 if (!bdev_lba_range_overlapped(range, &r)) { 3512 /* This I/O doesn't overlap the specified LBA range. */ 3513 return false; 3514 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3515 /* This I/O overlaps, but the I/O is on the same channel that locked this 3516 * range, and the caller_ctx is the same as the locked_ctx. This means 3517 * that this I/O is associated with the lock, and is allowed to execute. 
3518 */ 3519 return false; 3520 } else { 3521 return true; 3522 } 3523 default: 3524 return false; 3525 } 3526 } 3527 3528 void 3529 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3530 { 3531 struct spdk_bdev *bdev = bdev_io->bdev; 3532 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3533 3534 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3535 3536 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3537 struct lba_range *range; 3538 3539 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3540 if (bdev_io_range_is_locked(bdev_io, range)) { 3541 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3542 return; 3543 } 3544 } 3545 } 3546 3547 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3548 3549 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3550 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3551 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3552 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3553 spdk_bdev_get_name(bdev)); 3554 3555 if (bdev_io->internal.split) { 3556 bdev_io_split(bdev_io); 3557 return; 3558 } 3559 3560 _bdev_io_submit(bdev_io); 3561 } 3562 3563 static inline void 3564 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3565 { 3566 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3567 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3568 * For write operation we need to pull buffers from memory domain before submitting IO. 3569 * Once read operation completes, we need to use memory_domain push functionality to 3570 * update data in original memory domain IO buffer 3571 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3572 bdev_io->u.bdev.memory_domain = NULL; 3573 bdev_io->u.bdev.memory_domain_ctx = NULL; 3574 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3575 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3576 } 3577 3578 static inline void 3579 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3580 { 3581 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3582 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3583 3584 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3585 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3586 bdev_io_complete_unsubmitted(bdev_io); 3587 return; 3588 } 3589 3590 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3591 * support them, but we need to execute an accel sequence and the data buffer is from accel 3592 * memory domain (to avoid doing a push/pull from that domain). 
3593 */ 3594 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3595 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3596 _bdev_io_ext_use_bounce_buffer(bdev_io); 3597 return; 3598 } 3599 3600 if (needs_exec) { 3601 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3602 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3603 return; 3604 } 3605 /* For reads we'll execute the sequence after the data is read, so, for now, only 3606 * clear out accel_sequence pointer and submit the IO */ 3607 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3608 bdev_io->u.bdev.accel_sequence = NULL; 3609 } 3610 3611 bdev_io_submit(bdev_io); 3612 } 3613 3614 static void 3615 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3616 { 3617 struct spdk_bdev *bdev = bdev_io->bdev; 3618 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3619 struct spdk_io_channel *ch = bdev_ch->channel; 3620 3621 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3622 3623 bdev_io->internal.in_submit_request = true; 3624 bdev_submit_request(bdev, ch, bdev_io); 3625 bdev_io->internal.in_submit_request = false; 3626 } 3627 3628 void 3629 bdev_io_init(struct spdk_bdev_io *bdev_io, 3630 struct spdk_bdev *bdev, void *cb_arg, 3631 spdk_bdev_io_completion_cb cb) 3632 { 3633 bdev_io->bdev = bdev; 3634 bdev_io->internal.caller_ctx = cb_arg; 3635 bdev_io->internal.cb = cb; 3636 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3637 bdev_io->internal.in_submit_request = false; 3638 bdev_io->internal.buf = NULL; 3639 bdev_io->internal.orig_iovs = NULL; 3640 bdev_io->internal.orig_iovcnt = 0; 3641 bdev_io->internal.orig_md_iov.iov_base = NULL; 3642 bdev_io->internal.error.nvme.cdw0 = 0; 3643 bdev_io->num_retries = 0; 3644 bdev_io->internal.get_buf_cb = NULL; 3645 bdev_io->internal.get_aux_buf_cb = NULL; 3646 bdev_io->internal.memory_domain = NULL; 3647 bdev_io->internal.memory_domain_ctx = NULL; 3648 bdev_io->internal.data_transfer_cpl = NULL; 3649 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3650 bdev_io->internal.accel_sequence = NULL; 3651 bdev_io->internal.has_accel_sequence = false; 3652 } 3653 3654 static bool 3655 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3656 { 3657 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3658 } 3659 3660 bool 3661 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3662 { 3663 bool supported; 3664 3665 supported = bdev_io_type_supported(bdev, io_type); 3666 3667 if (!supported) { 3668 switch (io_type) { 3669 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3670 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ 3671 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3672 break; 3673 default: 3674 break; 3675 } 3676 } 3677 3678 return supported; 3679 } 3680 3681 uint64_t 3682 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3683 { 3684 return bdev_io->internal.submit_tsc; 3685 } 3686 3687 int 3688 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3689 { 3690 if (bdev->fn_table->dump_info_json) { 3691 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3692 } 3693 3694 return 0; 3695 } 3696 3697 static void 3698 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3699 { 3700 uint32_t max_per_timeslice = 0; 3701 int i; 3702 3703 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3704 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3705 qos->rate_limits[i].max_per_timeslice = 0; 3706 continue; 3707 } 3708 3709 max_per_timeslice = qos->rate_limits[i].limit * 3710 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3711 3712 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3713 qos->rate_limits[i].min_per_timeslice); 3714 3715 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3716 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE); 3717 } 3718 3719 bdev_qos_set_ops(qos); 3720 } 3721 3722 static void 3723 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3724 struct spdk_io_channel *io_ch, void *ctx) 3725 { 3726 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3727 int status; 3728 3729 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3730 3731 /* if all IOs were sent then continue the iteration, otherwise - stop it */ 3732 /* TODO: channels round robing */ 3733 status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 0 : 1; 3734 3735 spdk_bdev_for_each_channel_continue(i, status); 3736 } 3737 3738 3739 static void 3740 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status) 3741 { 3742 3743 } 3744 3745 static int 3746 bdev_channel_poll_qos(void *arg) 3747 { 3748 struct spdk_bdev *bdev = arg; 3749 struct spdk_bdev_qos *qos = bdev->internal.qos; 3750 uint64_t now = spdk_get_ticks(); 3751 int i; 3752 int64_t remaining_last_timeslice; 3753 3754 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3755 /* We received our callback earlier than expected - return 3756 * immediately and wait to do accounting until at least one 3757 * timeslice has actually expired. This should never happen 3758 * with a well-behaved timer implementation. 3759 */ 3760 return SPDK_POLLER_IDLE; 3761 } 3762 3763 /* Reset for next round of rate limiting */ 3764 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3765 /* We may have allowed the IOs or bytes to slightly overrun in the last 3766 * timeslice. remaining_this_timeslice is signed, so if it's negative 3767 * here, we'll account for the overrun so that the next timeslice will 3768 * be appropriately reduced. 3769 */ 3770 remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice, 3771 0, __ATOMIC_RELAXED); 3772 if (remaining_last_timeslice < 0) { 3773 /* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos() 3774 * potentially use 2 atomic ops each, so they can intertwine. 3775 * This race can potentialy cause the limits to be a little fuzzy but won't cause any real damage. 
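 * For example, a decrement done by a submitting thread between the exchange above
 * and the store below is overwritten, so a timeslice can end up slightly over- or
 * under-charged.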
3776 */ 3777 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3778 remaining_last_timeslice, __ATOMIC_RELAXED); 3779 } 3780 } 3781 3782 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3783 qos->last_timeslice += qos->timeslice_size; 3784 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3785 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 3786 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 3787 } 3788 } 3789 3790 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 3791 bdev_channel_submit_qos_io_done); 3792 3793 return SPDK_POLLER_BUSY; 3794 } 3795 3796 static void 3797 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3798 { 3799 struct spdk_bdev_shared_resource *shared_resource; 3800 struct lba_range *range; 3801 3802 bdev_free_io_stat(ch->stat); 3803 #ifdef SPDK_CONFIG_VTUNE 3804 bdev_free_io_stat(ch->prev_stat); 3805 #endif 3806 3807 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3808 range = TAILQ_FIRST(&ch->locked_ranges); 3809 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3810 free(range); 3811 } 3812 3813 spdk_put_io_channel(ch->channel); 3814 spdk_put_io_channel(ch->accel_channel); 3815 3816 shared_resource = ch->shared_resource; 3817 3818 assert(TAILQ_EMPTY(&ch->io_locked)); 3819 assert(TAILQ_EMPTY(&ch->io_submitted)); 3820 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3821 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3822 assert(ch->io_outstanding == 0); 3823 assert(shared_resource->ref > 0); 3824 shared_resource->ref--; 3825 if (shared_resource->ref == 0) { 3826 assert(shared_resource->io_outstanding == 0); 3827 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3828 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3829 spdk_poller_unregister(&shared_resource->nomem_poller); 3830 free(shared_resource); 3831 } 3832 } 3833 3834 static void 3835 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3836 { 3837 struct spdk_bdev_qos *qos = bdev->internal.qos; 3838 int i; 3839 3840 assert(spdk_spin_held(&bdev->internal.spinlock)); 3841 3842 /* Rate limiting on this bdev enabled */ 3843 if (qos) { 3844 if (qos->ch == NULL) { 3845 struct spdk_io_channel *io_ch; 3846 3847 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3848 bdev->name, spdk_get_thread()); 3849 3850 /* No qos channel has been selected, so set one up */ 3851 3852 /* Take another reference to ch */ 3853 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3854 assert(io_ch != NULL); 3855 qos->ch = ch; 3856 3857 qos->thread = spdk_io_channel_get_thread(io_ch); 3858 3859 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3860 if (bdev_qos_is_iops_rate_limit(i) == true) { 3861 qos->rate_limits[i].min_per_timeslice = 3862 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3863 } else { 3864 qos->rate_limits[i].min_per_timeslice = 3865 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3866 } 3867 3868 if (qos->rate_limits[i].limit == 0) { 3869 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3870 } 3871 } 3872 bdev_qos_update_max_quota_per_timeslice(qos); 3873 qos->timeslice_size = 3874 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3875 qos->last_timeslice = spdk_get_ticks(); 3876 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3877 bdev, 3878 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3879 } 3880 3881 ch->flags |= BDEV_CH_QOS_ENABLED; 3882 } 3883 } 3884 3885 struct poll_timeout_ctx { 3886 struct spdk_bdev_desc 
*desc; 3887 uint64_t timeout_in_sec; 3888 spdk_bdev_io_timeout_cb cb_fn; 3889 void *cb_arg; 3890 }; 3891 3892 static void 3893 bdev_desc_free(struct spdk_bdev_desc *desc) 3894 { 3895 spdk_spin_destroy(&desc->spinlock); 3896 free(desc->media_events_buffer); 3897 free(desc); 3898 } 3899 3900 static void 3901 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3902 { 3903 struct poll_timeout_ctx *ctx = _ctx; 3904 struct spdk_bdev_desc *desc = ctx->desc; 3905 3906 free(ctx); 3907 3908 spdk_spin_lock(&desc->spinlock); 3909 desc->refs--; 3910 if (desc->closed == true && desc->refs == 0) { 3911 spdk_spin_unlock(&desc->spinlock); 3912 bdev_desc_free(desc); 3913 return; 3914 } 3915 spdk_spin_unlock(&desc->spinlock); 3916 } 3917 3918 static void 3919 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3920 struct spdk_io_channel *io_ch, void *_ctx) 3921 { 3922 struct poll_timeout_ctx *ctx = _ctx; 3923 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3924 struct spdk_bdev_desc *desc = ctx->desc; 3925 struct spdk_bdev_io *bdev_io; 3926 uint64_t now; 3927 3928 spdk_spin_lock(&desc->spinlock); 3929 if (desc->closed == true) { 3930 spdk_spin_unlock(&desc->spinlock); 3931 spdk_bdev_for_each_channel_continue(i, -1); 3932 return; 3933 } 3934 spdk_spin_unlock(&desc->spinlock); 3935 3936 now = spdk_get_ticks(); 3937 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3938 /* Exclude any I/O that are generated via splitting. */ 3939 if (bdev_io->internal.cb == bdev_io_split_done) { 3940 continue; 3941 } 3942 3943 /* Once we find an I/O that has not timed out, we can immediately 3944 * exit the loop. 3945 */ 3946 if (now < (bdev_io->internal.submit_tsc + 3947 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3948 goto end; 3949 } 3950 3951 if (bdev_io->internal.desc == desc) { 3952 ctx->cb_fn(ctx->cb_arg, bdev_io); 3953 } 3954 } 3955 3956 end: 3957 spdk_bdev_for_each_channel_continue(i, 0); 3958 } 3959 3960 static int 3961 bdev_poll_timeout_io(void *arg) 3962 { 3963 struct spdk_bdev_desc *desc = arg; 3964 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3965 struct poll_timeout_ctx *ctx; 3966 3967 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3968 if (!ctx) { 3969 SPDK_ERRLOG("failed to allocate memory\n"); 3970 return SPDK_POLLER_BUSY; 3971 } 3972 ctx->desc = desc; 3973 ctx->cb_arg = desc->cb_arg; 3974 ctx->cb_fn = desc->cb_fn; 3975 ctx->timeout_in_sec = desc->timeout_in_sec; 3976 3977 /* Take a ref on the descriptor in case it gets closed while we are checking 3978 * all of the channels. 
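 * The reference is dropped in bdev_channel_poll_timeout_io_done(); if the
 * descriptor was closed in the meantime and this was the last reference, the
 * descriptor is freed there as well.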
3979 */ 3980 spdk_spin_lock(&desc->spinlock); 3981 desc->refs++; 3982 spdk_spin_unlock(&desc->spinlock); 3983 3984 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3985 bdev_channel_poll_timeout_io_done); 3986 3987 return SPDK_POLLER_BUSY; 3988 } 3989 3990 int 3991 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3992 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3993 { 3994 assert(desc->thread == spdk_get_thread()); 3995 3996 spdk_poller_unregister(&desc->io_timeout_poller); 3997 3998 if (timeout_in_sec) { 3999 assert(cb_fn != NULL); 4000 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4001 desc, 4002 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4003 1000); 4004 if (desc->io_timeout_poller == NULL) { 4005 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4006 return -1; 4007 } 4008 } 4009 4010 desc->cb_fn = cb_fn; 4011 desc->cb_arg = cb_arg; 4012 desc->timeout_in_sec = timeout_in_sec; 4013 4014 return 0; 4015 } 4016 4017 static int 4018 bdev_channel_create(void *io_device, void *ctx_buf) 4019 { 4020 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4021 struct spdk_bdev_channel *ch = ctx_buf; 4022 struct spdk_io_channel *mgmt_io_ch; 4023 struct spdk_bdev_mgmt_channel *mgmt_ch; 4024 struct spdk_bdev_shared_resource *shared_resource; 4025 struct lba_range *range; 4026 4027 ch->bdev = bdev; 4028 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4029 if (!ch->channel) { 4030 return -1; 4031 } 4032 4033 ch->accel_channel = spdk_accel_get_io_channel(); 4034 if (!ch->accel_channel) { 4035 spdk_put_io_channel(ch->channel); 4036 return -1; 4037 } 4038 4039 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 4040 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4041 4042 assert(ch->histogram == NULL); 4043 if (bdev->internal.histogram_enabled) { 4044 ch->histogram = spdk_histogram_data_alloc(); 4045 if (ch->histogram == NULL) { 4046 SPDK_ERRLOG("Could not allocate histogram\n"); 4047 } 4048 } 4049 4050 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4051 if (!mgmt_io_ch) { 4052 spdk_put_io_channel(ch->channel); 4053 spdk_put_io_channel(ch->accel_channel); 4054 return -1; 4055 } 4056 4057 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4058 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4059 if (shared_resource->shared_ch == ch->channel) { 4060 spdk_put_io_channel(mgmt_io_ch); 4061 shared_resource->ref++; 4062 break; 4063 } 4064 } 4065 4066 if (shared_resource == NULL) { 4067 shared_resource = calloc(1, sizeof(*shared_resource)); 4068 if (shared_resource == NULL) { 4069 spdk_put_io_channel(ch->channel); 4070 spdk_put_io_channel(ch->accel_channel); 4071 spdk_put_io_channel(mgmt_io_ch); 4072 return -1; 4073 } 4074 4075 shared_resource->mgmt_ch = mgmt_ch; 4076 shared_resource->io_outstanding = 0; 4077 TAILQ_INIT(&shared_resource->nomem_io); 4078 shared_resource->nomem_threshold = 0; 4079 shared_resource->shared_ch = ch->channel; 4080 shared_resource->ref = 1; 4081 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4082 } 4083 4084 ch->io_outstanding = 0; 4085 TAILQ_INIT(&ch->queued_resets); 4086 TAILQ_INIT(&ch->locked_ranges); 4087 TAILQ_INIT(&ch->qos_queued_io); 4088 ch->flags = 0; 4089 ch->shared_resource = shared_resource; 4090 4091 TAILQ_INIT(&ch->io_submitted); 4092 TAILQ_INIT(&ch->io_locked); 4093 TAILQ_INIT(&ch->io_accel_exec); 4094 TAILQ_INIT(&ch->io_memory_domain); 4095 4096 ch->stat = bdev_alloc_io_stat(false); 4097 if 
(ch->stat == NULL) { 4098 bdev_channel_destroy_resource(ch); 4099 return -1; 4100 } 4101 4102 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4103 4104 #ifdef SPDK_CONFIG_VTUNE 4105 { 4106 char *name; 4107 __itt_init_ittlib(NULL, 0); 4108 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4109 if (!name) { 4110 bdev_channel_destroy_resource(ch); 4111 return -1; 4112 } 4113 ch->handle = __itt_string_handle_create(name); 4114 free(name); 4115 ch->start_tsc = spdk_get_ticks(); 4116 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4117 ch->prev_stat = bdev_alloc_io_stat(false); 4118 if (ch->prev_stat == NULL) { 4119 bdev_channel_destroy_resource(ch); 4120 return -1; 4121 } 4122 } 4123 #endif 4124 4125 spdk_spin_lock(&bdev->internal.spinlock); 4126 bdev_enable_qos(bdev, ch); 4127 4128 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4129 struct lba_range *new_range; 4130 4131 new_range = calloc(1, sizeof(*new_range)); 4132 if (new_range == NULL) { 4133 spdk_spin_unlock(&bdev->internal.spinlock); 4134 bdev_channel_destroy_resource(ch); 4135 return -1; 4136 } 4137 new_range->length = range->length; 4138 new_range->offset = range->offset; 4139 new_range->locked_ctx = range->locked_ctx; 4140 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4141 } 4142 4143 spdk_spin_unlock(&bdev->internal.spinlock); 4144 4145 return 0; 4146 } 4147 4148 static int 4149 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4150 void *cb_ctx) 4151 { 4152 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4153 struct spdk_bdev_io *bdev_io; 4154 uint64_t buf_len; 4155 4156 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4157 if (bdev_io->internal.ch == bdev_ch) { 4158 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4159 spdk_iobuf_entry_abort(ch, entry, buf_len); 4160 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4161 } 4162 4163 return 0; 4164 } 4165 4166 /* 4167 * Abort I/O that are waiting on a data buffer. 4168 */ 4169 static void 4170 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4171 { 4172 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4173 bdev_abort_all_buf_io_cb, ch); 4174 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4175 bdev_abort_all_buf_io_cb, ch); 4176 } 4177 4178 /* 4179 * Abort I/O that are queued waiting for submission. These types of I/O are 4180 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4181 */ 4182 static void 4183 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4184 { 4185 struct spdk_bdev_io *bdev_io, *tmp; 4186 4187 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4188 if (bdev_io->internal.ch == ch) { 4189 TAILQ_REMOVE(queue, bdev_io, internal.link); 4190 /* 4191 * spdk_bdev_io_complete() assumes that the completed I/O had 4192 * been submitted to the bdev module. Since in this case it 4193 * hadn't, bump io_outstanding to account for the decrement 4194 * that spdk_bdev_io_complete() will do. 
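 * Reset I/O is skipped below because queued resets are not accounted in
 * io_outstanding, so spdk_bdev_io_complete() will not decrement it for them.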
4195 */ 4196 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4197 bdev_io_increment_outstanding(ch, ch->shared_resource); 4198 } 4199 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4200 } 4201 } 4202 } 4203 4204 static bool 4205 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4206 { 4207 struct spdk_bdev_io *bdev_io; 4208 4209 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4210 if (bdev_io == bio_to_abort) { 4211 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4212 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4213 return true; 4214 } 4215 } 4216 4217 return false; 4218 } 4219 4220 static int 4221 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4222 { 4223 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4224 uint64_t buf_len; 4225 4226 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4227 if (bdev_io == bio_to_abort) { 4228 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4229 spdk_iobuf_entry_abort(ch, entry, buf_len); 4230 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4231 return 1; 4232 } 4233 4234 return 0; 4235 } 4236 4237 static bool 4238 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4239 { 4240 int rc; 4241 4242 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4243 bdev_abort_buf_io_cb, bio_to_abort); 4244 if (rc == 1) { 4245 return true; 4246 } 4247 4248 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4249 bdev_abort_buf_io_cb, bio_to_abort); 4250 return rc == 1; 4251 } 4252 4253 static void 4254 bdev_qos_channel_destroy(void *cb_arg) 4255 { 4256 struct spdk_bdev_qos *qos = cb_arg; 4257 4258 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4259 spdk_poller_unregister(&qos->poller); 4260 4261 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4262 4263 free(qos); 4264 } 4265 4266 static int 4267 bdev_qos_destroy(struct spdk_bdev *bdev) 4268 { 4269 int i; 4270 4271 /* 4272 * Cleanly shutting down the QoS poller is tricky, because 4273 * during the asynchronous operation the user could open 4274 * a new descriptor and create a new channel, spawning 4275 * a new QoS poller. 4276 * 4277 * The strategy is to create a new QoS structure here and swap it 4278 * in. The shutdown path then continues to refer to the old one 4279 * until it completes and then releases it. 4280 */ 4281 struct spdk_bdev_qos *new_qos, *old_qos; 4282 4283 old_qos = bdev->internal.qos; 4284 4285 new_qos = calloc(1, sizeof(*new_qos)); 4286 if (!new_qos) { 4287 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4288 return -ENOMEM; 4289 } 4290 4291 /* Copy the old QoS data into the newly allocated structure */ 4292 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4293 4294 /* Zero out the key parts of the QoS structure */ 4295 new_qos->ch = NULL; 4296 new_qos->thread = NULL; 4297 new_qos->poller = NULL; 4298 /* 4299 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4300 * It will be used later for the new QoS structure. 
4301 */ 4302 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4303 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4304 new_qos->rate_limits[i].min_per_timeslice = 0; 4305 new_qos->rate_limits[i].max_per_timeslice = 0; 4306 } 4307 4308 bdev->internal.qos = new_qos; 4309 4310 if (old_qos->thread == NULL) { 4311 free(old_qos); 4312 } else { 4313 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4314 } 4315 4316 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4317 * been destroyed yet. The destruction path will end up waiting for the final 4318 * channel to be put before it releases resources. */ 4319 4320 return 0; 4321 } 4322 4323 void 4324 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4325 { 4326 total->bytes_read += add->bytes_read; 4327 total->num_read_ops += add->num_read_ops; 4328 total->bytes_written += add->bytes_written; 4329 total->num_write_ops += add->num_write_ops; 4330 total->bytes_unmapped += add->bytes_unmapped; 4331 total->num_unmap_ops += add->num_unmap_ops; 4332 total->bytes_copied += add->bytes_copied; 4333 total->num_copy_ops += add->num_copy_ops; 4334 total->read_latency_ticks += add->read_latency_ticks; 4335 total->write_latency_ticks += add->write_latency_ticks; 4336 total->unmap_latency_ticks += add->unmap_latency_ticks; 4337 total->copy_latency_ticks += add->copy_latency_ticks; 4338 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4339 total->max_read_latency_ticks = add->max_read_latency_ticks; 4340 } 4341 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4342 total->min_read_latency_ticks = add->min_read_latency_ticks; 4343 } 4344 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4345 total->max_write_latency_ticks = add->max_write_latency_ticks; 4346 } 4347 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4348 total->min_write_latency_ticks = add->min_write_latency_ticks; 4349 } 4350 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4351 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4352 } 4353 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4354 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4355 } 4356 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4357 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4358 } 4359 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4360 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4361 } 4362 } 4363 4364 static void 4365 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4366 { 4367 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4368 4369 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4370 memcpy(to_stat->io_error, from_stat->io_error, 4371 sizeof(struct spdk_bdev_io_error_stat)); 4372 } 4373 } 4374 4375 void 4376 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4377 { 4378 stat->max_read_latency_ticks = 0; 4379 stat->min_read_latency_ticks = UINT64_MAX; 4380 stat->max_write_latency_ticks = 0; 4381 stat->min_write_latency_ticks = UINT64_MAX; 4382 stat->max_unmap_latency_ticks = 0; 4383 stat->min_unmap_latency_ticks = UINT64_MAX; 4384 stat->max_copy_latency_ticks = 0; 4385 stat->min_copy_latency_ticks = UINT64_MAX; 4386 4387 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4388 return; 4389 } 
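	/* Only SPDK_BDEV_RESET_STAT_ALL reaches this point: in addition to the min/max
	 * latency watermarks reset above, clear the byte/operation counters, the
	 * accumulated latencies and any per-error-code counts below.
	 */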
4390 4391 stat->bytes_read = 0; 4392 stat->num_read_ops = 0; 4393 stat->bytes_written = 0; 4394 stat->num_write_ops = 0; 4395 stat->bytes_unmapped = 0; 4396 stat->num_unmap_ops = 0; 4397 stat->bytes_copied = 0; 4398 stat->num_copy_ops = 0; 4399 stat->read_latency_ticks = 0; 4400 stat->write_latency_ticks = 0; 4401 stat->unmap_latency_ticks = 0; 4402 stat->copy_latency_ticks = 0; 4403 4404 if (stat->io_error != NULL) { 4405 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4406 } 4407 } 4408 4409 struct spdk_bdev_io_stat * 4410 bdev_alloc_io_stat(bool io_error_stat) 4411 { 4412 struct spdk_bdev_io_stat *stat; 4413 4414 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4415 if (stat == NULL) { 4416 return NULL; 4417 } 4418 4419 if (io_error_stat) { 4420 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4421 if (stat->io_error == NULL) { 4422 free(stat); 4423 return NULL; 4424 } 4425 } else { 4426 stat->io_error = NULL; 4427 } 4428 4429 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4430 4431 return stat; 4432 } 4433 4434 void 4435 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4436 { 4437 if (stat != NULL) { 4438 free(stat->io_error); 4439 free(stat); 4440 } 4441 } 4442 4443 void 4444 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4445 { 4446 int i; 4447 4448 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4449 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4450 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4451 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4452 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4453 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4454 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4455 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4456 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4457 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4458 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4459 stat->min_read_latency_ticks != UINT64_MAX ? 4460 stat->min_read_latency_ticks : 0); 4461 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4462 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4463 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4464 stat->min_write_latency_ticks != UINT64_MAX ? 4465 stat->min_write_latency_ticks : 0); 4466 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4467 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4468 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4469 stat->min_unmap_latency_ticks != UINT64_MAX ? 4470 stat->min_unmap_latency_ticks : 0); 4471 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4472 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4473 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4474 stat->min_copy_latency_ticks != UINT64_MAX ? 
4475 stat->min_copy_latency_ticks : 0); 4476 4477 if (stat->io_error != NULL) { 4478 spdk_json_write_named_object_begin(w, "io_error"); 4479 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4480 if (stat->io_error->error_status[i] != 0) { 4481 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4482 stat->io_error->error_status[i]); 4483 } 4484 } 4485 spdk_json_write_object_end(w); 4486 } 4487 } 4488 4489 static void 4490 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4491 { 4492 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4493 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4494 4495 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4496 bdev_abort_all_buf_io(mgmt_ch, ch); 4497 } 4498 4499 static void 4500 bdev_channel_destroy(void *io_device, void *ctx_buf) 4501 { 4502 struct spdk_bdev_channel *ch = ctx_buf; 4503 4504 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4505 spdk_get_thread()); 4506 4507 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 4508 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4509 4510 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4511 spdk_spin_lock(&ch->bdev->internal.spinlock); 4512 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4513 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4514 4515 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4516 4517 bdev_channel_abort_queued_ios(ch); 4518 4519 if (ch->histogram) { 4520 spdk_histogram_data_free(ch->histogram); 4521 } 4522 4523 bdev_channel_destroy_resource(ch); 4524 } 4525 4526 /* 4527 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4528 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
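 * RB_INSERT() returns NULL when the new node is inserted, or a pointer to the
 * existing node on a name collision, so the duplicate check and the insertion
 * are a single tree operation performed under g_bdev_mgr.spinlock.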
4529 */ 4530 static int 4531 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4532 { 4533 struct spdk_bdev_name *tmp; 4534 4535 bdev_name->name = strdup(name); 4536 if (bdev_name->name == NULL) { 4537 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4538 return -ENOMEM; 4539 } 4540 4541 bdev_name->bdev = bdev; 4542 4543 spdk_spin_lock(&g_bdev_mgr.spinlock); 4544 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4545 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4546 4547 if (tmp != NULL) { 4548 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4549 free(bdev_name->name); 4550 return -EEXIST; 4551 } 4552 4553 return 0; 4554 } 4555 4556 static void 4557 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4558 { 4559 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4560 free(bdev_name->name); 4561 } 4562 4563 static void 4564 bdev_name_del(struct spdk_bdev_name *bdev_name) 4565 { 4566 spdk_spin_lock(&g_bdev_mgr.spinlock); 4567 bdev_name_del_unsafe(bdev_name); 4568 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4569 } 4570 4571 int 4572 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4573 { 4574 struct spdk_bdev_alias *tmp; 4575 int ret; 4576 4577 if (alias == NULL) { 4578 SPDK_ERRLOG("Empty alias passed\n"); 4579 return -EINVAL; 4580 } 4581 4582 tmp = calloc(1, sizeof(*tmp)); 4583 if (tmp == NULL) { 4584 SPDK_ERRLOG("Unable to allocate alias\n"); 4585 return -ENOMEM; 4586 } 4587 4588 ret = bdev_name_add(&tmp->alias, bdev, alias); 4589 if (ret != 0) { 4590 free(tmp); 4591 return ret; 4592 } 4593 4594 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4595 4596 return 0; 4597 } 4598 4599 static int 4600 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4601 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4602 { 4603 struct spdk_bdev_alias *tmp; 4604 4605 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4606 if (strcmp(alias, tmp->alias.name) == 0) { 4607 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4608 alias_del_fn(&tmp->alias); 4609 free(tmp); 4610 return 0; 4611 } 4612 } 4613 4614 return -ENOENT; 4615 } 4616 4617 int 4618 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4619 { 4620 int rc; 4621 4622 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4623 if (rc == -ENOENT) { 4624 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4625 } 4626 4627 return rc; 4628 } 4629 4630 void 4631 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4632 { 4633 struct spdk_bdev_alias *p, *tmp; 4634 4635 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4636 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4637 bdev_name_del(&p->alias); 4638 free(p); 4639 } 4640 } 4641 4642 struct spdk_io_channel * 4643 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4644 { 4645 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4646 } 4647 4648 void * 4649 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4650 { 4651 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4652 void *ctx = NULL; 4653 4654 if (bdev->fn_table->get_module_ctx) { 4655 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4656 } 4657 4658 return ctx; 4659 } 4660 4661 const char * 4662 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4663 { 4664 return bdev->module->name; 4665 } 4666 4667 const char * 4668 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4669 { 4670 return bdev->name; 4671 } 4672 4673 const char * 4674 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4675 { 4676 return bdev->product_name; 4677 } 4678 4679 
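/*
 * Illustrative use of the descriptor helpers and getters in this area.  This is
 * a sketch only and is not compiled as part of this file; the bdev name "Malloc0"
 * and the event callback my_event_cb() are assumptions supplied by the application.
 *
 *	struct spdk_bdev_desc *desc;
 *	struct spdk_io_channel *io_ch;
 *	struct spdk_bdev *bdev;
 *
 *	if (spdk_bdev_open_ext("Malloc0", false, my_event_cb, NULL, &desc) == 0) {
 *		bdev = spdk_bdev_desc_get_bdev(desc);
 *		SPDK_NOTICELOG("%s (%s): %u-byte blocks, %" PRIu64 " blocks\n",
 *			       spdk_bdev_get_name(bdev),
 *			       spdk_bdev_get_product_name(bdev),
 *			       spdk_bdev_get_block_size(bdev),
 *			       spdk_bdev_get_num_blocks(bdev));
 *		io_ch = spdk_bdev_get_io_channel(desc);
 *		... submit I/O on io_ch ...
 *		spdk_put_io_channel(io_ch);
 *		spdk_bdev_close(desc);
 *	}
 */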
const struct spdk_bdev_aliases_list * 4680 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4681 { 4682 return &bdev->aliases; 4683 } 4684 4685 uint32_t 4686 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4687 { 4688 return bdev->blocklen; 4689 } 4690 4691 uint32_t 4692 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4693 { 4694 return bdev->write_unit_size; 4695 } 4696 4697 uint64_t 4698 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4699 { 4700 return bdev->blockcnt; 4701 } 4702 4703 const char * 4704 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4705 { 4706 return qos_rpc_type[type]; 4707 } 4708 4709 void 4710 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4711 { 4712 int i; 4713 4714 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4715 4716 spdk_spin_lock(&bdev->internal.spinlock); 4717 if (bdev->internal.qos) { 4718 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4719 if (bdev->internal.qos->rate_limits[i].limit != 4720 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4721 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4722 if (bdev_qos_is_iops_rate_limit(i) == false) { 4723 /* Change from Byte to Megabyte which is user visible. */ 4724 limits[i] = limits[i] / 1024 / 1024; 4725 } 4726 } 4727 } 4728 } 4729 spdk_spin_unlock(&bdev->internal.spinlock); 4730 } 4731 4732 size_t 4733 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4734 { 4735 return 1 << bdev->required_alignment; 4736 } 4737 4738 uint32_t 4739 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4740 { 4741 return bdev->optimal_io_boundary; 4742 } 4743 4744 bool 4745 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4746 { 4747 return bdev->write_cache; 4748 } 4749 4750 const struct spdk_uuid * 4751 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4752 { 4753 return &bdev->uuid; 4754 } 4755 4756 uint16_t 4757 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4758 { 4759 return bdev->acwu; 4760 } 4761 4762 uint32_t 4763 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4764 { 4765 return bdev->md_len; 4766 } 4767 4768 bool 4769 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4770 { 4771 return (bdev->md_len != 0) && bdev->md_interleave; 4772 } 4773 4774 bool 4775 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4776 { 4777 return (bdev->md_len != 0) && !bdev->md_interleave; 4778 } 4779 4780 bool 4781 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4782 { 4783 return bdev->zoned; 4784 } 4785 4786 uint32_t 4787 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4788 { 4789 if (spdk_bdev_is_md_interleaved(bdev)) { 4790 return bdev->blocklen - bdev->md_len; 4791 } else { 4792 return bdev->blocklen; 4793 } 4794 } 4795 4796 uint32_t 4797 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4798 { 4799 return bdev->phys_blocklen; 4800 } 4801 4802 static uint32_t 4803 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4804 { 4805 if (!spdk_bdev_is_md_interleaved(bdev)) { 4806 return bdev->blocklen + bdev->md_len; 4807 } else { 4808 return bdev->blocklen; 4809 } 4810 } 4811 4812 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4813 typedef enum spdk_dif_type spdk_dif_type_t; 4814 4815 spdk_dif_type_t 4816 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4817 { 4818 if (bdev->md_len != 0) { 4819 return bdev->dif_type; 4820 } else { 4821 return SPDK_DIF_DISABLE; 4822 } 4823 } 4824 4825 bool 4826 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4827 { 4828 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4829 return bdev->dif_is_head_of_md; 4830 } else { 4831 return false; 4832 } 4833 } 4834 4835 bool 4836 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4837 enum spdk_dif_check_type check_type) 4838 { 4839 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4840 return false; 4841 } 4842 4843 switch (check_type) { 4844 case SPDK_DIF_CHECK_TYPE_REFTAG: 4845 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4846 case SPDK_DIF_CHECK_TYPE_APPTAG: 4847 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4848 case SPDK_DIF_CHECK_TYPE_GUARD: 4849 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4850 default: 4851 return false; 4852 } 4853 } 4854 4855 static uint32_t 4856 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4857 { 4858 uint64_t aligned_length, max_write_blocks; 4859 4860 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4861 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4862 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4863 4864 return max_write_blocks; 4865 } 4866 4867 uint32_t 4868 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4869 { 4870 return bdev->max_copy; 4871 } 4872 4873 uint64_t 4874 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4875 { 4876 return bdev->internal.measured_queue_depth; 4877 } 4878 4879 uint64_t 4880 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4881 { 4882 return bdev->internal.period; 4883 } 4884 4885 uint64_t 4886 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4887 { 4888 return bdev->internal.weighted_io_time; 4889 } 4890 4891 uint64_t 4892 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4893 { 4894 return bdev->internal.io_time; 4895 } 4896 4897 static void bdev_update_qd_sampling_period(void *ctx); 4898 4899 static void 4900 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4901 { 4902 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4903 4904 if (bdev->internal.measured_queue_depth) { 4905 bdev->internal.io_time += bdev->internal.period; 4906 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4907 } 4908 4909 bdev->internal.qd_poll_in_progress = false; 4910 4911 bdev_update_qd_sampling_period(bdev); 4912 } 4913 4914 static void 4915 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4916 struct spdk_io_channel *io_ch, void *_ctx) 4917 { 4918 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4919 4920 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4921 spdk_bdev_for_each_channel_continue(i, 0); 4922 } 4923 4924 static int 4925 bdev_calculate_measured_queue_depth(void *ctx) 4926 { 4927 struct spdk_bdev *bdev = ctx; 4928 4929 bdev->internal.qd_poll_in_progress = true; 4930 bdev->internal.temporary_queue_depth = 0; 4931 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4932 return SPDK_POLLER_BUSY; 4933 } 4934 4935 static void 4936 bdev_update_qd_sampling_period(void *ctx) 4937 { 4938 
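	/*
	 * Apply a pending change to the queue depth sampling period.  The update is
	 * deferred while a sampling pass is still in progress (the completion callback
	 * calls back into this function); a new period of 0 unregisters the poller and
	 * closes the internal QD descriptor.
	 */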
struct spdk_bdev *bdev = ctx; 4939 4940 if (bdev->internal.period == bdev->internal.new_period) { 4941 return; 4942 } 4943 4944 if (bdev->internal.qd_poll_in_progress) { 4945 return; 4946 } 4947 4948 bdev->internal.period = bdev->internal.new_period; 4949 4950 spdk_poller_unregister(&bdev->internal.qd_poller); 4951 if (bdev->internal.period != 0) { 4952 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4953 bdev, bdev->internal.period); 4954 } else { 4955 spdk_bdev_close(bdev->internal.qd_desc); 4956 bdev->internal.qd_desc = NULL; 4957 } 4958 } 4959 4960 static void 4961 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4962 { 4963 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4964 } 4965 4966 void 4967 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4968 { 4969 int rc; 4970 4971 if (bdev->internal.new_period == period) { 4972 return; 4973 } 4974 4975 bdev->internal.new_period = period; 4976 4977 if (bdev->internal.qd_desc != NULL) { 4978 assert(bdev->internal.period != 0); 4979 4980 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4981 bdev_update_qd_sampling_period, bdev); 4982 return; 4983 } 4984 4985 assert(bdev->internal.period == 0); 4986 4987 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4988 NULL, &bdev->internal.qd_desc); 4989 if (rc != 0) { 4990 return; 4991 } 4992 4993 bdev->internal.period = period; 4994 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4995 bdev, period); 4996 } 4997 4998 struct bdev_get_current_qd_ctx { 4999 uint64_t current_qd; 5000 spdk_bdev_get_current_qd_cb cb_fn; 5001 void *cb_arg; 5002 }; 5003 5004 static void 5005 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5006 { 5007 struct bdev_get_current_qd_ctx *ctx = _ctx; 5008 5009 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5010 5011 free(ctx); 5012 } 5013 5014 static void 5015 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5016 struct spdk_io_channel *io_ch, void *_ctx) 5017 { 5018 struct bdev_get_current_qd_ctx *ctx = _ctx; 5019 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5020 5021 ctx->current_qd += bdev_ch->io_outstanding; 5022 5023 spdk_bdev_for_each_channel_continue(i, 0); 5024 } 5025 5026 void 5027 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5028 void *cb_arg) 5029 { 5030 struct bdev_get_current_qd_ctx *ctx; 5031 5032 assert(cb_fn != NULL); 5033 5034 ctx = calloc(1, sizeof(*ctx)); 5035 if (ctx == NULL) { 5036 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5037 return; 5038 } 5039 5040 ctx->cb_fn = cb_fn; 5041 ctx->cb_arg = cb_arg; 5042 5043 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5044 } 5045 5046 static void 5047 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5048 { 5049 assert(desc->thread == spdk_get_thread()); 5050 5051 spdk_spin_lock(&desc->spinlock); 5052 desc->refs--; 5053 if (!desc->closed) { 5054 spdk_spin_unlock(&desc->spinlock); 5055 desc->callback.event_fn(type, 5056 desc->bdev, 5057 desc->callback.ctx); 5058 return; 5059 } else if (desc->refs == 0) { 5060 /* This descriptor was closed after this event_notify message was sent. 5061 * spdk_bdev_close() could not free the descriptor since this message was 5062 * in flight, so we free it now using bdev_desc_free(). 
5063 */ 5064 spdk_spin_unlock(&desc->spinlock); 5065 bdev_desc_free(desc); 5066 return; 5067 } 5068 spdk_spin_unlock(&desc->spinlock); 5069 } 5070 5071 static void 5072 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5073 { 5074 spdk_spin_lock(&desc->spinlock); 5075 desc->refs++; 5076 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5077 spdk_spin_unlock(&desc->spinlock); 5078 } 5079 5080 static void 5081 _resize_notify(void *ctx) 5082 { 5083 struct spdk_bdev_desc *desc = ctx; 5084 5085 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5086 } 5087 5088 int 5089 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5090 { 5091 struct spdk_bdev_desc *desc; 5092 int ret; 5093 5094 if (size == bdev->blockcnt) { 5095 return 0; 5096 } 5097 5098 spdk_spin_lock(&bdev->internal.spinlock); 5099 5100 /* bdev has open descriptors */ 5101 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5102 bdev->blockcnt > size) { 5103 ret = -EBUSY; 5104 } else { 5105 bdev->blockcnt = size; 5106 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5107 event_notify(desc, _resize_notify); 5108 } 5109 ret = 0; 5110 } 5111 5112 spdk_spin_unlock(&bdev->internal.spinlock); 5113 5114 return ret; 5115 } 5116 5117 /* 5118 * Convert I/O offset and length from bytes to blocks. 5119 * 5120 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5121 */ 5122 static uint64_t 5123 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 5124 uint64_t num_bytes, uint64_t *num_blocks) 5125 { 5126 uint32_t block_size = bdev->blocklen; 5127 uint8_t shift_cnt; 5128 5129 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 5130 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5131 shift_cnt = spdk_u32log2(block_size); 5132 *offset_blocks = offset_bytes >> shift_cnt; 5133 *num_blocks = num_bytes >> shift_cnt; 5134 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5135 (num_bytes - (*num_blocks << shift_cnt)); 5136 } else { 5137 *offset_blocks = offset_bytes / block_size; 5138 *num_blocks = num_bytes / block_size; 5139 return (offset_bytes % block_size) | (num_bytes % block_size); 5140 } 5141 } 5142 5143 static bool 5144 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5145 { 5146 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5147 * has been an overflow and hence the offset has been wrapped around */ 5148 if (offset_blocks + num_blocks < offset_blocks) { 5149 return false; 5150 } 5151 5152 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5153 if (offset_blocks + num_blocks > bdev->blockcnt) { 5154 return false; 5155 } 5156 5157 return true; 5158 } 5159 5160 static void 5161 bdev_seek_complete_cb(void *ctx) 5162 { 5163 struct spdk_bdev_io *bdev_io = ctx; 5164 5165 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5166 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5167 } 5168 5169 static int 5170 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5171 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5172 spdk_bdev_io_completion_cb cb, void *cb_arg) 5173 { 5174 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5175 struct spdk_bdev_io *bdev_io; 5176 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5177 5178 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == 
SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5179 5180 /* Check if offset_blocks is valid looking at the validity of one block */ 5181 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5182 return -EINVAL; 5183 } 5184 5185 bdev_io = bdev_channel_get_io(channel); 5186 if (!bdev_io) { 5187 return -ENOMEM; 5188 } 5189 5190 bdev_io->internal.ch = channel; 5191 bdev_io->internal.desc = desc; 5192 bdev_io->type = io_type; 5193 bdev_io->u.bdev.offset_blocks = offset_blocks; 5194 bdev_io->u.bdev.memory_domain = NULL; 5195 bdev_io->u.bdev.memory_domain_ctx = NULL; 5196 bdev_io->u.bdev.accel_sequence = NULL; 5197 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5198 5199 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5200 /* In case bdev doesn't support seek to next data/hole offset, 5201 * it is assumed that only data and no holes are present */ 5202 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5203 bdev_io->u.bdev.seek.offset = offset_blocks; 5204 } else { 5205 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5206 } 5207 5208 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5209 return 0; 5210 } 5211 5212 bdev_io_submit(bdev_io); 5213 return 0; 5214 } 5215 5216 int 5217 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5218 uint64_t offset_blocks, 5219 spdk_bdev_io_completion_cb cb, void *cb_arg) 5220 { 5221 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5222 } 5223 5224 int 5225 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5226 uint64_t offset_blocks, 5227 spdk_bdev_io_completion_cb cb, void *cb_arg) 5228 { 5229 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5230 } 5231 5232 uint64_t 5233 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 5234 { 5235 return bdev_io->u.bdev.seek.offset; 5236 } 5237 5238 static int 5239 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5240 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5241 spdk_bdev_io_completion_cb cb, void *cb_arg) 5242 { 5243 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5244 struct spdk_bdev_io *bdev_io; 5245 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5246 5247 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5248 return -EINVAL; 5249 } 5250 5251 bdev_io = bdev_channel_get_io(channel); 5252 if (!bdev_io) { 5253 return -ENOMEM; 5254 } 5255 5256 bdev_io->internal.ch = channel; 5257 bdev_io->internal.desc = desc; 5258 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5259 bdev_io->u.bdev.iovs = &bdev_io->iov; 5260 bdev_io->u.bdev.iovs[0].iov_base = buf; 5261 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5262 bdev_io->u.bdev.iovcnt = 1; 5263 bdev_io->u.bdev.md_buf = md_buf; 5264 bdev_io->u.bdev.num_blocks = num_blocks; 5265 bdev_io->u.bdev.offset_blocks = offset_blocks; 5266 bdev_io->u.bdev.memory_domain = NULL; 5267 bdev_io->u.bdev.memory_domain_ctx = NULL; 5268 bdev_io->u.bdev.accel_sequence = NULL; 5269 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5270 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5271 5272 bdev_io_submit(bdev_io); 5273 return 0; 5274 } 5275 5276 int 5277 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5278 void *buf, uint64_t offset, uint64_t nbytes, 5279 spdk_bdev_io_completion_cb cb, void *cb_arg) 5280 { 5281 uint64_t offset_blocks, num_blocks; 5282 5283 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5284 nbytes, 
&num_blocks) != 0) { 5285 return -EINVAL; 5286 } 5287 5288 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5289 } 5290 5291 int 5292 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5293 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5294 spdk_bdev_io_completion_cb cb, void *cb_arg) 5295 { 5296 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5297 } 5298 5299 int 5300 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5301 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5302 spdk_bdev_io_completion_cb cb, void *cb_arg) 5303 { 5304 struct iovec iov = { 5305 .iov_base = buf, 5306 }; 5307 5308 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5309 return -EINVAL; 5310 } 5311 5312 if (md_buf && !_is_buf_allocated(&iov)) { 5313 return -EINVAL; 5314 } 5315 5316 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5317 cb, cb_arg); 5318 } 5319 5320 int 5321 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5322 struct iovec *iov, int iovcnt, 5323 uint64_t offset, uint64_t nbytes, 5324 spdk_bdev_io_completion_cb cb, void *cb_arg) 5325 { 5326 uint64_t offset_blocks, num_blocks; 5327 5328 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5329 nbytes, &num_blocks) != 0) { 5330 return -EINVAL; 5331 } 5332 5333 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5334 } 5335 5336 static int 5337 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5338 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5339 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5340 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5341 spdk_bdev_io_completion_cb cb, void *cb_arg) 5342 { 5343 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5344 struct spdk_bdev_io *bdev_io; 5345 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5346 5347 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5348 return -EINVAL; 5349 } 5350 5351 bdev_io = bdev_channel_get_io(channel); 5352 if (spdk_unlikely(!bdev_io)) { 5353 return -ENOMEM; 5354 } 5355 5356 bdev_io->internal.ch = channel; 5357 bdev_io->internal.desc = desc; 5358 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5359 bdev_io->u.bdev.iovs = iov; 5360 bdev_io->u.bdev.iovcnt = iovcnt; 5361 bdev_io->u.bdev.md_buf = md_buf; 5362 bdev_io->u.bdev.num_blocks = num_blocks; 5363 bdev_io->u.bdev.offset_blocks = offset_blocks; 5364 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5365 bdev_io->internal.memory_domain = domain; 5366 bdev_io->internal.memory_domain_ctx = domain_ctx; 5367 bdev_io->internal.accel_sequence = seq; 5368 bdev_io->internal.has_accel_sequence = seq != NULL; 5369 bdev_io->u.bdev.memory_domain = domain; 5370 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5371 bdev_io->u.bdev.accel_sequence = seq; 5372 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5373 5374 _bdev_io_submit_ext(desc, bdev_io); 5375 5376 return 0; 5377 } 5378 5379 int 5380 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5381 struct iovec *iov, int iovcnt, 5382 uint64_t offset_blocks, uint64_t num_blocks, 5383 spdk_bdev_io_completion_cb cb, void *cb_arg) 5384 { 5385 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5386 5387 return 
bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5388 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5389 } 5390 5391 int 5392 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5393 struct iovec *iov, int iovcnt, void *md_buf, 5394 uint64_t offset_blocks, uint64_t num_blocks, 5395 spdk_bdev_io_completion_cb cb, void *cb_arg) 5396 { 5397 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5398 5399 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5400 return -EINVAL; 5401 } 5402 5403 if (md_buf && !_is_buf_allocated(iov)) { 5404 return -EINVAL; 5405 } 5406 5407 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5408 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5409 } 5410 5411 static inline bool 5412 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5413 { 5414 /* 5415 * We check if opts size is at least of size when we first introduced 5416 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5417 * are not checked internal. 5418 */ 5419 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5420 sizeof(opts->metadata) && 5421 opts->size <= sizeof(*opts) && 5422 /* When memory domain is used, the user must provide data buffers */ 5423 (!opts->memory_domain || (iov && iov[0].iov_base)); 5424 } 5425 5426 int 5427 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5428 struct iovec *iov, int iovcnt, 5429 uint64_t offset_blocks, uint64_t num_blocks, 5430 spdk_bdev_io_completion_cb cb, void *cb_arg, 5431 struct spdk_bdev_ext_io_opts *opts) 5432 { 5433 struct spdk_memory_domain *domain = NULL; 5434 struct spdk_accel_sequence *seq = NULL; 5435 void *domain_ctx = NULL, *md = NULL; 5436 uint32_t dif_check_flags = 0; 5437 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5438 5439 if (opts) { 5440 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5441 return -EINVAL; 5442 } 5443 5444 md = opts->metadata; 5445 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5446 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5447 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5448 if (md) { 5449 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5450 return -EINVAL; 5451 } 5452 5453 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5454 return -EINVAL; 5455 } 5456 5457 if (spdk_unlikely(seq != NULL)) { 5458 return -EINVAL; 5459 } 5460 } 5461 } 5462 5463 dif_check_flags = bdev->dif_check_flags & 5464 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5465 5466 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5467 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5468 } 5469 5470 static int 5471 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5472 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5473 spdk_bdev_io_completion_cb cb, void *cb_arg) 5474 { 5475 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5476 struct spdk_bdev_io *bdev_io; 5477 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5478 5479 if (!desc->write) { 5480 return -EBADF; 5481 } 5482 5483 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5484 return -EINVAL; 5485 } 5486 5487 bdev_io = bdev_channel_get_io(channel); 5488 if (!bdev_io) { 5489 return -ENOMEM; 5490 } 5491 5492 bdev_io->internal.ch = channel; 5493 bdev_io->internal.desc = desc; 5494 
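	/* Single-buffer write: describe the caller's buffer with the one-element iovec
	 * embedded in the bdev_io itself, so no separate iovec allocation is needed.
	 */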
bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5495 bdev_io->u.bdev.iovs = &bdev_io->iov; 5496 bdev_io->u.bdev.iovs[0].iov_base = buf; 5497 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5498 bdev_io->u.bdev.iovcnt = 1; 5499 bdev_io->u.bdev.md_buf = md_buf; 5500 bdev_io->u.bdev.num_blocks = num_blocks; 5501 bdev_io->u.bdev.offset_blocks = offset_blocks; 5502 bdev_io->u.bdev.memory_domain = NULL; 5503 bdev_io->u.bdev.memory_domain_ctx = NULL; 5504 bdev_io->u.bdev.accel_sequence = NULL; 5505 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5506 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5507 5508 bdev_io_submit(bdev_io); 5509 return 0; 5510 } 5511 5512 int 5513 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5514 void *buf, uint64_t offset, uint64_t nbytes, 5515 spdk_bdev_io_completion_cb cb, void *cb_arg) 5516 { 5517 uint64_t offset_blocks, num_blocks; 5518 5519 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5520 nbytes, &num_blocks) != 0) { 5521 return -EINVAL; 5522 } 5523 5524 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5525 } 5526 5527 int 5528 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5529 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5530 spdk_bdev_io_completion_cb cb, void *cb_arg) 5531 { 5532 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5533 cb, cb_arg); 5534 } 5535 5536 int 5537 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5538 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5539 spdk_bdev_io_completion_cb cb, void *cb_arg) 5540 { 5541 struct iovec iov = { 5542 .iov_base = buf, 5543 }; 5544 5545 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5546 return -EINVAL; 5547 } 5548 5549 if (md_buf && !_is_buf_allocated(&iov)) { 5550 return -EINVAL; 5551 } 5552 5553 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5554 cb, cb_arg); 5555 } 5556 5557 static int 5558 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5559 struct iovec *iov, int iovcnt, void *md_buf, 5560 uint64_t offset_blocks, uint64_t num_blocks, 5561 struct spdk_memory_domain *domain, void *domain_ctx, 5562 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5563 spdk_bdev_io_completion_cb cb, void *cb_arg) 5564 { 5565 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5566 struct spdk_bdev_io *bdev_io; 5567 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5568 5569 if (spdk_unlikely(!desc->write)) { 5570 return -EBADF; 5571 } 5572 5573 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5574 return -EINVAL; 5575 } 5576 5577 bdev_io = bdev_channel_get_io(channel); 5578 if (spdk_unlikely(!bdev_io)) { 5579 return -ENOMEM; 5580 } 5581 5582 bdev_io->internal.ch = channel; 5583 bdev_io->internal.desc = desc; 5584 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5585 bdev_io->u.bdev.iovs = iov; 5586 bdev_io->u.bdev.iovcnt = iovcnt; 5587 bdev_io->u.bdev.md_buf = md_buf; 5588 bdev_io->u.bdev.num_blocks = num_blocks; 5589 bdev_io->u.bdev.offset_blocks = offset_blocks; 5590 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5591 bdev_io->internal.memory_domain = domain; 5592 bdev_io->internal.memory_domain_ctx = domain_ctx; 5593 bdev_io->internal.accel_sequence = seq; 5594 bdev_io->internal.has_accel_sequence = seq != NULL; 5595 
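	/* The internal copies above drive the generic submission logic; mirror the
	 * memory domain and accel sequence into the public u.bdev fields as well so
	 * that the bdev module can see them.
	 */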
bdev_io->u.bdev.memory_domain = domain; 5596 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5597 bdev_io->u.bdev.accel_sequence = seq; 5598 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5599 5600 _bdev_io_submit_ext(desc, bdev_io); 5601 5602 return 0; 5603 } 5604 5605 int 5606 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5607 struct iovec *iov, int iovcnt, 5608 uint64_t offset, uint64_t len, 5609 spdk_bdev_io_completion_cb cb, void *cb_arg) 5610 { 5611 uint64_t offset_blocks, num_blocks; 5612 5613 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5614 len, &num_blocks) != 0) { 5615 return -EINVAL; 5616 } 5617 5618 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5619 } 5620 5621 int 5622 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5623 struct iovec *iov, int iovcnt, 5624 uint64_t offset_blocks, uint64_t num_blocks, 5625 spdk_bdev_io_completion_cb cb, void *cb_arg) 5626 { 5627 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5628 5629 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5630 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5631 } 5632 5633 int 5634 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5635 struct iovec *iov, int iovcnt, void *md_buf, 5636 uint64_t offset_blocks, uint64_t num_blocks, 5637 spdk_bdev_io_completion_cb cb, void *cb_arg) 5638 { 5639 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5640 5641 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5642 return -EINVAL; 5643 } 5644 5645 if (md_buf && !_is_buf_allocated(iov)) { 5646 return -EINVAL; 5647 } 5648 5649 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5650 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5651 } 5652 5653 int 5654 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5655 struct iovec *iov, int iovcnt, 5656 uint64_t offset_blocks, uint64_t num_blocks, 5657 spdk_bdev_io_completion_cb cb, void *cb_arg, 5658 struct spdk_bdev_ext_io_opts *opts) 5659 { 5660 struct spdk_memory_domain *domain = NULL; 5661 struct spdk_accel_sequence *seq = NULL; 5662 void *domain_ctx = NULL, *md = NULL; 5663 uint32_t dif_check_flags = 0; 5664 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5665 5666 if (opts) { 5667 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5668 return -EINVAL; 5669 } 5670 5671 md = opts->metadata; 5672 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5673 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5674 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5675 if (md) { 5676 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5677 return -EINVAL; 5678 } 5679 5680 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5681 return -EINVAL; 5682 } 5683 5684 if (spdk_unlikely(seq != NULL)) { 5685 return -EINVAL; 5686 } 5687 } 5688 } 5689 5690 dif_check_flags = bdev->dif_check_flags & 5691 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5692 5693 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5694 domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5695 } 5696 5697 static void 5698 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5699 { 5700 struct spdk_bdev_io *parent_io = cb_arg; 5701 struct spdk_bdev *bdev = parent_io->bdev; 5702 uint8_t 
*read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5703 int i, rc = 0; 5704 5705 if (!success) { 5706 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5707 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5708 spdk_bdev_free_io(bdev_io); 5709 return; 5710 } 5711 5712 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5713 rc = memcmp(read_buf, 5714 parent_io->u.bdev.iovs[i].iov_base, 5715 parent_io->u.bdev.iovs[i].iov_len); 5716 if (rc) { 5717 break; 5718 } 5719 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5720 } 5721 5722 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5723 rc = memcmp(bdev_io->u.bdev.md_buf, 5724 parent_io->u.bdev.md_buf, 5725 spdk_bdev_get_md_size(bdev)); 5726 } 5727 5728 spdk_bdev_free_io(bdev_io); 5729 5730 if (rc == 0) { 5731 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5732 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5733 } else { 5734 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5735 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5736 } 5737 } 5738 5739 static void 5740 bdev_compare_do_read(void *_bdev_io) 5741 { 5742 struct spdk_bdev_io *bdev_io = _bdev_io; 5743 int rc; 5744 5745 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5746 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5747 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5748 bdev_compare_do_read_done, bdev_io); 5749 5750 if (rc == -ENOMEM) { 5751 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5752 } else if (rc != 0) { 5753 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5754 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5755 } 5756 } 5757 5758 static int 5759 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5760 struct iovec *iov, int iovcnt, void *md_buf, 5761 uint64_t offset_blocks, uint64_t num_blocks, 5762 spdk_bdev_io_completion_cb cb, void *cb_arg) 5763 { 5764 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5765 struct spdk_bdev_io *bdev_io; 5766 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5767 5768 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5769 return -EINVAL; 5770 } 5771 5772 bdev_io = bdev_channel_get_io(channel); 5773 if (!bdev_io) { 5774 return -ENOMEM; 5775 } 5776 5777 bdev_io->internal.ch = channel; 5778 bdev_io->internal.desc = desc; 5779 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5780 bdev_io->u.bdev.iovs = iov; 5781 bdev_io->u.bdev.iovcnt = iovcnt; 5782 bdev_io->u.bdev.md_buf = md_buf; 5783 bdev_io->u.bdev.num_blocks = num_blocks; 5784 bdev_io->u.bdev.offset_blocks = offset_blocks; 5785 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5786 bdev_io->u.bdev.memory_domain = NULL; 5787 bdev_io->u.bdev.memory_domain_ctx = NULL; 5788 bdev_io->u.bdev.accel_sequence = NULL; 5789 5790 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5791 bdev_io_submit(bdev_io); 5792 return 0; 5793 } 5794 5795 bdev_compare_do_read(bdev_io); 5796 5797 return 0; 5798 } 5799 5800 int 5801 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5802 struct iovec *iov, int iovcnt, 5803 uint64_t offset_blocks, uint64_t num_blocks, 5804 spdk_bdev_io_completion_cb cb, void *cb_arg) 5805 { 5806 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5807 num_blocks, cb, cb_arg); 5808 } 5809 5810 int 5811 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct 
spdk_io_channel *ch, 5812 struct iovec *iov, int iovcnt, void *md_buf, 5813 uint64_t offset_blocks, uint64_t num_blocks, 5814 spdk_bdev_io_completion_cb cb, void *cb_arg) 5815 { 5816 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5817 return -EINVAL; 5818 } 5819 5820 if (md_buf && !_is_buf_allocated(iov)) { 5821 return -EINVAL; 5822 } 5823 5824 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5825 num_blocks, cb, cb_arg); 5826 } 5827 5828 static int 5829 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5830 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5831 spdk_bdev_io_completion_cb cb, void *cb_arg) 5832 { 5833 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5834 struct spdk_bdev_io *bdev_io; 5835 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5836 5837 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5838 return -EINVAL; 5839 } 5840 5841 bdev_io = bdev_channel_get_io(channel); 5842 if (!bdev_io) { 5843 return -ENOMEM; 5844 } 5845 5846 bdev_io->internal.ch = channel; 5847 bdev_io->internal.desc = desc; 5848 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5849 bdev_io->u.bdev.iovs = &bdev_io->iov; 5850 bdev_io->u.bdev.iovs[0].iov_base = buf; 5851 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5852 bdev_io->u.bdev.iovcnt = 1; 5853 bdev_io->u.bdev.md_buf = md_buf; 5854 bdev_io->u.bdev.num_blocks = num_blocks; 5855 bdev_io->u.bdev.offset_blocks = offset_blocks; 5856 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5857 bdev_io->u.bdev.memory_domain = NULL; 5858 bdev_io->u.bdev.memory_domain_ctx = NULL; 5859 bdev_io->u.bdev.accel_sequence = NULL; 5860 5861 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5862 bdev_io_submit(bdev_io); 5863 return 0; 5864 } 5865 5866 bdev_compare_do_read(bdev_io); 5867 5868 return 0; 5869 } 5870 5871 int 5872 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5873 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5874 spdk_bdev_io_completion_cb cb, void *cb_arg) 5875 { 5876 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5877 cb, cb_arg); 5878 } 5879 5880 int 5881 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5882 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5883 spdk_bdev_io_completion_cb cb, void *cb_arg) 5884 { 5885 struct iovec iov = { 5886 .iov_base = buf, 5887 }; 5888 5889 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5890 return -EINVAL; 5891 } 5892 5893 if (md_buf && !_is_buf_allocated(&iov)) { 5894 return -EINVAL; 5895 } 5896 5897 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5898 cb, cb_arg); 5899 } 5900 5901 static void 5902 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 5903 { 5904 struct spdk_bdev_io *bdev_io = ctx; 5905 5906 if (unlock_status) { 5907 SPDK_ERRLOG("LBA range unlock failed\n"); 5908 } 5909 5910 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
true : 5911 false, bdev_io->internal.caller_ctx); 5912 } 5913 5914 static void 5915 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5916 { 5917 bdev_io->internal.status = status; 5918 5919 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5920 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5921 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5922 } 5923 5924 static void 5925 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5926 { 5927 struct spdk_bdev_io *parent_io = cb_arg; 5928 5929 if (!success) { 5930 SPDK_ERRLOG("Compare and write operation failed\n"); 5931 } 5932 5933 spdk_bdev_free_io(bdev_io); 5934 5935 bdev_comparev_and_writev_blocks_unlock(parent_io, 5936 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5937 } 5938 5939 static void 5940 bdev_compare_and_write_do_write(void *_bdev_io) 5941 { 5942 struct spdk_bdev_io *bdev_io = _bdev_io; 5943 int rc; 5944 5945 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5946 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5947 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5948 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5949 bdev_compare_and_write_do_write_done, bdev_io); 5950 5951 5952 if (rc == -ENOMEM) { 5953 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5954 } else if (rc != 0) { 5955 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5956 } 5957 } 5958 5959 static void 5960 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5961 { 5962 struct spdk_bdev_io *parent_io = cb_arg; 5963 5964 spdk_bdev_free_io(bdev_io); 5965 5966 if (!success) { 5967 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5968 return; 5969 } 5970 5971 bdev_compare_and_write_do_write(parent_io); 5972 } 5973 5974 static void 5975 bdev_compare_and_write_do_compare(void *_bdev_io) 5976 { 5977 struct spdk_bdev_io *bdev_io = _bdev_io; 5978 int rc; 5979 5980 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5981 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5982 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5983 bdev_compare_and_write_do_compare_done, bdev_io); 5984 5985 if (rc == -ENOMEM) { 5986 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5987 } else if (rc != 0) { 5988 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5989 } 5990 } 5991 5992 static void 5993 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 5994 { 5995 struct spdk_bdev_io *bdev_io = ctx; 5996 5997 if (status) { 5998 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5999 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6000 return; 6001 } 6002 6003 bdev_compare_and_write_do_compare(bdev_io); 6004 } 6005 6006 int 6007 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6008 struct iovec *compare_iov, int compare_iovcnt, 6009 struct iovec *write_iov, int write_iovcnt, 6010 uint64_t offset_blocks, uint64_t num_blocks, 6011 spdk_bdev_io_completion_cb cb, void *cb_arg) 6012 { 6013 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6014 struct spdk_bdev_io *bdev_io; 6015 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6016 6017 if 
(!desc->write) { 6018 return -EBADF; 6019 } 6020 6021 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6022 return -EINVAL; 6023 } 6024 6025 if (num_blocks > bdev->acwu) { 6026 return -EINVAL; 6027 } 6028 6029 bdev_io = bdev_channel_get_io(channel); 6030 if (!bdev_io) { 6031 return -ENOMEM; 6032 } 6033 6034 bdev_io->internal.ch = channel; 6035 bdev_io->internal.desc = desc; 6036 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6037 bdev_io->u.bdev.iovs = compare_iov; 6038 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6039 bdev_io->u.bdev.fused_iovs = write_iov; 6040 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6041 bdev_io->u.bdev.md_buf = NULL; 6042 bdev_io->u.bdev.num_blocks = num_blocks; 6043 bdev_io->u.bdev.offset_blocks = offset_blocks; 6044 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6045 bdev_io->u.bdev.memory_domain = NULL; 6046 bdev_io->u.bdev.memory_domain_ctx = NULL; 6047 bdev_io->u.bdev.accel_sequence = NULL; 6048 6049 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6050 bdev_io_submit(bdev_io); 6051 return 0; 6052 } 6053 6054 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6055 bdev_comparev_and_writev_blocks_locked, bdev_io); 6056 } 6057 6058 int 6059 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6060 struct iovec *iov, int iovcnt, 6061 uint64_t offset_blocks, uint64_t num_blocks, 6062 bool populate, 6063 spdk_bdev_io_completion_cb cb, void *cb_arg) 6064 { 6065 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6066 struct spdk_bdev_io *bdev_io; 6067 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6068 6069 if (!desc->write) { 6070 return -EBADF; 6071 } 6072 6073 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6074 return -EINVAL; 6075 } 6076 6077 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6078 return -ENOTSUP; 6079 } 6080 6081 bdev_io = bdev_channel_get_io(channel); 6082 if (!bdev_io) { 6083 return -ENOMEM; 6084 } 6085 6086 bdev_io->internal.ch = channel; 6087 bdev_io->internal.desc = desc; 6088 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6089 bdev_io->u.bdev.num_blocks = num_blocks; 6090 bdev_io->u.bdev.offset_blocks = offset_blocks; 6091 bdev_io->u.bdev.iovs = iov; 6092 bdev_io->u.bdev.iovcnt = iovcnt; 6093 bdev_io->u.bdev.md_buf = NULL; 6094 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6095 bdev_io->u.bdev.zcopy.commit = 0; 6096 bdev_io->u.bdev.zcopy.start = 1; 6097 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6098 bdev_io->u.bdev.memory_domain = NULL; 6099 bdev_io->u.bdev.memory_domain_ctx = NULL; 6100 bdev_io->u.bdev.accel_sequence = NULL; 6101 6102 bdev_io_submit(bdev_io); 6103 6104 return 0; 6105 } 6106 6107 int 6108 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6109 spdk_bdev_io_completion_cb cb, void *cb_arg) 6110 { 6111 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6112 return -EINVAL; 6113 } 6114 6115 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 6116 bdev_io->u.bdev.zcopy.start = 0; 6117 bdev_io->internal.caller_ctx = cb_arg; 6118 bdev_io->internal.cb = cb; 6119 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 6120 6121 bdev_io_submit(bdev_io); 6122 6123 return 0; 6124 } 6125 6126 int 6127 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6128 uint64_t offset, uint64_t len, 6129 spdk_bdev_io_completion_cb cb, void *cb_arg) 6130 { 6131 uint64_t offset_blocks, num_blocks; 6132 6133 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6134 len, &num_blocks) != 0) { 6135 return -EINVAL; 6136 } 6137 6138 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6139 } 6140 6141 int 6142 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6143 uint64_t offset_blocks, uint64_t num_blocks, 6144 spdk_bdev_io_completion_cb cb, void *cb_arg) 6145 { 6146 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6147 struct spdk_bdev_io *bdev_io; 6148 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6149 6150 if (!desc->write) { 6151 return -EBADF; 6152 } 6153 6154 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6155 return -EINVAL; 6156 } 6157 6158 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 6159 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 6160 return -ENOTSUP; 6161 } 6162 6163 bdev_io = bdev_channel_get_io(channel); 6164 6165 if (!bdev_io) { 6166 return -ENOMEM; 6167 } 6168 6169 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 6170 bdev_io->internal.ch = channel; 6171 bdev_io->internal.desc = desc; 6172 bdev_io->u.bdev.offset_blocks = offset_blocks; 6173 bdev_io->u.bdev.num_blocks = num_blocks; 6174 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6175 bdev_io->u.bdev.memory_domain = NULL; 6176 bdev_io->u.bdev.memory_domain_ctx = NULL; 6177 bdev_io->u.bdev.accel_sequence = NULL; 6178 6179 /* If the write_zeroes size is large and should be split, use the generic split 6180 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not. 6181 * 6182 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 6183 * or emulate it using a regular write request otherwise.
6184 */ 6185 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6186 bdev_io->internal.split) { 6187 bdev_io_submit(bdev_io); 6188 return 0; 6189 } 6190 6191 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6192 6193 return bdev_write_zero_buffer(bdev_io); 6194 } 6195 6196 int 6197 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6198 uint64_t offset, uint64_t nbytes, 6199 spdk_bdev_io_completion_cb cb, void *cb_arg) 6200 { 6201 uint64_t offset_blocks, num_blocks; 6202 6203 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6204 nbytes, &num_blocks) != 0) { 6205 return -EINVAL; 6206 } 6207 6208 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6209 } 6210 6211 static void 6212 bdev_io_complete_cb(void *ctx) 6213 { 6214 struct spdk_bdev_io *bdev_io = ctx; 6215 6216 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6217 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6218 } 6219 6220 int 6221 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6222 uint64_t offset_blocks, uint64_t num_blocks, 6223 spdk_bdev_io_completion_cb cb, void *cb_arg) 6224 { 6225 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6226 struct spdk_bdev_io *bdev_io; 6227 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6228 6229 if (!desc->write) { 6230 return -EBADF; 6231 } 6232 6233 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6234 return -EINVAL; 6235 } 6236 6237 bdev_io = bdev_channel_get_io(channel); 6238 if (!bdev_io) { 6239 return -ENOMEM; 6240 } 6241 6242 bdev_io->internal.ch = channel; 6243 bdev_io->internal.desc = desc; 6244 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6245 6246 bdev_io->u.bdev.iovs = &bdev_io->iov; 6247 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6248 bdev_io->u.bdev.iovs[0].iov_len = 0; 6249 bdev_io->u.bdev.iovcnt = 1; 6250 6251 bdev_io->u.bdev.offset_blocks = offset_blocks; 6252 bdev_io->u.bdev.num_blocks = num_blocks; 6253 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6254 bdev_io->u.bdev.memory_domain = NULL; 6255 bdev_io->u.bdev.memory_domain_ctx = NULL; 6256 bdev_io->u.bdev.accel_sequence = NULL; 6257 6258 if (num_blocks == 0) { 6259 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6260 return 0; 6261 } 6262 6263 bdev_io_submit(bdev_io); 6264 return 0; 6265 } 6266 6267 int 6268 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6269 uint64_t offset, uint64_t length, 6270 spdk_bdev_io_completion_cb cb, void *cb_arg) 6271 { 6272 uint64_t offset_blocks, num_blocks; 6273 6274 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6275 length, &num_blocks) != 0) { 6276 return -EINVAL; 6277 } 6278 6279 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6280 } 6281 6282 int 6283 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6284 uint64_t offset_blocks, uint64_t num_blocks, 6285 spdk_bdev_io_completion_cb cb, void *cb_arg) 6286 { 6287 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6288 struct spdk_bdev_io *bdev_io; 6289 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6290 6291 if (!desc->write) { 6292 return -EBADF; 6293 } 6294 6295 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6296 return -EINVAL; 6297 } 6298 6299 bdev_io = bdev_channel_get_io(channel); 6300 if (!bdev_io) { 6301 return -ENOMEM; 6302 } 6303 6304 bdev_io->internal.ch = channel; 
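/*
 * Editor's note: an illustrative sketch, not part of bdev.c. The byte-based wrappers
 * above (spdk_bdev_unmap(), spdk_bdev_flush(), spdk_bdev_write_zeroes()) only succeed
 * when the offset and length are exact multiples of the block size; otherwise
 * bdev_bytes_to_blocks() rejects them and the wrappers return -EINVAL. A caller that
 * works in bytes can perform the conversion itself and call the _blocks variants
 * directly. The helper name below is hypothetical.
 *
 *   static int unmap_bytes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
 *                          uint64_t offset, uint64_t len,
 *                          spdk_bdev_io_completion_cb cb, void *cb_arg)
 *   {
 *           struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
 *           uint32_t block_size = spdk_bdev_get_block_size(bdev);
 *
 *           if ((offset % block_size) != 0 || (len % block_size) != 0) {
 *                   // Unaligned ranges must be handled by the caller, e.g. split into
 *                   // an aligned unmap plus explicit writes of zeroes for the edges.
 *                   return -EINVAL;
 *           }
 *
 *           return spdk_bdev_unmap_blocks(desc, ch, offset / block_size,
 *                                         len / block_size, cb, cb_arg);
 *   }
 */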
6305 bdev_io->internal.desc = desc; 6306 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6307 bdev_io->u.bdev.iovs = NULL; 6308 bdev_io->u.bdev.iovcnt = 0; 6309 bdev_io->u.bdev.offset_blocks = offset_blocks; 6310 bdev_io->u.bdev.num_blocks = num_blocks; 6311 bdev_io->u.bdev.memory_domain = NULL; 6312 bdev_io->u.bdev.memory_domain_ctx = NULL; 6313 bdev_io->u.bdev.accel_sequence = NULL; 6314 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6315 6316 bdev_io_submit(bdev_io); 6317 return 0; 6318 } 6319 6320 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6321 6322 static void 6323 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6324 { 6325 struct spdk_bdev_channel *ch = _ctx; 6326 struct spdk_bdev_io *bdev_io; 6327 6328 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6329 6330 if (status == -EBUSY) { 6331 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6332 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6333 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6334 } else { 6335 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6336 6337 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6338 /* If outstanding IOs are still present and reset_io_drain_timeout 6339 * seconds passed, start the reset. */ 6340 bdev_io_submit_reset(bdev_io); 6341 } else { 6342 /* We still have an in-progress memory domain pull/push or we're 6343 * executing an accel sequence. Since we cannot abort either of those 6344 * operations, fail the reset request. */ 6345 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6346 } 6347 } 6348 } else { 6349 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6350 SPDK_DEBUGLOG(bdev, 6351 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6352 ch->bdev->name); 6353 /* Mark the completion status as SUCCESS and complete the reset. */ 6354 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6355 } 6356 } 6357 6358 static void 6359 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6360 struct spdk_io_channel *io_ch, void *_ctx) 6361 { 6362 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6363 int status = 0; 6364 6365 if (cur_ch->io_outstanding > 0 || 6366 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6367 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6368 /* If a channel has outstanding IO, set the status to -EBUSY. This will stop 6369 * further iteration over the rest of the channels and pass a non-zero status 6370 * to the callback function.
*/ 6371 status = -EBUSY; 6372 } 6373 spdk_bdev_for_each_channel_continue(i, status); 6374 } 6375 6376 static int 6377 bdev_reset_poll_for_outstanding_io(void *ctx) 6378 { 6379 struct spdk_bdev_channel *ch = ctx; 6380 struct spdk_bdev_io *bdev_io; 6381 6382 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6383 6384 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6385 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6386 bdev_reset_check_outstanding_io_done); 6387 6388 return SPDK_POLLER_BUSY; 6389 } 6390 6391 static void 6392 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6393 { 6394 struct spdk_bdev_channel *ch = _ctx; 6395 struct spdk_bdev_io *bdev_io; 6396 6397 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6398 6399 if (bdev->reset_io_drain_timeout == 0) { 6400 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6401 6402 bdev_io_submit_reset(bdev_io); 6403 return; 6404 } 6405 6406 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6407 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6408 6409 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6410 * submit the reset to the underlying module only if outstanding I/O 6411 * remain after reset_io_drain_timeout seconds have passed. */ 6412 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6413 bdev_reset_check_outstanding_io_done); 6414 } 6415 6416 static void 6417 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6418 struct spdk_io_channel *ch, void *_ctx) 6419 { 6420 struct spdk_bdev_channel *channel; 6421 struct spdk_bdev_mgmt_channel *mgmt_channel; 6422 struct spdk_bdev_shared_resource *shared_resource; 6423 bdev_io_tailq_t tmp_queued; 6424 6425 TAILQ_INIT(&tmp_queued); 6426 6427 channel = __io_ch_to_bdev_ch(ch); 6428 shared_resource = channel->shared_resource; 6429 mgmt_channel = shared_resource->mgmt_ch; 6430 6431 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6432 6433 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6434 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6435 } 6436 6437 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6438 bdev_abort_all_buf_io(mgmt_channel, channel); 6439 bdev_abort_all_queued_io(&tmp_queued, channel); 6440 6441 spdk_bdev_for_each_channel_continue(i, 0); 6442 } 6443 6444 static void 6445 bdev_start_reset(void *ctx) 6446 { 6447 struct spdk_bdev_channel *ch = ctx; 6448 6449 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6450 bdev_reset_freeze_channel_done); 6451 } 6452 6453 static void 6454 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6455 { 6456 struct spdk_bdev *bdev = ch->bdev; 6457 6458 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6459 6460 spdk_spin_lock(&bdev->internal.spinlock); 6461 if (bdev->internal.reset_in_progress == NULL) { 6462 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6463 /* 6464 * Take a channel reference for the target bdev for the life of this 6465 * reset. This guards against the channel getting destroyed while 6466 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6467 * progress. We will release the reference when this reset is 6468 * completed. 
6469 */ 6470 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6471 bdev_start_reset(ch); 6472 } 6473 spdk_spin_unlock(&bdev->internal.spinlock); 6474 } 6475 6476 int 6477 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6478 spdk_bdev_io_completion_cb cb, void *cb_arg) 6479 { 6480 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6481 struct spdk_bdev_io *bdev_io; 6482 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6483 6484 bdev_io = bdev_channel_get_io(channel); 6485 if (!bdev_io) { 6486 return -ENOMEM; 6487 } 6488 6489 bdev_io->internal.ch = channel; 6490 bdev_io->internal.desc = desc; 6491 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6492 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6493 bdev_io->u.reset.ch_ref = NULL; 6494 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6495 6496 spdk_spin_lock(&bdev->internal.spinlock); 6497 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6498 spdk_spin_unlock(&bdev->internal.spinlock); 6499 6500 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6501 internal.ch_link); 6502 6503 bdev_channel_start_reset(channel); 6504 6505 return 0; 6506 } 6507 6508 void 6509 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6510 struct spdk_bdev_io_stat *stat) 6511 { 6512 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6513 6514 bdev_get_io_stat(stat, channel->stat); 6515 } 6516 6517 static void 6518 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6519 { 6520 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6521 6522 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6523 bdev_iostat_ctx->cb_arg, 0); 6524 free(bdev_iostat_ctx); 6525 } 6526 6527 static void 6528 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6529 struct spdk_io_channel *ch, void *_ctx) 6530 { 6531 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6532 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6533 6534 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6535 spdk_bdev_for_each_channel_continue(i, 0); 6536 } 6537 6538 void 6539 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6540 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6541 { 6542 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6543 6544 assert(bdev != NULL); 6545 assert(stat != NULL); 6546 assert(cb != NULL); 6547 6548 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6549 if (bdev_iostat_ctx == NULL) { 6550 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6551 cb(bdev, stat, cb_arg, -ENOMEM); 6552 return; 6553 } 6554 6555 bdev_iostat_ctx->stat = stat; 6556 bdev_iostat_ctx->cb = cb; 6557 bdev_iostat_ctx->cb_arg = cb_arg; 6558 6559 /* Start with the statistics from previously deleted channels. */ 6560 spdk_spin_lock(&bdev->internal.spinlock); 6561 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6562 spdk_spin_unlock(&bdev->internal.spinlock); 6563 6564 /* Then iterate and add the statistics from each existing channel. 
*/ 6565 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6566 bdev_get_device_stat_done); 6567 } 6568 6569 struct bdev_iostat_reset_ctx { 6570 enum spdk_bdev_reset_stat_mode mode; 6571 bdev_reset_device_stat_cb cb; 6572 void *cb_arg; 6573 }; 6574 6575 static void 6576 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6577 { 6578 struct bdev_iostat_reset_ctx *ctx = _ctx; 6579 6580 ctx->cb(bdev, ctx->cb_arg, 0); 6581 6582 free(ctx); 6583 } 6584 6585 static void 6586 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6587 struct spdk_io_channel *ch, void *_ctx) 6588 { 6589 struct bdev_iostat_reset_ctx *ctx = _ctx; 6590 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6591 6592 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6593 6594 spdk_bdev_for_each_channel_continue(i, 0); 6595 } 6596 6597 void 6598 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6599 bdev_reset_device_stat_cb cb, void *cb_arg) 6600 { 6601 struct bdev_iostat_reset_ctx *ctx; 6602 6603 assert(bdev != NULL); 6604 assert(cb != NULL); 6605 6606 ctx = calloc(1, sizeof(*ctx)); 6607 if (ctx == NULL) { 6608 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6609 cb(bdev, cb_arg, -ENOMEM); 6610 return; 6611 } 6612 6613 ctx->mode = mode; 6614 ctx->cb = cb; 6615 ctx->cb_arg = cb_arg; 6616 6617 spdk_spin_lock(&bdev->internal.spinlock); 6618 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6619 spdk_spin_unlock(&bdev->internal.spinlock); 6620 6621 spdk_bdev_for_each_channel(bdev, 6622 bdev_reset_each_channel_stat, 6623 ctx, 6624 bdev_reset_device_stat_done); 6625 } 6626 6627 int 6628 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6629 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6630 spdk_bdev_io_completion_cb cb, void *cb_arg) 6631 { 6632 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6633 struct spdk_bdev_io *bdev_io; 6634 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6635 6636 if (!desc->write) { 6637 return -EBADF; 6638 } 6639 6640 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6641 return -ENOTSUP; 6642 } 6643 6644 bdev_io = bdev_channel_get_io(channel); 6645 if (!bdev_io) { 6646 return -ENOMEM; 6647 } 6648 6649 bdev_io->internal.ch = channel; 6650 bdev_io->internal.desc = desc; 6651 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6652 bdev_io->u.nvme_passthru.cmd = *cmd; 6653 bdev_io->u.nvme_passthru.buf = buf; 6654 bdev_io->u.nvme_passthru.nbytes = nbytes; 6655 bdev_io->u.nvme_passthru.md_buf = NULL; 6656 bdev_io->u.nvme_passthru.md_len = 0; 6657 6658 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6659 6660 bdev_io_submit(bdev_io); 6661 return 0; 6662 } 6663 6664 int 6665 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6666 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6667 spdk_bdev_io_completion_cb cb, void *cb_arg) 6668 { 6669 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6670 struct spdk_bdev_io *bdev_io; 6671 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6672 6673 if (!desc->write) { 6674 /* 6675 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6676 * to easily determine if the command is a read or write, but for now just 6677 * do not allow io_passthru with a read-only descriptor. 
6678 */ 6679 return -EBADF; 6680 } 6681 6682 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6683 return -ENOTSUP; 6684 } 6685 6686 bdev_io = bdev_channel_get_io(channel); 6687 if (!bdev_io) { 6688 return -ENOMEM; 6689 } 6690 6691 bdev_io->internal.ch = channel; 6692 bdev_io->internal.desc = desc; 6693 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6694 bdev_io->u.nvme_passthru.cmd = *cmd; 6695 bdev_io->u.nvme_passthru.buf = buf; 6696 bdev_io->u.nvme_passthru.nbytes = nbytes; 6697 bdev_io->u.nvme_passthru.md_buf = NULL; 6698 bdev_io->u.nvme_passthru.md_len = 0; 6699 6700 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6701 6702 bdev_io_submit(bdev_io); 6703 return 0; 6704 } 6705 6706 int 6707 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6708 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6709 spdk_bdev_io_completion_cb cb, void *cb_arg) 6710 { 6711 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6712 struct spdk_bdev_io *bdev_io; 6713 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6714 6715 if (!desc->write) { 6716 /* 6717 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6718 * to easily determine if the command is a read or write, but for now just 6719 * do not allow io_passthru with a read-only descriptor. 6720 */ 6721 return -EBADF; 6722 } 6723 6724 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6725 return -ENOTSUP; 6726 } 6727 6728 bdev_io = bdev_channel_get_io(channel); 6729 if (!bdev_io) { 6730 return -ENOMEM; 6731 } 6732 6733 bdev_io->internal.ch = channel; 6734 bdev_io->internal.desc = desc; 6735 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6736 bdev_io->u.nvme_passthru.cmd = *cmd; 6737 bdev_io->u.nvme_passthru.buf = buf; 6738 bdev_io->u.nvme_passthru.nbytes = nbytes; 6739 bdev_io->u.nvme_passthru.md_buf = md_buf; 6740 bdev_io->u.nvme_passthru.md_len = md_len; 6741 6742 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6743 6744 bdev_io_submit(bdev_io); 6745 return 0; 6746 } 6747 6748 int 6749 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 6750 struct spdk_io_channel *ch, 6751 const struct spdk_nvme_cmd *cmd, 6752 struct iovec *iov, int iovcnt, size_t nbytes, 6753 void *md_buf, size_t md_len, 6754 spdk_bdev_io_completion_cb cb, void *cb_arg) 6755 { 6756 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6757 struct spdk_bdev_io *bdev_io; 6758 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6759 6760 if (!desc->write) { 6761 /* 6762 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6763 * to easily determine if the command is a read or write, but for now just 6764 * do not allow io_passthru with a read-only descriptor. 
6765 */ 6766 return -EBADF; 6767 } 6768 6769 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6770 return -ENOTSUP; 6771 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6772 return -ENOTSUP; 6773 } 6774 6775 bdev_io = bdev_channel_get_io(channel); 6776 if (!bdev_io) { 6777 return -ENOMEM; 6778 } 6779 6780 bdev_io->internal.ch = channel; 6781 bdev_io->internal.desc = desc; 6782 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 6783 bdev_io->u.nvme_passthru.cmd = *cmd; 6784 bdev_io->u.nvme_passthru.iovs = iov; 6785 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 6786 bdev_io->u.nvme_passthru.nbytes = nbytes; 6787 bdev_io->u.nvme_passthru.md_buf = md_buf; 6788 bdev_io->u.nvme_passthru.md_len = md_len; 6789 6790 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6791 6792 bdev_io_submit(bdev_io); 6793 return 0; 6794 } 6795 6796 static void bdev_abort_retry(void *ctx); 6797 static void bdev_abort(struct spdk_bdev_io *parent_io); 6798 6799 static void 6800 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6801 { 6802 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6803 struct spdk_bdev_io *parent_io = cb_arg; 6804 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6805 6806 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6807 6808 spdk_bdev_free_io(bdev_io); 6809 6810 if (!success) { 6811 /* Check if the target I/O completed in the meantime. */ 6812 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6813 if (tmp_io == bio_to_abort) { 6814 break; 6815 } 6816 } 6817 6818 /* If the target I/O still exists, set the parent to failed. */ 6819 if (tmp_io != NULL) { 6820 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6821 } 6822 } 6823 6824 parent_io->u.bdev.split_outstanding--; 6825 if (parent_io->u.bdev.split_outstanding == 0) { 6826 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6827 bdev_abort_retry(parent_io); 6828 } else { 6829 bdev_io_complete(parent_io); 6830 } 6831 } 6832 } 6833 6834 static int 6835 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6836 struct spdk_bdev_io *bio_to_abort, 6837 spdk_bdev_io_completion_cb cb, void *cb_arg) 6838 { 6839 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6840 struct spdk_bdev_io *bdev_io; 6841 6842 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6843 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6844 /* TODO: Abort reset or abort request. */ 6845 return -ENOTSUP; 6846 } 6847 6848 bdev_io = bdev_channel_get_io(channel); 6849 if (bdev_io == NULL) { 6850 return -ENOMEM; 6851 } 6852 6853 bdev_io->internal.ch = channel; 6854 bdev_io->internal.desc = desc; 6855 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6856 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6857 6858 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6859 assert(bdev_io_should_split(bio_to_abort)); 6860 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6861 6862 /* Parent abort request is not submitted directly, but to manage its 6863 * execution add it to the submitted list here. 6864 */ 6865 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6866 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6867 6868 bdev_abort(bdev_io); 6869 6870 return 0; 6871 } 6872 6873 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6874 6875 /* Submit the abort request to the underlying bdev module. 
*/ 6876 bdev_io_submit(bdev_io); 6877 6878 return 0; 6879 } 6880 6881 static bool 6882 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6883 { 6884 struct spdk_bdev_io *iter; 6885 6886 TAILQ_FOREACH(iter, tailq, internal.link) { 6887 if (iter == bdev_io) { 6888 return true; 6889 } 6890 } 6891 6892 return false; 6893 } 6894 6895 static uint32_t 6896 _bdev_abort(struct spdk_bdev_io *parent_io) 6897 { 6898 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6899 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6900 void *bio_cb_arg; 6901 struct spdk_bdev_io *bio_to_abort; 6902 uint32_t matched_ios; 6903 int rc; 6904 6905 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6906 6907 /* matched_ios is returned and will be kept by the caller. 6908 * 6909 * This function is used in two cases: 1) the same cb_arg is used for 6910 * multiple I/Os; 2) a single large I/O is split into smaller ones. 6911 * Incrementing split_outstanding directly here may confuse readers, especially 6912 * in the 1st case. 6913 * 6914 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6915 * works as expected. 6916 */ 6917 matched_ios = 0; 6918 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6919 6920 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6921 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6922 continue; 6923 } 6924 6925 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6926 /* Any I/O which was submitted after this abort command should be excluded. */ 6927 continue; 6928 } 6929 6930 /* We can't abort a request that's being pushed/pulled or executed by accel */ 6931 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 6932 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 6933 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6934 break; 6935 } 6936 6937 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6938 if (rc != 0) { 6939 if (rc == -ENOMEM) { 6940 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6941 } else { 6942 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6943 } 6944 break; 6945 } 6946 matched_ios++; 6947 } 6948 6949 return matched_ios; 6950 } 6951 6952 static void 6953 bdev_abort_retry(void *ctx) 6954 { 6955 struct spdk_bdev_io *parent_io = ctx; 6956 uint32_t matched_ios; 6957 6958 matched_ios = _bdev_abort(parent_io); 6959 6960 if (matched_ios == 0) { 6961 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6962 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6963 } else { 6964 /* For a retry, the case where no target I/O was found is a success 6965 * because it means the target I/Os completed in the meantime. 6966 */ 6967 bdev_io_complete(parent_io); 6968 } 6969 return; 6970 } 6971 6972 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6973 parent_io->u.bdev.split_outstanding = matched_ios; 6974 } 6975 6976 static void 6977 bdev_abort(struct spdk_bdev_io *parent_io) 6978 { 6979 uint32_t matched_ios; 6980 6981 matched_ios = _bdev_abort(parent_io); 6982 6983 if (matched_ios == 0) { 6984 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6985 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6986 } else { 6987 /* The case where no target I/O was found is a failure.
*/ 6988 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6989 bdev_io_complete(parent_io); 6990 } 6991 return; 6992 } 6993 6994 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6995 parent_io->u.bdev.split_outstanding = matched_ios; 6996 } 6997 6998 int 6999 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7000 void *bio_cb_arg, 7001 spdk_bdev_io_completion_cb cb, void *cb_arg) 7002 { 7003 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7004 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7005 struct spdk_bdev_io *bdev_io; 7006 7007 if (bio_cb_arg == NULL) { 7008 return -EINVAL; 7009 } 7010 7011 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7012 return -ENOTSUP; 7013 } 7014 7015 bdev_io = bdev_channel_get_io(channel); 7016 if (bdev_io == NULL) { 7017 return -ENOMEM; 7018 } 7019 7020 bdev_io->internal.ch = channel; 7021 bdev_io->internal.desc = desc; 7022 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7023 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7024 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7025 7026 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7027 7028 /* Parent abort request is not submitted directly, but to manage its execution, 7029 * add it to the submitted list here. 7030 */ 7031 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 7032 7033 bdev_abort(bdev_io); 7034 7035 return 0; 7036 } 7037 7038 int 7039 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7040 struct spdk_bdev_io_wait_entry *entry) 7041 { 7042 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7043 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7044 7045 if (bdev != entry->bdev) { 7046 SPDK_ERRLOG("bdevs do not match\n"); 7047 return -EINVAL; 7048 } 7049 7050 if (mgmt_ch->per_thread_cache_count > 0) { 7051 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7052 return -EINVAL; 7053 } 7054 7055 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7056 return 0; 7057 } 7058 7059 static inline void 7060 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7061 { 7062 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7063 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7064 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7065 uint32_t blocklen = bdev_io->bdev->blocklen; 7066 7067 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7068 switch (bdev_io->type) { 7069 case SPDK_BDEV_IO_TYPE_READ: 7070 io_stat->bytes_read += num_blocks * blocklen; 7071 io_stat->num_read_ops++; 7072 io_stat->read_latency_ticks += tsc_diff; 7073 if (io_stat->max_read_latency_ticks < tsc_diff) { 7074 io_stat->max_read_latency_ticks = tsc_diff; 7075 } 7076 if (io_stat->min_read_latency_ticks > tsc_diff) { 7077 io_stat->min_read_latency_ticks = tsc_diff; 7078 } 7079 break; 7080 case SPDK_BDEV_IO_TYPE_WRITE: 7081 io_stat->bytes_written += num_blocks * blocklen; 7082 io_stat->num_write_ops++; 7083 io_stat->write_latency_ticks += tsc_diff; 7084 if (io_stat->max_write_latency_ticks < tsc_diff) { 7085 io_stat->max_write_latency_ticks = tsc_diff; 7086 } 7087 if (io_stat->min_write_latency_ticks > tsc_diff) { 7088 io_stat->min_write_latency_ticks = tsc_diff; 7089 } 7090 break; 7091 case SPDK_BDEV_IO_TYPE_UNMAP: 7092 io_stat->bytes_unmapped += num_blocks * blocklen; 7093 io_stat->num_unmap_ops++; 7094 io_stat->unmap_latency_ticks += tsc_diff; 7095 if 
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7096 io_stat->max_unmap_latency_ticks = tsc_diff; 7097 } 7098 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7099 io_stat->min_unmap_latency_ticks = tsc_diff; 7100 } 7101 break; 7102 case SPDK_BDEV_IO_TYPE_ZCOPY: 7103 /* Track the data in the start phase only */ 7104 if (bdev_io->u.bdev.zcopy.start) { 7105 if (bdev_io->u.bdev.zcopy.populate) { 7106 io_stat->bytes_read += num_blocks * blocklen; 7107 io_stat->num_read_ops++; 7108 io_stat->read_latency_ticks += tsc_diff; 7109 if (io_stat->max_read_latency_ticks < tsc_diff) { 7110 io_stat->max_read_latency_ticks = tsc_diff; 7111 } 7112 if (io_stat->min_read_latency_ticks > tsc_diff) { 7113 io_stat->min_read_latency_ticks = tsc_diff; 7114 } 7115 } else { 7116 io_stat->bytes_written += num_blocks * blocklen; 7117 io_stat->num_write_ops++; 7118 io_stat->write_latency_ticks += tsc_diff; 7119 if (io_stat->max_write_latency_ticks < tsc_diff) { 7120 io_stat->max_write_latency_ticks = tsc_diff; 7121 } 7122 if (io_stat->min_write_latency_ticks > tsc_diff) { 7123 io_stat->min_write_latency_ticks = tsc_diff; 7124 } 7125 } 7126 } 7127 break; 7128 case SPDK_BDEV_IO_TYPE_COPY: 7129 io_stat->bytes_copied += num_blocks * blocklen; 7130 io_stat->num_copy_ops++; 7131 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7132 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7133 io_stat->max_copy_latency_ticks = tsc_diff; 7134 } 7135 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7136 io_stat->min_copy_latency_ticks = tsc_diff; 7137 } 7138 break; 7139 default: 7140 break; 7141 } 7142 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7143 io_stat = bdev_io->bdev->internal.stat; 7144 assert(io_stat->io_error != NULL); 7145 7146 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7147 io_stat->io_error->error_status[-io_status - 1]++; 7148 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7149 } 7150 7151 #ifdef SPDK_CONFIG_VTUNE 7152 uint64_t now_tsc = spdk_get_ticks(); 7153 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7154 uint64_t data[5]; 7155 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7156 7157 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7158 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7159 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7160 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7161 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
7162 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7163 7164 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7165 __itt_metadata_u64, 5, data); 7166 7167 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7168 bdev_io->internal.ch->start_tsc = now_tsc; 7169 } 7170 #endif 7171 } 7172 7173 static inline void 7174 _bdev_io_complete(void *ctx) 7175 { 7176 struct spdk_bdev_io *bdev_io = ctx; 7177 7178 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 7179 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7180 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7181 } 7182 7183 assert(bdev_io->internal.cb != NULL); 7184 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7185 7186 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7187 bdev_io->internal.caller_ctx); 7188 } 7189 7190 static inline void 7191 bdev_io_complete(void *ctx) 7192 { 7193 struct spdk_bdev_io *bdev_io = ctx; 7194 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7195 uint64_t tsc, tsc_diff; 7196 7197 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 7198 /* 7199 * Defer completion to avoid potential infinite recursion if the 7200 * user's completion callback issues a new I/O. 7201 */ 7202 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7203 bdev_io_complete, bdev_io); 7204 return; 7205 } 7206 7207 tsc = spdk_get_ticks(); 7208 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7209 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 7210 bdev_io->internal.caller_ctx); 7211 7212 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 7213 7214 if (bdev_io->internal.ch->histogram) { 7215 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 7216 } 7217 7218 bdev_io_update_io_stat(bdev_io, tsc_diff); 7219 _bdev_io_complete(bdev_io); 7220 } 7221 7222 /* The difference between this function and bdev_io_complete() is that this should be called to 7223 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7224 * io_submitted list and don't have submit_tsc updated. 7225 */ 7226 static inline void 7227 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7228 { 7229 /* Since the IO hasn't been submitted it's bound to be failed */ 7230 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7231 7232 /* At this point we don't know if the IO is completed from submission context or not, but, 7233 * since this is an error path, we can always do an spdk_thread_send_msg(). 
*/ 7234 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7235 _bdev_io_complete, bdev_io); 7236 } 7237 7238 static void bdev_destroy_cb(void *io_device); 7239 7240 static void 7241 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7242 { 7243 struct spdk_bdev_io *bdev_io = _ctx; 7244 7245 if (bdev_io->u.reset.ch_ref != NULL) { 7246 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7247 bdev_io->u.reset.ch_ref = NULL; 7248 } 7249 7250 bdev_io_complete(bdev_io); 7251 7252 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7253 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7254 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7255 } 7256 } 7257 7258 static void 7259 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7260 struct spdk_io_channel *_ch, void *_ctx) 7261 { 7262 struct spdk_bdev_io *bdev_io = _ctx; 7263 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7264 struct spdk_bdev_io *queued_reset; 7265 7266 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7267 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7268 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7269 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7270 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7271 } 7272 7273 spdk_bdev_for_each_channel_continue(i, 0); 7274 } 7275 7276 static void 7277 bdev_io_complete_sequence_cb(void *ctx, int status) 7278 { 7279 struct spdk_bdev_io *bdev_io = ctx; 7280 7281 /* u.bdev.accel_sequence should have already been cleared at this point */ 7282 assert(bdev_io->u.bdev.accel_sequence == NULL); 7283 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7284 bdev_io->internal.accel_sequence = NULL; 7285 7286 if (spdk_unlikely(status != 0)) { 7287 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7288 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7289 } 7290 7291 bdev_io_complete(bdev_io); 7292 } 7293 7294 void 7295 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7296 { 7297 struct spdk_bdev *bdev = bdev_io->bdev; 7298 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7299 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7300 7301 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7302 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7303 spdk_bdev_get_module_name(bdev), 7304 bdev_io_status_get_string(bdev_io->internal.status)); 7305 assert(false); 7306 } 7307 bdev_io->internal.status = status; 7308 7309 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7310 bool unlock_channels = false; 7311 7312 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7313 SPDK_ERRLOG("NOMEM returned for reset\n"); 7314 } 7315 spdk_spin_lock(&bdev->internal.spinlock); 7316 if (bdev_io == bdev->internal.reset_in_progress) { 7317 bdev->internal.reset_in_progress = NULL; 7318 unlock_channels = true; 7319 } 7320 spdk_spin_unlock(&bdev->internal.spinlock); 7321 7322 if (unlock_channels) { 7323 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7324 bdev_reset_complete); 7325 return; 7326 } 7327 } else { 7328 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7329 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7330 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7331 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7332 return; 7333 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0 && 
7334 !bdev_io_use_accel_sequence(bdev_io))) { 7335 _bdev_io_push_bounce_data_buffer(bdev_io, 7336 _bdev_io_complete_push_bounce_done); 7337 /* bdev IO will be completed in the callback */ 7338 return; 7339 } 7340 } 7341 7342 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7343 return; 7344 } 7345 } 7346 7347 bdev_io_complete(bdev_io); 7348 } 7349 7350 void 7351 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7352 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7353 { 7354 enum spdk_bdev_io_status status; 7355 7356 if (sc == SPDK_SCSI_STATUS_GOOD) { 7357 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7358 } else { 7359 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7360 bdev_io->internal.error.scsi.sc = sc; 7361 bdev_io->internal.error.scsi.sk = sk; 7362 bdev_io->internal.error.scsi.asc = asc; 7363 bdev_io->internal.error.scsi.ascq = ascq; 7364 } 7365 7366 spdk_bdev_io_complete(bdev_io, status); 7367 } 7368 7369 void 7370 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7371 int *sc, int *sk, int *asc, int *ascq) 7372 { 7373 assert(sc != NULL); 7374 assert(sk != NULL); 7375 assert(asc != NULL); 7376 assert(ascq != NULL); 7377 7378 switch (bdev_io->internal.status) { 7379 case SPDK_BDEV_IO_STATUS_SUCCESS: 7380 *sc = SPDK_SCSI_STATUS_GOOD; 7381 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7382 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7383 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7384 break; 7385 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7386 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7387 break; 7388 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7389 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7390 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7391 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7392 *ascq = bdev_io->internal.error.scsi.ascq; 7393 break; 7394 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7395 *sc = bdev_io->internal.error.scsi.sc; 7396 *sk = bdev_io->internal.error.scsi.sk; 7397 *asc = bdev_io->internal.error.scsi.asc; 7398 *ascq = bdev_io->internal.error.scsi.ascq; 7399 break; 7400 default: 7401 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7402 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7403 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7404 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7405 break; 7406 } 7407 } 7408 7409 void 7410 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7411 { 7412 enum spdk_bdev_io_status status; 7413 7414 if (aio_result == 0) { 7415 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7416 } else { 7417 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7418 } 7419 7420 bdev_io->internal.error.aio_result = aio_result; 7421 7422 spdk_bdev_io_complete(bdev_io, status); 7423 } 7424 7425 void 7426 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7427 { 7428 assert(aio_result != NULL); 7429 7430 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7431 *aio_result = bdev_io->internal.error.aio_result; 7432 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7433 *aio_result = 0; 7434 } else { 7435 *aio_result = -EIO; 7436 } 7437 } 7438 7439 void 7440 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7441 { 7442 enum spdk_bdev_io_status status; 7443 7444 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7445 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7446 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7447 status = SPDK_BDEV_IO_STATUS_ABORTED; 
7448 } else { 7449 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7450 } 7451 7452 bdev_io->internal.error.nvme.cdw0 = cdw0; 7453 bdev_io->internal.error.nvme.sct = sct; 7454 bdev_io->internal.error.nvme.sc = sc; 7455 7456 spdk_bdev_io_complete(bdev_io, status); 7457 } 7458 7459 void 7460 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7461 { 7462 assert(sct != NULL); 7463 assert(sc != NULL); 7464 assert(cdw0 != NULL); 7465 7466 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7467 *sct = SPDK_NVME_SCT_GENERIC; 7468 *sc = SPDK_NVME_SC_SUCCESS; 7469 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7470 *cdw0 = 0; 7471 } else { 7472 *cdw0 = 1U; 7473 } 7474 return; 7475 } 7476 7477 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7478 *sct = SPDK_NVME_SCT_GENERIC; 7479 *sc = SPDK_NVME_SC_SUCCESS; 7480 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7481 *sct = bdev_io->internal.error.nvme.sct; 7482 *sc = bdev_io->internal.error.nvme.sc; 7483 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7484 *sct = SPDK_NVME_SCT_GENERIC; 7485 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7486 } else { 7487 *sct = SPDK_NVME_SCT_GENERIC; 7488 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7489 } 7490 7491 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7492 } 7493 7494 void 7495 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7496 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7497 { 7498 assert(first_sct != NULL); 7499 assert(first_sc != NULL); 7500 assert(second_sct != NULL); 7501 assert(second_sc != NULL); 7502 assert(cdw0 != NULL); 7503 7504 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7505 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7506 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7507 *first_sct = bdev_io->internal.error.nvme.sct; 7508 *first_sc = bdev_io->internal.error.nvme.sc; 7509 *second_sct = SPDK_NVME_SCT_GENERIC; 7510 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7511 } else { 7512 *first_sct = SPDK_NVME_SCT_GENERIC; 7513 *first_sc = SPDK_NVME_SC_SUCCESS; 7514 *second_sct = bdev_io->internal.error.nvme.sct; 7515 *second_sc = bdev_io->internal.error.nvme.sc; 7516 } 7517 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7518 *first_sct = SPDK_NVME_SCT_GENERIC; 7519 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7520 *second_sct = SPDK_NVME_SCT_GENERIC; 7521 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7522 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7523 *first_sct = SPDK_NVME_SCT_GENERIC; 7524 *first_sc = SPDK_NVME_SC_SUCCESS; 7525 *second_sct = SPDK_NVME_SCT_GENERIC; 7526 *second_sc = SPDK_NVME_SC_SUCCESS; 7527 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7528 *first_sct = SPDK_NVME_SCT_GENERIC; 7529 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7530 *second_sct = SPDK_NVME_SCT_GENERIC; 7531 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7532 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7533 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7534 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7535 *second_sct = SPDK_NVME_SCT_GENERIC; 7536 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7537 } else { 7538 *first_sct = SPDK_NVME_SCT_GENERIC; 7539 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7540 *second_sct = SPDK_NVME_SCT_GENERIC; 7541 *second_sc 
= SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7542 } 7543 7544 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7545 } 7546 7547 void 7548 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 7549 const struct spdk_bdev_io *base_io) 7550 { 7551 switch (base_io->internal.status) { 7552 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7553 spdk_bdev_io_complete_nvme_status(bdev_io, 7554 base_io->internal.error.nvme.cdw0, 7555 base_io->internal.error.nvme.sct, 7556 base_io->internal.error.nvme.sc); 7557 break; 7558 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7559 spdk_bdev_io_complete_scsi_status(bdev_io, 7560 base_io->internal.error.scsi.sc, 7561 base_io->internal.error.scsi.sk, 7562 base_io->internal.error.scsi.asc, 7563 base_io->internal.error.scsi.ascq); 7564 break; 7565 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 7566 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 7567 break; 7568 default: 7569 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 7570 break; 7571 } 7572 } 7573 7574 struct spdk_thread * 7575 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7576 { 7577 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7578 } 7579 7580 struct spdk_io_channel * 7581 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7582 { 7583 return bdev_io->internal.ch->channel; 7584 } 7585 7586 static int 7587 bdev_register(struct spdk_bdev *bdev) 7588 { 7589 char *bdev_name; 7590 char uuid[SPDK_UUID_STRING_LEN]; 7591 struct spdk_iobuf_opts iobuf_opts; 7592 int ret; 7593 7594 assert(bdev->module != NULL); 7595 7596 if (!bdev->name) { 7597 SPDK_ERRLOG("Bdev name is NULL\n"); 7598 return -EINVAL; 7599 } 7600 7601 if (!strlen(bdev->name)) { 7602 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7603 return -EINVAL; 7604 } 7605 7606 /* Users often register their own I/O devices using the bdev name. In 7607 * order to avoid conflicts, prepend bdev_. */ 7608 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7609 if (!bdev_name) { 7610 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7611 return -ENOMEM; 7612 } 7613 7614 bdev->internal.stat = bdev_alloc_io_stat(true); 7615 if (!bdev->internal.stat) { 7616 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7617 free(bdev_name); 7618 return -ENOMEM; 7619 } 7620 7621 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7622 bdev->internal.measured_queue_depth = UINT64_MAX; 7623 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7624 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7625 bdev->internal.qd_poller = NULL; 7626 bdev->internal.qos = NULL; 7627 7628 TAILQ_INIT(&bdev->internal.open_descs); 7629 TAILQ_INIT(&bdev->internal.locked_ranges); 7630 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7631 TAILQ_INIT(&bdev->aliases); 7632 7633 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7634 if (ret != 0) { 7635 bdev_free_io_stat(bdev->internal.stat); 7636 free(bdev_name); 7637 return ret; 7638 } 7639 7640 /* UUID may be specified by the user or defined by bdev itself. 7641 * Otherwise it will be generated here, so this field will never be empty. 
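 *
 * For instance, a module that wants a stable UUID across restarts can fill
 * the field in before calling spdk_bdev_register(); a minimal sketch, with a
 * placeholder UUID string:
 *
 *     spdk_uuid_parse(&bdev->uuid, "deadbeef-0000-4000-8000-000000000001");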
*/ 7642 if (spdk_uuid_is_null(&bdev->uuid)) { 7643 spdk_uuid_generate(&bdev->uuid); 7644 } 7645 7646 /* Add the UUID alias only if it's different than the name */ 7647 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7648 if (strcmp(bdev->name, uuid) != 0) { 7649 ret = spdk_bdev_alias_add(bdev, uuid); 7650 if (ret != 0) { 7651 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7652 bdev_name_del(&bdev->internal.bdev_name); 7653 bdev_free_io_stat(bdev->internal.stat); 7654 free(bdev_name); 7655 return ret; 7656 } 7657 } 7658 7659 spdk_iobuf_get_opts(&iobuf_opts); 7660 if (spdk_bdev_get_buf_align(bdev) > 1) { 7661 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 7662 iobuf_opts.large_bufsize / bdev->blocklen); 7663 } 7664 7665 /* If the user didn't specify a write unit size, set it to one. */ 7666 if (bdev->write_unit_size == 0) { 7667 bdev->write_unit_size = 1; 7668 } 7669 7670 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7671 if (bdev->acwu == 0) { 7672 bdev->acwu = bdev->write_unit_size; 7673 } 7674 7675 if (bdev->phys_blocklen == 0) { 7676 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7677 } 7678 7679 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7680 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7681 } 7682 7683 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7684 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7685 } 7686 7687 bdev->internal.reset_in_progress = NULL; 7688 bdev->internal.qd_poll_in_progress = false; 7689 bdev->internal.period = 0; 7690 bdev->internal.new_period = 0; 7691 7692 spdk_io_device_register(__bdev_to_io_dev(bdev), 7693 bdev_channel_create, bdev_channel_destroy, 7694 sizeof(struct spdk_bdev_channel), 7695 bdev_name); 7696 7697 free(bdev_name); 7698 7699 spdk_spin_init(&bdev->internal.spinlock); 7700 7701 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7702 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7703 7704 return 0; 7705 } 7706 7707 static void 7708 bdev_destroy_cb(void *io_device) 7709 { 7710 int rc; 7711 struct spdk_bdev *bdev; 7712 spdk_bdev_unregister_cb cb_fn; 7713 void *cb_arg; 7714 7715 bdev = __bdev_from_io_dev(io_device); 7716 7717 if (bdev->internal.unregister_td != spdk_get_thread()) { 7718 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7719 return; 7720 } 7721 7722 cb_fn = bdev->internal.unregister_cb; 7723 cb_arg = bdev->internal.unregister_ctx; 7724 7725 spdk_spin_destroy(&bdev->internal.spinlock); 7726 free(bdev->internal.qos); 7727 bdev_free_io_stat(bdev->internal.stat); 7728 7729 rc = bdev->fn_table->destruct(bdev->ctxt); 7730 if (rc < 0) { 7731 SPDK_ERRLOG("destruct failed\n"); 7732 } 7733 if (rc <= 0 && cb_fn != NULL) { 7734 cb_fn(cb_arg, rc); 7735 } 7736 } 7737 7738 void 7739 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7740 { 7741 if (bdev->internal.unregister_cb != NULL) { 7742 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7743 } 7744 } 7745 7746 static void 7747 _remove_notify(void *arg) 7748 { 7749 struct spdk_bdev_desc *desc = arg; 7750 7751 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7752 } 7753 7754 /* returns: 0 - bdev removed and ready to be destructed. 7755 * -EBUSY - bdev can't be destructed yet. 
*/ 7756 static int 7757 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7758 { 7759 struct spdk_bdev_desc *desc, *tmp; 7760 int rc = 0; 7761 char uuid[SPDK_UUID_STRING_LEN]; 7762 7763 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7764 assert(spdk_spin_held(&bdev->internal.spinlock)); 7765 7766 /* Notify each descriptor about hotremoval */ 7767 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7768 rc = -EBUSY; 7769 /* 7770 * Defer invocation of the event_cb to a separate message that will 7771 * run later on its thread. This ensures this context unwinds and 7772 * we don't recursively unregister this bdev again if the event_cb 7773 * immediately closes its descriptor. 7774 */ 7775 event_notify(desc, _remove_notify); 7776 } 7777 7778 /* If there are no descriptors, proceed removing the bdev */ 7779 if (rc == 0) { 7780 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7781 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7782 7783 /* Delete the name and the UUID alias */ 7784 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7785 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7786 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7787 7788 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7789 7790 if (bdev->internal.reset_in_progress != NULL) { 7791 /* If reset is in progress, let the completion callback for reset 7792 * unregister the bdev. 7793 */ 7794 rc = -EBUSY; 7795 } 7796 } 7797 7798 return rc; 7799 } 7800 7801 static void 7802 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7803 struct spdk_io_channel *io_ch, void *_ctx) 7804 { 7805 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7806 7807 bdev_channel_abort_queued_ios(bdev_ch); 7808 spdk_bdev_for_each_channel_continue(i, 0); 7809 } 7810 7811 static void 7812 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7813 { 7814 int rc; 7815 7816 spdk_spin_lock(&g_bdev_mgr.spinlock); 7817 spdk_spin_lock(&bdev->internal.spinlock); 7818 /* 7819 * Set the status to REMOVING after completing to abort channels. Otherwise, 7820 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7821 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7822 * may fail. 7823 */ 7824 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7825 rc = bdev_unregister_unsafe(bdev); 7826 spdk_spin_unlock(&bdev->internal.spinlock); 7827 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7828 7829 if (rc == 0) { 7830 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7831 } 7832 } 7833 7834 void 7835 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7836 { 7837 struct spdk_thread *thread; 7838 7839 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7840 7841 thread = spdk_get_thread(); 7842 if (!thread) { 7843 /* The user called this from a non-SPDK thread. 
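 *
 * A caller in that situation can hop onto an SPDK thread first; a minimal
 * sketch, assuming an SPDK thread app_thread is available (names are
 * hypothetical):
 *
 *     static void
 *     unregister_msg(void *ctx)
 *     {
 *         spdk_bdev_unregister(ctx, unregister_done, NULL);
 *     }
 *
 *     spdk_thread_send_msg(app_thread, unregister_msg, bdev);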
*/ 7844 if (cb_fn != NULL) { 7845 cb_fn(cb_arg, -ENOTSUP); 7846 } 7847 return; 7848 } 7849 7850 spdk_spin_lock(&g_bdev_mgr.spinlock); 7851 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7852 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7853 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7854 if (cb_fn) { 7855 cb_fn(cb_arg, -EBUSY); 7856 } 7857 return; 7858 } 7859 7860 spdk_spin_lock(&bdev->internal.spinlock); 7861 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7862 bdev->internal.unregister_cb = cb_fn; 7863 bdev->internal.unregister_ctx = cb_arg; 7864 bdev->internal.unregister_td = thread; 7865 spdk_spin_unlock(&bdev->internal.spinlock); 7866 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7867 7868 spdk_bdev_set_qd_sampling_period(bdev, 0); 7869 7870 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7871 bdev_unregister); 7872 } 7873 7874 int 7875 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7876 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7877 { 7878 struct spdk_bdev_desc *desc; 7879 struct spdk_bdev *bdev; 7880 int rc; 7881 7882 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7883 if (rc != 0) { 7884 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7885 return rc; 7886 } 7887 7888 bdev = spdk_bdev_desc_get_bdev(desc); 7889 7890 if (bdev->module != module) { 7891 spdk_bdev_close(desc); 7892 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7893 bdev_name); 7894 return -ENODEV; 7895 } 7896 7897 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7898 7899 spdk_bdev_close(desc); 7900 7901 return 0; 7902 } 7903 7904 static int 7905 bdev_start_qos(struct spdk_bdev *bdev) 7906 { 7907 struct set_qos_limit_ctx *ctx; 7908 7909 /* Enable QoS */ 7910 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7911 ctx = calloc(1, sizeof(*ctx)); 7912 if (ctx == NULL) { 7913 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7914 return -ENOMEM; 7915 } 7916 ctx->bdev = bdev; 7917 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7918 } 7919 7920 return 0; 7921 } 7922 7923 static void 7924 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7925 struct spdk_bdev *bdev) 7926 { 7927 enum spdk_bdev_claim_type type; 7928 const char *typename, *modname; 7929 extern struct spdk_log_flag SPDK_LOG_bdev; 7930 7931 assert(spdk_spin_held(&bdev->internal.spinlock)); 7932 7933 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7934 return; 7935 } 7936 7937 type = bdev->internal.claim_type; 7938 typename = spdk_bdev_claim_get_name(type); 7939 7940 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7941 modname = bdev->internal.claim.v1.module->name; 7942 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7943 bdev->name, detail, typename, modname); 7944 return; 7945 } 7946 7947 if (claim_type_is_v2(type)) { 7948 struct spdk_bdev_module_claim *claim; 7949 7950 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7951 modname = claim->module->name; 7952 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7953 bdev->name, detail, typename, modname); 7954 } 7955 return; 7956 } 7957 7958 assert(false); 7959 } 7960 7961 static int 7962 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7963 { 7964 struct spdk_thread *thread; 7965 int rc = 0; 7966 7967 thread = spdk_get_thread(); 7968 if (!thread) { 7969 
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7970 return -ENOTSUP; 7971 } 7972 7973 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7974 spdk_get_thread()); 7975 7976 desc->bdev = bdev; 7977 desc->thread = thread; 7978 desc->write = write; 7979 7980 spdk_spin_lock(&bdev->internal.spinlock); 7981 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7982 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7983 spdk_spin_unlock(&bdev->internal.spinlock); 7984 return -ENODEV; 7985 } 7986 7987 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7988 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7989 spdk_spin_unlock(&bdev->internal.spinlock); 7990 return -EPERM; 7991 } 7992 7993 rc = bdev_start_qos(bdev); 7994 if (rc != 0) { 7995 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7996 spdk_spin_unlock(&bdev->internal.spinlock); 7997 return rc; 7998 } 7999 8000 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8001 8002 spdk_spin_unlock(&bdev->internal.spinlock); 8003 8004 return 0; 8005 } 8006 8007 static int 8008 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8009 struct spdk_bdev_desc **_desc) 8010 { 8011 struct spdk_bdev_desc *desc; 8012 unsigned int i; 8013 8014 desc = calloc(1, sizeof(*desc)); 8015 if (desc == NULL) { 8016 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8017 return -ENOMEM; 8018 } 8019 8020 TAILQ_INIT(&desc->pending_media_events); 8021 TAILQ_INIT(&desc->free_media_events); 8022 8023 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8024 desc->callback.event_fn = event_cb; 8025 desc->callback.ctx = event_ctx; 8026 spdk_spin_init(&desc->spinlock); 8027 8028 if (bdev->media_events) { 8029 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8030 sizeof(*desc->media_events_buffer)); 8031 if (desc->media_events_buffer == NULL) { 8032 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8033 bdev_desc_free(desc); 8034 return -ENOMEM; 8035 } 8036 8037 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8038 TAILQ_INSERT_TAIL(&desc->free_media_events, 8039 &desc->media_events_buffer[i], tailq); 8040 } 8041 } 8042 8043 if (bdev->fn_table->accel_sequence_supported != NULL) { 8044 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8045 desc->accel_sequence_supported[i] = 8046 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8047 (enum spdk_bdev_io_type)i); 8048 } 8049 } 8050 8051 *_desc = desc; 8052 8053 return 0; 8054 } 8055 8056 static int 8057 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8058 void *event_ctx, struct spdk_bdev_desc **_desc) 8059 { 8060 struct spdk_bdev_desc *desc; 8061 struct spdk_bdev *bdev; 8062 int rc; 8063 8064 bdev = bdev_get_by_name(bdev_name); 8065 8066 if (bdev == NULL) { 8067 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8068 return -ENODEV; 8069 } 8070 8071 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 8072 if (rc != 0) { 8073 return rc; 8074 } 8075 8076 rc = bdev_open(bdev, write, desc); 8077 if (rc != 0) { 8078 bdev_desc_free(desc); 8079 desc = NULL; 8080 } 8081 8082 *_desc = desc; 8083 8084 return rc; 8085 } 8086 8087 int 8088 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8089 void *event_ctx, struct spdk_bdev_desc **_desc) 8090 { 8091 int rc; 8092 8093 if (event_cb == NULL) { 8094 SPDK_ERRLOG("Missing event callback function\n"); 8095 return 
-EINVAL; 8096 } 8097 8098 spdk_spin_lock(&g_bdev_mgr.spinlock); 8099 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 8100 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8101 8102 return rc; 8103 } 8104 8105 struct spdk_bdev_open_async_ctx { 8106 char *bdev_name; 8107 spdk_bdev_event_cb_t event_cb; 8108 void *event_ctx; 8109 bool write; 8110 int rc; 8111 spdk_bdev_open_async_cb_t cb_fn; 8112 void *cb_arg; 8113 struct spdk_bdev_desc *desc; 8114 struct spdk_bdev_open_async_opts opts; 8115 uint64_t start_ticks; 8116 struct spdk_thread *orig_thread; 8117 struct spdk_poller *poller; 8118 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8119 }; 8120 8121 static void 8122 bdev_open_async_done(void *arg) 8123 { 8124 struct spdk_bdev_open_async_ctx *ctx = arg; 8125 8126 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8127 8128 free(ctx->bdev_name); 8129 free(ctx); 8130 } 8131 8132 static void 8133 bdev_open_async_cancel(void *arg) 8134 { 8135 struct spdk_bdev_open_async_ctx *ctx = arg; 8136 8137 assert(ctx->rc == -ESHUTDOWN); 8138 8139 spdk_poller_unregister(&ctx->poller); 8140 8141 bdev_open_async_done(ctx); 8142 } 8143 8144 /* This is called when the bdev library finishes at shutdown. */ 8145 static void 8146 bdev_open_async_fini(void) 8147 { 8148 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8149 8150 spdk_spin_lock(&g_bdev_mgr.spinlock); 8151 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8152 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8153 /* 8154 * We have to move to ctx->orig_thread to unregister ctx->poller. 8155 * However, there is a chance that ctx->poller is executed before 8156 * message is executed, which could result in bdev_open_async_done() 8157 * being called twice. To avoid such race condition, set ctx->rc to 8158 * -ESHUTDOWN. 8159 */ 8160 ctx->rc = -ESHUTDOWN; 8161 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8162 } 8163 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8164 } 8165 8166 static int bdev_open_async(void *arg); 8167 8168 static void 8169 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8170 { 8171 uint64_t timeout_ticks; 8172 8173 if (ctx->rc == -ESHUTDOWN) { 8174 /* This context is being canceled. Do nothing. */ 8175 return; 8176 } 8177 8178 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8179 &ctx->desc); 8180 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8181 goto exit; 8182 } 8183 8184 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8185 if (spdk_get_ticks() >= timeout_ticks) { 8186 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8187 ctx->rc = -ETIMEDOUT; 8188 goto exit; 8189 } 8190 8191 return; 8192 8193 exit: 8194 spdk_poller_unregister(&ctx->poller); 8195 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8196 8197 /* Completion callback is processed after stack unwinding. 
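 *
 * From the caller's point of view the whole sequence looks like the sketch
 * below; the bdev name, callbacks and context are hypothetical, and open_cb
 * fires later on the thread that called spdk_bdev_open_async() once the bdev
 * appears or the timeout expires:
 *
 *     struct spdk_bdev_open_async_opts opts = {};
 *
 *     opts.size = sizeof(opts);
 *     opts.timeout_ms = 10 * 1000;
 *     rc = spdk_bdev_open_async("Nvme0n1", false, my_event_cb, my_ctx,
 *                               &opts, my_open_cb, my_ctx);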
*/ 8198 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8199 } 8200 8201 static int 8202 bdev_open_async(void *arg) 8203 { 8204 struct spdk_bdev_open_async_ctx *ctx = arg; 8205 8206 spdk_spin_lock(&g_bdev_mgr.spinlock); 8207 8208 _bdev_open_async(ctx); 8209 8210 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8211 8212 return SPDK_POLLER_BUSY; 8213 } 8214 8215 static void 8216 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8217 struct spdk_bdev_open_async_opts *opts_src, 8218 size_t size) 8219 { 8220 assert(opts); 8221 assert(opts_src); 8222 8223 opts->size = size; 8224 8225 #define SET_FIELD(field) \ 8226 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8227 opts->field = opts_src->field; \ 8228 } \ 8229 8230 SET_FIELD(timeout_ms); 8231 8232 /* Do not remove this statement, you should always update this statement when you adding a new field, 8233 * and do not forget to add the SET_FIELD statement for your added field. */ 8234 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8235 8236 #undef SET_FIELD 8237 } 8238 8239 static void 8240 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8241 { 8242 assert(opts); 8243 8244 opts->size = size; 8245 8246 #define SET_FIELD(field, value) \ 8247 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8248 opts->field = value; \ 8249 } \ 8250 8251 SET_FIELD(timeout_ms, 0); 8252 8253 #undef SET_FIELD 8254 } 8255 8256 int 8257 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8258 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8259 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8260 { 8261 struct spdk_bdev_open_async_ctx *ctx; 8262 8263 if (event_cb == NULL) { 8264 SPDK_ERRLOG("Missing event callback function\n"); 8265 return -EINVAL; 8266 } 8267 8268 if (open_cb == NULL) { 8269 SPDK_ERRLOG("Missing open callback function\n"); 8270 return -EINVAL; 8271 } 8272 8273 if (opts != NULL && opts->size == 0) { 8274 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8275 return -EINVAL; 8276 } 8277 8278 ctx = calloc(1, sizeof(*ctx)); 8279 if (ctx == NULL) { 8280 SPDK_ERRLOG("Failed to allocate open context\n"); 8281 return -ENOMEM; 8282 } 8283 8284 ctx->bdev_name = strdup(bdev_name); 8285 if (ctx->bdev_name == NULL) { 8286 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8287 free(ctx); 8288 return -ENOMEM; 8289 } 8290 8291 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8292 if (ctx->poller == NULL) { 8293 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8294 free(ctx->bdev_name); 8295 free(ctx); 8296 return -ENOMEM; 8297 } 8298 8299 ctx->cb_fn = open_cb; 8300 ctx->cb_arg = open_cb_arg; 8301 ctx->write = write; 8302 ctx->event_cb = event_cb; 8303 ctx->event_ctx = event_ctx; 8304 ctx->orig_thread = spdk_get_thread(); 8305 ctx->start_ticks = spdk_get_ticks(); 8306 8307 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8308 if (opts != NULL) { 8309 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8310 } 8311 8312 spdk_spin_lock(&g_bdev_mgr.spinlock); 8313 8314 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8315 _bdev_open_async(ctx); 8316 8317 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8318 8319 return 0; 8320 } 8321 8322 static void 8323 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8324 { 8325 int rc; 8326 8327 
spdk_spin_lock(&bdev->internal.spinlock); 8328 spdk_spin_lock(&desc->spinlock); 8329 8330 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8331 8332 desc->closed = true; 8333 8334 if (desc->claim != NULL) { 8335 bdev_desc_release_claims(desc); 8336 } 8337 8338 if (0 == desc->refs) { 8339 spdk_spin_unlock(&desc->spinlock); 8340 bdev_desc_free(desc); 8341 } else { 8342 spdk_spin_unlock(&desc->spinlock); 8343 } 8344 8345 /* If no more descriptors, kill QoS channel */ 8346 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8347 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8348 bdev->name, spdk_get_thread()); 8349 8350 if (bdev_qos_destroy(bdev)) { 8351 /* There isn't anything we can do to recover here. Just let the 8352 * old QoS poller keep running. The QoS handling won't change 8353 * cores when the user allocates a new channel, but it won't break. */ 8354 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8355 } 8356 } 8357 8358 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8359 rc = bdev_unregister_unsafe(bdev); 8360 spdk_spin_unlock(&bdev->internal.spinlock); 8361 8362 if (rc == 0) { 8363 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8364 } 8365 } else { 8366 spdk_spin_unlock(&bdev->internal.spinlock); 8367 } 8368 } 8369 8370 void 8371 spdk_bdev_close(struct spdk_bdev_desc *desc) 8372 { 8373 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8374 8375 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8376 spdk_get_thread()); 8377 8378 assert(desc->thread == spdk_get_thread()); 8379 8380 spdk_poller_unregister(&desc->io_timeout_poller); 8381 8382 spdk_spin_lock(&g_bdev_mgr.spinlock); 8383 8384 bdev_close(bdev, desc); 8385 8386 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8387 } 8388 8389 static void 8390 bdev_register_finished(void *arg) 8391 { 8392 struct spdk_bdev_desc *desc = arg; 8393 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8394 8395 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8396 8397 spdk_spin_lock(&g_bdev_mgr.spinlock); 8398 8399 bdev_close(bdev, desc); 8400 8401 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8402 } 8403 8404 int 8405 spdk_bdev_register(struct spdk_bdev *bdev) 8406 { 8407 struct spdk_bdev_desc *desc; 8408 struct spdk_thread *thread = spdk_get_thread(); 8409 int rc; 8410 8411 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8412 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 8413 thread ? 
spdk_thread_get_name(thread) : "null"); 8414 return -EINVAL; 8415 } 8416 8417 rc = bdev_register(bdev); 8418 if (rc != 0) { 8419 return rc; 8420 } 8421 8422 /* A descriptor is opened to prevent bdev deletion during examination */ 8423 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8424 if (rc != 0) { 8425 spdk_bdev_unregister(bdev, NULL, NULL); 8426 return rc; 8427 } 8428 8429 rc = bdev_open(bdev, false, desc); 8430 if (rc != 0) { 8431 bdev_desc_free(desc); 8432 spdk_bdev_unregister(bdev, NULL, NULL); 8433 return rc; 8434 } 8435 8436 /* Examine configuration before initializing I/O */ 8437 bdev_examine(bdev); 8438 8439 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8440 if (rc != 0) { 8441 bdev_close(bdev, desc); 8442 spdk_bdev_unregister(bdev, NULL, NULL); 8443 } 8444 8445 return rc; 8446 } 8447 8448 int 8449 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8450 struct spdk_bdev_module *module) 8451 { 8452 spdk_spin_lock(&bdev->internal.spinlock); 8453 8454 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8455 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8456 spdk_spin_unlock(&bdev->internal.spinlock); 8457 return -EPERM; 8458 } 8459 8460 if (desc && !desc->write) { 8461 desc->write = true; 8462 } 8463 8464 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8465 bdev->internal.claim.v1.module = module; 8466 8467 spdk_spin_unlock(&bdev->internal.spinlock); 8468 return 0; 8469 } 8470 8471 void 8472 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8473 { 8474 spdk_spin_lock(&bdev->internal.spinlock); 8475 8476 assert(bdev->internal.claim.v1.module != NULL); 8477 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8478 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8479 bdev->internal.claim.v1.module = NULL; 8480 8481 spdk_spin_unlock(&bdev->internal.spinlock); 8482 } 8483 8484 /* 8485 * Start claims v2 8486 */ 8487 8488 const char * 8489 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8490 { 8491 switch (type) { 8492 case SPDK_BDEV_CLAIM_NONE: 8493 return "not_claimed"; 8494 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8495 return "exclusive_write"; 8496 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8497 return "read_many_write_one"; 8498 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8499 return "read_many_write_none"; 8500 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8501 return "read_many_write_many"; 8502 default: 8503 break; 8504 } 8505 return "invalid_claim"; 8506 } 8507 8508 static bool 8509 claim_type_is_v2(enum spdk_bdev_claim_type type) 8510 { 8511 switch (type) { 8512 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8513 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8514 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8515 return true; 8516 default: 8517 break; 8518 } 8519 return false; 8520 } 8521 8522 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
*/ 8523 static bool 8524 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8525 { 8526 switch (type) { 8527 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8528 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8529 return true; 8530 default: 8531 break; 8532 } 8533 return false; 8534 } 8535 8536 void 8537 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8538 { 8539 if (opts == NULL) { 8540 SPDK_ERRLOG("opts should not be NULL\n"); 8541 assert(opts != NULL); 8542 return; 8543 } 8544 if (size == 0) { 8545 SPDK_ERRLOG("size should not be zero\n"); 8546 assert(size != 0); 8547 return; 8548 } 8549 8550 memset(opts, 0, size); 8551 opts->opts_size = size; 8552 8553 #define FIELD_OK(field) \ 8554 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8555 8556 #define SET_FIELD(field, value) \ 8557 if (FIELD_OK(field)) { \ 8558 opts->field = value; \ 8559 } \ 8560 8561 SET_FIELD(shared_claim_key, 0); 8562 8563 #undef FIELD_OK 8564 #undef SET_FIELD 8565 } 8566 8567 static int 8568 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8569 { 8570 if (src->opts_size == 0) { 8571 SPDK_ERRLOG("size should not be zero\n"); 8572 return -1; 8573 } 8574 8575 memset(dst, 0, sizeof(*dst)); 8576 dst->opts_size = src->opts_size; 8577 8578 #define FIELD_OK(field) \ 8579 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8580 8581 #define SET_FIELD(field) \ 8582 if (FIELD_OK(field)) { \ 8583 dst->field = src->field; \ 8584 } \ 8585 8586 if (FIELD_OK(name)) { 8587 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8588 } 8589 8590 SET_FIELD(shared_claim_key); 8591 8592 /* You should not remove this statement, but need to update the assert statement 8593 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8594 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8595 8596 #undef FIELD_OK 8597 #undef SET_FIELD 8598 return 0; 8599 } 8600 8601 /* Returns 0 if a read-write-once claim can be taken. */ 8602 static int 8603 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8604 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8605 { 8606 struct spdk_bdev *bdev = desc->bdev; 8607 struct spdk_bdev_desc *open_desc; 8608 8609 assert(spdk_spin_held(&bdev->internal.spinlock)); 8610 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8611 8612 if (opts->shared_claim_key != 0) { 8613 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8614 bdev->name); 8615 return -EINVAL; 8616 } 8617 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8618 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8619 return -EPERM; 8620 } 8621 if (desc->claim != NULL) { 8622 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8623 bdev->name, desc->claim->module->name); 8624 return -EPERM; 8625 } 8626 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8627 if (desc != open_desc && open_desc->write) { 8628 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8629 "another descriptor is open for writing\n", 8630 bdev->name); 8631 return -EPERM; 8632 } 8633 } 8634 8635 return 0; 8636 } 8637 8638 /* Returns 0 if a read-only-many claim can be taken. 
*/ 8639 static int 8640 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8641 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8642 { 8643 struct spdk_bdev *bdev = desc->bdev; 8644 struct spdk_bdev_desc *open_desc; 8645 8646 assert(spdk_spin_held(&bdev->internal.spinlock)); 8647 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 8648 assert(desc->claim == NULL); 8649 8650 if (desc->write) { 8651 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 8652 bdev->name); 8653 return -EINVAL; 8654 } 8655 if (opts->shared_claim_key != 0) { 8656 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 8657 return -EINVAL; 8658 } 8659 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8660 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8661 if (open_desc->write) { 8662 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 8663 "another descriptor is open for writing\n", 8664 bdev->name); 8665 return -EPERM; 8666 } 8667 } 8668 } 8669 8670 return 0; 8671 } 8672 8673 /* Returns 0 if a read-write-many claim can be taken. */ 8674 static int 8675 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8676 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8677 { 8678 struct spdk_bdev *bdev = desc->bdev; 8679 struct spdk_bdev_desc *open_desc; 8680 8681 assert(spdk_spin_held(&bdev->internal.spinlock)); 8682 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 8683 assert(desc->claim == NULL); 8684 8685 if (opts->shared_claim_key == 0) { 8686 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 8687 bdev->name); 8688 return -EINVAL; 8689 } 8690 switch (bdev->internal.claim_type) { 8691 case SPDK_BDEV_CLAIM_NONE: 8692 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8693 if (open_desc == desc) { 8694 continue; 8695 } 8696 if (open_desc->write) { 8697 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 8698 "another descriptor is open for writing without a " 8699 "claim\n", bdev->name); 8700 return -EPERM; 8701 } 8702 } 8703 break; 8704 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8705 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 8706 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 8707 return -EPERM; 8708 } 8709 break; 8710 default: 8711 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8712 return -EBUSY; 8713 } 8714 8715 return 0; 8716 } 8717 8718 /* Updates desc and its bdev with a v2 claim.
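 *
 * Modules reach this path through spdk_bdev_module_claim_bdev_desc() below.
 * A minimal sketch of taking a claim on an already-open descriptor (claim
 * name and module variable are hypothetical):
 *
 *     struct spdk_bdev_claim_opts opts;
 *
 *     spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *     snprintf(opts.name, sizeof(opts.name), "my_claim");
 *     rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE,
 *                                           &opts, &my_bdev_module);
 *
 * With SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE the descriptor is promoted to
 * writable as part of taking the claim.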
*/ 8719 static int 8720 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8721 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8722 { 8723 struct spdk_bdev *bdev = desc->bdev; 8724 struct spdk_bdev_module_claim *claim; 8725 8726 assert(spdk_spin_held(&bdev->internal.spinlock)); 8727 assert(claim_type_is_v2(type)); 8728 assert(desc->claim == NULL); 8729 8730 claim = calloc(1, sizeof(*desc->claim)); 8731 if (claim == NULL) { 8732 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8733 return -ENOMEM; 8734 } 8735 claim->module = module; 8736 claim->desc = desc; 8737 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8738 memcpy(claim->name, opts->name, sizeof(claim->name)); 8739 desc->claim = claim; 8740 8741 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8742 bdev->internal.claim_type = type; 8743 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8744 bdev->internal.claim.v2.key = opts->shared_claim_key; 8745 } 8746 assert(type == bdev->internal.claim_type); 8747 8748 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8749 8750 if (!desc->write && claim_type_promotes_to_write(type)) { 8751 desc->write = true; 8752 } 8753 8754 return 0; 8755 } 8756 8757 int 8758 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8759 struct spdk_bdev_claim_opts *_opts, 8760 struct spdk_bdev_module *module) 8761 { 8762 struct spdk_bdev *bdev; 8763 struct spdk_bdev_claim_opts opts; 8764 int rc = 0; 8765 8766 if (desc == NULL) { 8767 SPDK_ERRLOG("descriptor must not be NULL\n"); 8768 return -EINVAL; 8769 } 8770 8771 bdev = desc->bdev; 8772 8773 if (_opts == NULL) { 8774 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8775 } else if (claim_opts_copy(_opts, &opts) != 0) { 8776 return -EINVAL; 8777 } 8778 8779 spdk_spin_lock(&bdev->internal.spinlock); 8780 8781 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8782 bdev->internal.claim_type != type) { 8783 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8784 spdk_spin_unlock(&bdev->internal.spinlock); 8785 return -EPERM; 8786 } 8787 8788 if (claim_type_is_v2(type) && desc->claim != NULL) { 8789 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8790 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8791 spdk_spin_unlock(&bdev->internal.spinlock); 8792 return -EPERM; 8793 } 8794 8795 switch (type) { 8796 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8797 spdk_spin_unlock(&bdev->internal.spinlock); 8798 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8799 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8800 rc = claim_verify_rwo(desc, type, &opts, module); 8801 break; 8802 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8803 rc = claim_verify_rom(desc, type, &opts, module); 8804 break; 8805 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8806 rc = claim_verify_rwm(desc, type, &opts, module); 8807 break; 8808 default: 8809 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8810 rc = -ENOTSUP; 8811 } 8812 8813 if (rc == 0) { 8814 rc = claim_bdev(desc, type, &opts, module); 8815 } 8816 8817 spdk_spin_unlock(&bdev->internal.spinlock); 8818 return rc; 8819 } 8820 8821 static void 8822 claim_reset(struct spdk_bdev *bdev) 8823 { 8824 assert(spdk_spin_held(&bdev->internal.spinlock)); 8825 assert(claim_type_is_v2(bdev->internal.claim_type)); 8826 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8827 8828 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8829 
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8830 } 8831 8832 static void 8833 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8834 { 8835 struct spdk_bdev *bdev = desc->bdev; 8836 8837 assert(spdk_spin_held(&bdev->internal.spinlock)); 8838 assert(claim_type_is_v2(bdev->internal.claim_type)); 8839 8840 if (bdev->internal.examine_in_progress == 0) { 8841 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8842 free(desc->claim); 8843 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8844 claim_reset(bdev); 8845 } 8846 } else { 8847 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8848 desc->claim->module = NULL; 8849 desc->claim->desc = NULL; 8850 } 8851 desc->claim = NULL; 8852 } 8853 8854 /* 8855 * End claims v2 8856 */ 8857 8858 struct spdk_bdev * 8859 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8860 { 8861 assert(desc != NULL); 8862 return desc->bdev; 8863 } 8864 8865 int 8866 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8867 { 8868 struct spdk_bdev *bdev, *tmp; 8869 struct spdk_bdev_desc *desc; 8870 int rc = 0; 8871 8872 assert(fn != NULL); 8873 8874 spdk_spin_lock(&g_bdev_mgr.spinlock); 8875 bdev = spdk_bdev_first(); 8876 while (bdev != NULL) { 8877 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8878 if (rc != 0) { 8879 break; 8880 } 8881 rc = bdev_open(bdev, false, desc); 8882 if (rc != 0) { 8883 bdev_desc_free(desc); 8884 if (rc == -ENODEV) { 8885 /* Ignore the error and move to the next bdev. */ 8886 rc = 0; 8887 bdev = spdk_bdev_next(bdev); 8888 continue; 8889 } 8890 break; 8891 } 8892 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8893 8894 rc = fn(ctx, bdev); 8895 8896 spdk_spin_lock(&g_bdev_mgr.spinlock); 8897 tmp = spdk_bdev_next(bdev); 8898 bdev_close(bdev, desc); 8899 if (rc != 0) { 8900 break; 8901 } 8902 bdev = tmp; 8903 } 8904 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8905 8906 return rc; 8907 } 8908 8909 int 8910 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8911 { 8912 struct spdk_bdev *bdev, *tmp; 8913 struct spdk_bdev_desc *desc; 8914 int rc = 0; 8915 8916 assert(fn != NULL); 8917 8918 spdk_spin_lock(&g_bdev_mgr.spinlock); 8919 bdev = spdk_bdev_first_leaf(); 8920 while (bdev != NULL) { 8921 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8922 if (rc != 0) { 8923 break; 8924 } 8925 rc = bdev_open(bdev, false, desc); 8926 if (rc != 0) { 8927 bdev_desc_free(desc); 8928 if (rc == -ENODEV) { 8929 /* Ignore the error and move to the next bdev. 
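 * This typically means the bdev is being unregistered, so the walk simply
 * skips it.
 *
 * For callers, both spdk_for_each_bdev() and spdk_for_each_bdev_leaf() just
 * invoke fn(ctx, bdev) with the bdev held open for the duration of the call,
 * and a non-zero return value stops the iteration. A minimal sketch (the
 * callback name is hypothetical):
 *
 *     static int
 *     count_one(void *ctx, struct spdk_bdev *bdev)
 *     {
 *         (*(int *)ctx)++;
 *         return 0;
 *     }
 *
 *     int count = 0;
 *     spdk_for_each_bdev(&count, count_one);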
*/ 8930 rc = 0; 8931 bdev = spdk_bdev_next_leaf(bdev); 8932 continue; 8933 } 8934 break; 8935 } 8936 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8937 8938 rc = fn(ctx, bdev); 8939 8940 spdk_spin_lock(&g_bdev_mgr.spinlock); 8941 tmp = spdk_bdev_next_leaf(bdev); 8942 bdev_close(bdev, desc); 8943 if (rc != 0) { 8944 break; 8945 } 8946 bdev = tmp; 8947 } 8948 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8949 8950 return rc; 8951 } 8952 8953 void 8954 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8955 { 8956 struct iovec *iovs; 8957 int iovcnt; 8958 8959 if (bdev_io == NULL) { 8960 return; 8961 } 8962 8963 switch (bdev_io->type) { 8964 case SPDK_BDEV_IO_TYPE_READ: 8965 case SPDK_BDEV_IO_TYPE_WRITE: 8966 case SPDK_BDEV_IO_TYPE_ZCOPY: 8967 iovs = bdev_io->u.bdev.iovs; 8968 iovcnt = bdev_io->u.bdev.iovcnt; 8969 break; 8970 default: 8971 iovs = NULL; 8972 iovcnt = 0; 8973 break; 8974 } 8975 8976 if (iovp) { 8977 *iovp = iovs; 8978 } 8979 if (iovcntp) { 8980 *iovcntp = iovcnt; 8981 } 8982 } 8983 8984 void * 8985 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8986 { 8987 if (bdev_io == NULL) { 8988 return NULL; 8989 } 8990 8991 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8992 return NULL; 8993 } 8994 8995 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8996 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8997 return bdev_io->u.bdev.md_buf; 8998 } 8999 9000 return NULL; 9001 } 9002 9003 void * 9004 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9005 { 9006 if (bdev_io == NULL) { 9007 assert(false); 9008 return NULL; 9009 } 9010 9011 return bdev_io->internal.caller_ctx; 9012 } 9013 9014 void 9015 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9016 { 9017 9018 if (spdk_bdev_module_list_find(bdev_module->name)) { 9019 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9020 assert(false); 9021 } 9022 9023 spdk_spin_init(&bdev_module->internal.spinlock); 9024 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9025 9026 /* 9027 * Modules with examine callbacks must be initialized first, so they are 9028 * ready to handle examine callbacks from later modules that will 9029 * register physical bdevs. 
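 *
 * For reference, a module that takes part in examine registers itself with
 * those callbacks filled in; a minimal sketch (all names are hypothetical):
 *
 *     static struct spdk_bdev_module my_module = {
 *         .name = "my_module",
 *         .module_init = my_module_init,
 *         .examine_config = my_examine_config,
 *         .examine_disk = my_examine_disk,
 *     };
 *     SPDK_BDEV_MODULE_REGISTER(my_module, &my_module)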
9030 */ 9031 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9032 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9033 } else { 9034 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9035 } 9036 } 9037 9038 struct spdk_bdev_module * 9039 spdk_bdev_module_list_find(const char *name) 9040 { 9041 struct spdk_bdev_module *bdev_module; 9042 9043 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9044 if (strcmp(name, bdev_module->name) == 0) { 9045 break; 9046 } 9047 } 9048 9049 return bdev_module; 9050 } 9051 9052 static int 9053 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9054 { 9055 uint64_t num_blocks; 9056 void *md_buf = NULL; 9057 9058 num_blocks = bdev_io->u.bdev.num_blocks; 9059 9060 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9061 md_buf = (char *)g_bdev_mgr.zero_buffer + 9062 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9063 } 9064 9065 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9066 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9067 g_bdev_mgr.zero_buffer, md_buf, 9068 bdev_io->u.bdev.offset_blocks, num_blocks, 9069 bdev_write_zero_buffer_done, bdev_io); 9070 } 9071 9072 static void 9073 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9074 { 9075 struct spdk_bdev_io *parent_io = cb_arg; 9076 9077 spdk_bdev_free_io(bdev_io); 9078 9079 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9080 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9081 } 9082 9083 static void 9084 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9085 { 9086 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9087 ctx->bdev->internal.qos_mod_in_progress = false; 9088 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9089 9090 if (ctx->cb_fn) { 9091 ctx->cb_fn(ctx->cb_arg, status); 9092 } 9093 free(ctx); 9094 } 9095 9096 static void 9097 bdev_disable_qos_done(void *cb_arg) 9098 { 9099 struct set_qos_limit_ctx *ctx = cb_arg; 9100 struct spdk_bdev *bdev = ctx->bdev; 9101 struct spdk_bdev_qos *qos; 9102 9103 spdk_spin_lock(&bdev->internal.spinlock); 9104 qos = bdev->internal.qos; 9105 bdev->internal.qos = NULL; 9106 spdk_spin_unlock(&bdev->internal.spinlock); 9107 9108 if (qos->thread != NULL) { 9109 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9110 spdk_poller_unregister(&qos->poller); 9111 } 9112 9113 free(qos); 9114 9115 bdev_set_qos_limit_done(ctx, 0); 9116 } 9117 9118 static void 9119 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9120 { 9121 struct set_qos_limit_ctx *ctx = _ctx; 9122 struct spdk_thread *thread; 9123 9124 spdk_spin_lock(&bdev->internal.spinlock); 9125 thread = bdev->internal.qos->thread; 9126 spdk_spin_unlock(&bdev->internal.spinlock); 9127 9128 if (thread != NULL) { 9129 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9130 } else { 9131 bdev_disable_qos_done(ctx); 9132 } 9133 } 9134 9135 static void 9136 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9137 struct spdk_io_channel *ch, void *_ctx) 9138 { 9139 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9140 struct spdk_bdev_io *bdev_io; 9141 9142 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9143 9144 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9145 /* Re-submit the queued I/O. 
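 * This path is reached when spdk_bdev_set_qos_rate_limits() (further below)
 * ends up disabling QoS entirely, i.e. every remaining limit is cleared.
 *
 * For reference, callers adjust limits with an array indexed by rate limit
 * type, where 0 clears a limit and SPDK_BDEV_QOS_LIMIT_NOT_DEFINED leaves it
 * untouched; a minimal sketch (callback and context are hypothetical,
 * bandwidth limits are given in MB/s):
 *
 *     uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {
 *         [SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT] = 10000,
 *         [SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT] = 100,
 *         [SPDK_BDEV_QOS_R_BPS_RATE_LIMIT] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED,
 *         [SPDK_BDEV_QOS_W_BPS_RATE_LIMIT] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED,
 *     };
 *
 *     spdk_bdev_set_qos_rate_limits(bdev, limits, qos_done, qos_ctx);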
*/ 9146 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9147 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9148 _bdev_io_submit(bdev_io); 9149 } 9150 9151 spdk_bdev_for_each_channel_continue(i, 0); 9152 } 9153 9154 static void 9155 bdev_update_qos_rate_limit_msg(void *cb_arg) 9156 { 9157 struct set_qos_limit_ctx *ctx = cb_arg; 9158 struct spdk_bdev *bdev = ctx->bdev; 9159 9160 spdk_spin_lock(&bdev->internal.spinlock); 9161 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9162 spdk_spin_unlock(&bdev->internal.spinlock); 9163 9164 bdev_set_qos_limit_done(ctx, 0); 9165 } 9166 9167 static void 9168 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9169 struct spdk_io_channel *ch, void *_ctx) 9170 { 9171 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9172 9173 spdk_spin_lock(&bdev->internal.spinlock); 9174 bdev_enable_qos(bdev, bdev_ch); 9175 spdk_spin_unlock(&bdev->internal.spinlock); 9176 spdk_bdev_for_each_channel_continue(i, 0); 9177 } 9178 9179 static void 9180 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9181 { 9182 struct set_qos_limit_ctx *ctx = _ctx; 9183 9184 bdev_set_qos_limit_done(ctx, status); 9185 } 9186 9187 static void 9188 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9189 { 9190 int i; 9191 9192 assert(bdev->internal.qos != NULL); 9193 9194 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9195 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9196 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9197 9198 if (limits[i] == 0) { 9199 bdev->internal.qos->rate_limits[i].limit = 9200 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9201 } 9202 } 9203 } 9204 } 9205 9206 void 9207 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9208 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9209 { 9210 struct set_qos_limit_ctx *ctx; 9211 uint32_t limit_set_complement; 9212 uint64_t min_limit_per_sec; 9213 int i; 9214 bool disable_rate_limit = true; 9215 9216 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9217 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9218 continue; 9219 } 9220 9221 if (limits[i] > 0) { 9222 disable_rate_limit = false; 9223 } 9224 9225 if (bdev_qos_is_iops_rate_limit(i) == true) { 9226 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9227 } else { 9228 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9229 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9230 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9231 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9232 } 9233 /* Change from megabyte to byte rate limit */ 9234 limits[i] = limits[i] * 1024 * 1024; 9235 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9236 } 9237 9238 limit_set_complement = limits[i] % min_limit_per_sec; 9239 if (limit_set_complement) { 9240 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9241 limits[i], min_limit_per_sec); 9242 limits[i] += min_limit_per_sec - limit_set_complement; 9243 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9244 } 9245 } 9246 9247 ctx = calloc(1, sizeof(*ctx)); 9248 if (ctx == NULL) { 9249 cb_fn(cb_arg, -ENOMEM); 9250 return; 9251 } 9252 9253 ctx->cb_fn = cb_fn; 9254 ctx->cb_arg = cb_arg; 9255 ctx->bdev = bdev; 9256 9257 spdk_spin_lock(&bdev->internal.spinlock); 9258 if (bdev->internal.qos_mod_in_progress) { 9259 spdk_spin_unlock(&bdev->internal.spinlock); 9260 free(ctx); 9261 cb_fn(cb_arg, 
-EAGAIN); 9262 return; 9263 } 9264 bdev->internal.qos_mod_in_progress = true; 9265 9266 if (disable_rate_limit == true && bdev->internal.qos) { 9267 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9268 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9269 (bdev->internal.qos->rate_limits[i].limit > 0 && 9270 bdev->internal.qos->rate_limits[i].limit != 9271 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9272 disable_rate_limit = false; 9273 break; 9274 } 9275 } 9276 } 9277 9278 if (disable_rate_limit == false) { 9279 if (bdev->internal.qos == NULL) { 9280 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9281 if (!bdev->internal.qos) { 9282 spdk_spin_unlock(&bdev->internal.spinlock); 9283 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9284 bdev_set_qos_limit_done(ctx, -ENOMEM); 9285 return; 9286 } 9287 } 9288 9289 if (bdev->internal.qos->thread == NULL) { 9290 /* Enabling */ 9291 bdev_set_qos_rate_limits(bdev, limits); 9292 9293 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9294 bdev_enable_qos_done); 9295 } else { 9296 /* Updating */ 9297 bdev_set_qos_rate_limits(bdev, limits); 9298 9299 spdk_thread_send_msg(bdev->internal.qos->thread, 9300 bdev_update_qos_rate_limit_msg, ctx); 9301 } 9302 } else { 9303 if (bdev->internal.qos != NULL) { 9304 bdev_set_qos_rate_limits(bdev, limits); 9305 9306 /* Disabling */ 9307 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9308 bdev_disable_qos_msg_done); 9309 } else { 9310 spdk_spin_unlock(&bdev->internal.spinlock); 9311 bdev_set_qos_limit_done(ctx, 0); 9312 return; 9313 } 9314 } 9315 9316 spdk_spin_unlock(&bdev->internal.spinlock); 9317 } 9318 9319 struct spdk_bdev_histogram_ctx { 9320 spdk_bdev_histogram_status_cb cb_fn; 9321 void *cb_arg; 9322 struct spdk_bdev *bdev; 9323 int status; 9324 }; 9325 9326 static void 9327 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9328 { 9329 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9330 9331 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9332 ctx->bdev->internal.histogram_in_progress = false; 9333 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9334 ctx->cb_fn(ctx->cb_arg, ctx->status); 9335 free(ctx); 9336 } 9337 9338 static void 9339 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9340 struct spdk_io_channel *_ch, void *_ctx) 9341 { 9342 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9343 9344 if (ch->histogram != NULL) { 9345 spdk_histogram_data_free(ch->histogram); 9346 ch->histogram = NULL; 9347 } 9348 spdk_bdev_for_each_channel_continue(i, 0); 9349 } 9350 9351 static void 9352 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9353 { 9354 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9355 9356 if (status != 0) { 9357 ctx->status = status; 9358 ctx->bdev->internal.histogram_enabled = false; 9359 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9360 bdev_histogram_disable_channel_cb); 9361 } else { 9362 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9363 ctx->bdev->internal.histogram_in_progress = false; 9364 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9365 ctx->cb_fn(ctx->cb_arg, ctx->status); 9366 free(ctx); 9367 } 9368 } 9369 9370 static void 9371 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9372 struct spdk_io_channel *_ch, void *_ctx) 9373 { 9374 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9375 int status = 0; 9376 9377 if (ch->histogram == NULL) { 9378 
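/* Lazily allocate this channel's histogram. If the allocation fails, the non-zero
 * status propagates through the for_each_channel completion and
 * bdev_histogram_enable_channel_cb tears down the histograms that were allocated
 * on the other channels.
 */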
ch->histogram = spdk_histogram_data_alloc(); 9379 if (ch->histogram == NULL) { 9380 status = -ENOMEM; 9381 } 9382 } 9383 9384 spdk_bdev_for_each_channel_continue(i, status); 9385 } 9386 9387 void 9388 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9389 void *cb_arg, bool enable) 9390 { 9391 struct spdk_bdev_histogram_ctx *ctx; 9392 9393 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9394 if (ctx == NULL) { 9395 cb_fn(cb_arg, -ENOMEM); 9396 return; 9397 } 9398 9399 ctx->bdev = bdev; 9400 ctx->status = 0; 9401 ctx->cb_fn = cb_fn; 9402 ctx->cb_arg = cb_arg; 9403 9404 spdk_spin_lock(&bdev->internal.spinlock); 9405 if (bdev->internal.histogram_in_progress) { 9406 spdk_spin_unlock(&bdev->internal.spinlock); 9407 free(ctx); 9408 cb_fn(cb_arg, -EAGAIN); 9409 return; 9410 } 9411 9412 bdev->internal.histogram_in_progress = true; 9413 spdk_spin_unlock(&bdev->internal.spinlock); 9414 9415 bdev->internal.histogram_enabled = enable; 9416 9417 if (enable) { 9418 /* Allocate histogram for each channel */ 9419 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9420 bdev_histogram_enable_channel_cb); 9421 } else { 9422 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9423 bdev_histogram_disable_channel_cb); 9424 } 9425 } 9426 9427 struct spdk_bdev_histogram_data_ctx { 9428 spdk_bdev_histogram_data_cb cb_fn; 9429 void *cb_arg; 9430 struct spdk_bdev *bdev; 9431 /** merged histogram data from all channels */ 9432 struct spdk_histogram_data *histogram; 9433 }; 9434 9435 static void 9436 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9437 { 9438 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9439 9440 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9441 free(ctx); 9442 } 9443 9444 static void 9445 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9446 struct spdk_io_channel *_ch, void *_ctx) 9447 { 9448 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9449 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9450 int status = 0; 9451 9452 if (ch->histogram == NULL) { 9453 status = -EFAULT; 9454 } else { 9455 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9456 } 9457 9458 spdk_bdev_for_each_channel_continue(i, status); 9459 } 9460 9461 void 9462 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9463 spdk_bdev_histogram_data_cb cb_fn, 9464 void *cb_arg) 9465 { 9466 struct spdk_bdev_histogram_data_ctx *ctx; 9467 9468 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9469 if (ctx == NULL) { 9470 cb_fn(cb_arg, -ENOMEM, NULL); 9471 return; 9472 } 9473 9474 ctx->bdev = bdev; 9475 ctx->cb_fn = cb_fn; 9476 ctx->cb_arg = cb_arg; 9477 9478 ctx->histogram = histogram; 9479 9480 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9481 bdev_histogram_get_channel_cb); 9482 } 9483 9484 void 9485 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9486 void *cb_arg) 9487 { 9488 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9489 int status = 0; 9490 9491 assert(cb_fn != NULL); 9492 9493 if (bdev_ch->histogram == NULL) { 9494 status = -EFAULT; 9495 } 9496 cb_fn(cb_arg, status, bdev_ch->histogram); 9497 } 9498 9499 size_t 9500 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9501 size_t max_events) 9502 { 9503 struct media_event_entry *entry; 9504 size_t num_events = 0; 9505 9506 for (; num_events < 
max_events; ++num_events) { 9507 entry = TAILQ_FIRST(&desc->pending_media_events); 9508 if (entry == NULL) { 9509 break; 9510 } 9511 9512 events[num_events] = entry->event; 9513 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9514 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9515 } 9516 9517 return num_events; 9518 } 9519 9520 int 9521 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9522 size_t num_events) 9523 { 9524 struct spdk_bdev_desc *desc; 9525 struct media_event_entry *entry; 9526 size_t event_id; 9527 int rc = 0; 9528 9529 assert(bdev->media_events); 9530 9531 spdk_spin_lock(&bdev->internal.spinlock); 9532 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9533 if (desc->write) { 9534 break; 9535 } 9536 } 9537 9538 if (desc == NULL || desc->media_events_buffer == NULL) { 9539 rc = -ENODEV; 9540 goto out; 9541 } 9542 9543 for (event_id = 0; event_id < num_events; ++event_id) { 9544 entry = TAILQ_FIRST(&desc->free_media_events); 9545 if (entry == NULL) { 9546 break; 9547 } 9548 9549 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9550 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9551 entry->event = events[event_id]; 9552 } 9553 9554 rc = event_id; 9555 out: 9556 spdk_spin_unlock(&bdev->internal.spinlock); 9557 return rc; 9558 } 9559 9560 static void 9561 _media_management_notify(void *arg) 9562 { 9563 struct spdk_bdev_desc *desc = arg; 9564 9565 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9566 } 9567 9568 void 9569 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9570 { 9571 struct spdk_bdev_desc *desc; 9572 9573 spdk_spin_lock(&bdev->internal.spinlock); 9574 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9575 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9576 event_notify(desc, _media_management_notify); 9577 } 9578 } 9579 spdk_spin_unlock(&bdev->internal.spinlock); 9580 } 9581 9582 struct locked_lba_range_ctx { 9583 struct lba_range range; 9584 struct lba_range *current_range; 9585 struct lba_range *owner_range; 9586 struct spdk_poller *poller; 9587 lock_range_cb cb_fn; 9588 void *cb_arg; 9589 }; 9590 9591 static void 9592 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9593 { 9594 struct locked_lba_range_ctx *ctx = _ctx; 9595 9596 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9597 free(ctx); 9598 } 9599 9600 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9601 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9602 9603 static void 9604 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9605 { 9606 struct locked_lba_range_ctx *ctx = _ctx; 9607 9608 if (status == -ENOMEM) { 9609 /* One of the channels could not allocate a range object. 9610 * So we have to go back and clean up any ranges that were 9611 * allocated successfully before we return error status to 9612 * the caller. We can reuse the unlock function to do that 9613 * clean up. 9614 */ 9615 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9616 bdev_lock_error_cleanup_cb); 9617 return; 9618 } 9619 9620 /* All channels have locked this range and no I/O overlapping the range 9621 * are outstanding! Set the owner_ch for the range object for the 9622 * locking channel, so that this channel will know that it is allowed 9623 * to write to this range. 
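* Overlapping I/O submitted through any other channel will be parked on that
* channel's io_locked list and re-driven once the range is unlocked.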
9624 */ 9625 if (ctx->owner_range != NULL) { 9626 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9627 } 9628 9629 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9630 9631 /* Don't free the ctx here. Its range is in the bdev's global list of 9632 * locked ranges still, and will be removed and freed when this range 9633 * is later unlocked. 9634 */ 9635 } 9636 9637 static int 9638 bdev_lock_lba_range_check_io(void *_i) 9639 { 9640 struct spdk_bdev_channel_iter *i = _i; 9641 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9642 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9643 struct locked_lba_range_ctx *ctx = i->ctx; 9644 struct lba_range *range = ctx->current_range; 9645 struct spdk_bdev_io *bdev_io; 9646 9647 spdk_poller_unregister(&ctx->poller); 9648 9649 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9650 * range. But we need to wait until any outstanding IO overlapping with this range 9651 * are completed. 9652 */ 9653 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9654 if (bdev_io_range_is_locked(bdev_io, range)) { 9655 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9656 return SPDK_POLLER_BUSY; 9657 } 9658 } 9659 9660 spdk_bdev_for_each_channel_continue(i, 0); 9661 return SPDK_POLLER_BUSY; 9662 } 9663 9664 static void 9665 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9666 struct spdk_io_channel *_ch, void *_ctx) 9667 { 9668 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9669 struct locked_lba_range_ctx *ctx = _ctx; 9670 struct lba_range *range; 9671 9672 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9673 if (range->length == ctx->range.length && 9674 range->offset == ctx->range.offset && 9675 range->locked_ctx == ctx->range.locked_ctx) { 9676 /* This range already exists on this channel, so don't add 9677 * it again. This can happen when a new channel is created 9678 * while the for_each_channel operation is in progress. 9679 * Do not check for outstanding I/O in that case, since the 9680 * range was locked before any I/O could be submitted to the 9681 * new channel. 9682 */ 9683 spdk_bdev_for_each_channel_continue(i, 0); 9684 return; 9685 } 9686 } 9687 9688 range = calloc(1, sizeof(*range)); 9689 if (range == NULL) { 9690 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9691 return; 9692 } 9693 9694 range->length = ctx->range.length; 9695 range->offset = ctx->range.offset; 9696 range->locked_ctx = ctx->range.locked_ctx; 9697 range->quiesce = ctx->range.quiesce; 9698 ctx->current_range = range; 9699 if (ctx->range.owner_ch == ch) { 9700 /* This is the range object for the channel that will hold 9701 * the lock. Store it in the ctx object so that we can easily 9702 * set its owner_ch after the lock is finally acquired. 9703 */ 9704 ctx->owner_range = range; 9705 } 9706 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9707 bdev_lock_lba_range_check_io(i); 9708 } 9709 9710 static void 9711 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9712 { 9713 assert(spdk_get_thread() == ctx->range.owner_thread); 9714 assert(ctx->range.owner_ch == NULL || 9715 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 9716 9717 /* We will add a copy of this range to each channel now. 
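* Each channel keeps its own copy on ch->locked_ranges so that the submit path can
* detect conflicts against the lock without taking the bdev-level spinlock.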
*/ 9718 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9719 bdev_lock_lba_range_cb); 9720 } 9721 9722 static bool 9723 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9724 { 9725 struct lba_range *r; 9726 9727 TAILQ_FOREACH(r, tailq, tailq) { 9728 if (bdev_lba_range_overlapped(range, r)) { 9729 return true; 9730 } 9731 } 9732 return false; 9733 } 9734 9735 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status); 9736 9737 static int 9738 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 9739 uint64_t offset, uint64_t length, 9740 lock_range_cb cb_fn, void *cb_arg) 9741 { 9742 struct locked_lba_range_ctx *ctx; 9743 9744 ctx = calloc(1, sizeof(*ctx)); 9745 if (ctx == NULL) { 9746 return -ENOMEM; 9747 } 9748 9749 ctx->range.offset = offset; 9750 ctx->range.length = length; 9751 ctx->range.owner_thread = spdk_get_thread(); 9752 ctx->range.owner_ch = ch; 9753 ctx->range.locked_ctx = cb_arg; 9754 ctx->range.bdev = bdev; 9755 ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked); 9756 ctx->cb_fn = cb_fn; 9757 ctx->cb_arg = cb_arg; 9758 9759 spdk_spin_lock(&bdev->internal.spinlock); 9760 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 9761 /* There is an active lock overlapping with this range. 9762 * Put it on the pending list until this range no 9763 * longer overlaps with another. 9764 */ 9765 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 9766 } else { 9767 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 9768 bdev_lock_lba_range_ctx(bdev, ctx); 9769 } 9770 spdk_spin_unlock(&bdev->internal.spinlock); 9771 return 0; 9772 } 9773 9774 static int 9775 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9776 uint64_t offset, uint64_t length, 9777 lock_range_cb cb_fn, void *cb_arg) 9778 { 9779 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9780 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9781 9782 if (cb_arg == NULL) { 9783 SPDK_ERRLOG("cb_arg must not be NULL\n"); 9784 return -EINVAL; 9785 } 9786 9787 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 9788 } 9789 9790 static void 9791 bdev_lock_lba_range_ctx_msg(void *_ctx) 9792 { 9793 struct locked_lba_range_ctx *ctx = _ctx; 9794 9795 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 9796 } 9797 9798 static void 9799 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9800 { 9801 struct locked_lba_range_ctx *ctx = _ctx; 9802 struct locked_lba_range_ctx *pending_ctx; 9803 struct lba_range *range, *tmp; 9804 9805 spdk_spin_lock(&bdev->internal.spinlock); 9806 /* Check if there are any pending locked ranges that overlap with this range 9807 * that was just unlocked. If there are, check that it doesn't overlap with any 9808 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 9809 * the lock process. 
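* The pending lock is restarted on its original owner thread via spdk_thread_send_msg,
* since bdev_lock_lba_range_ctx asserts that it runs on range.owner_thread.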
9810 */ 9811 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 9812 if (bdev_lba_range_overlapped(range, &ctx->range) && 9813 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 9814 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 9815 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9816 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 9817 spdk_thread_send_msg(pending_ctx->range.owner_thread, 9818 bdev_lock_lba_range_ctx_msg, pending_ctx); 9819 } 9820 } 9821 spdk_spin_unlock(&bdev->internal.spinlock); 9822 9823 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9824 free(ctx); 9825 } 9826 9827 static void 9828 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9829 struct spdk_io_channel *_ch, void *_ctx) 9830 { 9831 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9832 struct locked_lba_range_ctx *ctx = _ctx; 9833 TAILQ_HEAD(, spdk_bdev_io) io_locked; 9834 struct spdk_bdev_io *bdev_io; 9835 struct lba_range *range; 9836 9837 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9838 if (ctx->range.offset == range->offset && 9839 ctx->range.length == range->length && 9840 ctx->range.locked_ctx == range->locked_ctx) { 9841 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 9842 free(range); 9843 break; 9844 } 9845 } 9846 9847 /* Note: we should almost always be able to assert that the range specified 9848 * was found. But there are some very rare corner cases where a new channel 9849 * gets created simultaneously with a range unlock, where this function 9850 * would execute on that new channel and wouldn't have the range. 9851 * We also use this to clean up range allocations when a later allocation 9852 * fails in the locking path. 9853 * So we can't actually assert() here. 9854 */ 9855 9856 /* Swap the locked IO into a temporary list, and then try to submit them again. 9857 * We could hyper-optimize this to only resubmit locked I/O that overlap 9858 * with the range that was just unlocked, but this isn't a performance path so 9859 * we go for simplicity here. 9860 */ 9861 TAILQ_INIT(&io_locked); 9862 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 9863 while (!TAILQ_EMPTY(&io_locked)) { 9864 bdev_io = TAILQ_FIRST(&io_locked); 9865 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 9866 bdev_io_submit(bdev_io); 9867 } 9868 9869 spdk_bdev_for_each_channel_continue(i, 0); 9870 } 9871 9872 static int 9873 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 9874 lock_range_cb cb_fn, void *cb_arg) 9875 { 9876 struct locked_lba_range_ctx *ctx; 9877 struct lba_range *range; 9878 9879 spdk_spin_lock(&bdev->internal.spinlock); 9880 /* To start the unlock the process, we find the range in the bdev's locked_ranges 9881 * and remove it. This ensures new channels don't inherit the locked range. 9882 * Then we will send a message to each channel to remove the range from its 9883 * per-channel list. 
9884 */ 9885 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 9886 if (range->offset == offset && range->length == length && 9887 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 9888 break; 9889 } 9890 } 9891 if (range == NULL) { 9892 assert(false); 9893 spdk_spin_unlock(&bdev->internal.spinlock); 9894 return -EINVAL; 9895 } 9896 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 9897 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9898 spdk_spin_unlock(&bdev->internal.spinlock); 9899 9900 ctx->cb_fn = cb_fn; 9901 ctx->cb_arg = cb_arg; 9902 9903 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9904 bdev_unlock_lba_range_cb); 9905 return 0; 9906 } 9907 9908 static int 9909 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9910 uint64_t offset, uint64_t length, 9911 lock_range_cb cb_fn, void *cb_arg) 9912 { 9913 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9914 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9915 struct lba_range *range; 9916 bool range_found = false; 9917 9918 /* Let's make sure the specified channel actually has a lock on 9919 * the specified range. Note that the range must match exactly. 9920 */ 9921 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9922 if (range->offset == offset && range->length == length && 9923 range->owner_ch == ch && range->locked_ctx == cb_arg) { 9924 range_found = true; 9925 break; 9926 } 9927 } 9928 9929 if (!range_found) { 9930 return -EINVAL; 9931 } 9932 9933 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 9934 } 9935 9936 struct bdev_quiesce_ctx { 9937 spdk_bdev_quiesce_cb cb_fn; 9938 void *cb_arg; 9939 }; 9940 9941 static void 9942 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 9943 { 9944 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9945 9946 if (quiesce_ctx->cb_fn != NULL) { 9947 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9948 } 9949 9950 free(quiesce_ctx); 9951 } 9952 9953 static void 9954 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 9955 { 9956 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9957 struct spdk_bdev_module *module = range->bdev->module; 9958 9959 if (status != 0) { 9960 if (quiesce_ctx->cb_fn != NULL) { 9961 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9962 } 9963 free(quiesce_ctx); 9964 return; 9965 } 9966 9967 spdk_spin_lock(&module->internal.spinlock); 9968 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 9969 spdk_spin_unlock(&module->internal.spinlock); 9970 9971 if (quiesce_ctx->cb_fn != NULL) { 9972 /* copy the context in case the range is unlocked by the callback */ 9973 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 9974 9975 quiesce_ctx->cb_fn = NULL; 9976 quiesce_ctx->cb_arg = NULL; 9977 9978 tmp.cb_fn(tmp.cb_arg, status); 9979 } 9980 /* quiesce_ctx will be freed on unquiesce */ 9981 } 9982 9983 static int 9984 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9985 uint64_t offset, uint64_t length, 9986 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 9987 bool unquiesce) 9988 { 9989 struct bdev_quiesce_ctx *quiesce_ctx; 9990 int rc; 9991 9992 if (module != bdev->module) { 9993 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 9994 return -EINVAL; 9995 } 9996 9997 if (!bdev_io_valid_blocks(bdev, offset, length)) { 9998 return -EINVAL; 9999 } 10000 10001 if (unquiesce) { 10002 struct lba_range *range; 10003 10004 /* Make sure the specified range is actually 
quiesced in the specified module and 10005 * then remove it from the list. Note that the range must match exactly. 10006 */ 10007 spdk_spin_lock(&module->internal.spinlock); 10008 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10009 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10010 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10011 break; 10012 } 10013 } 10014 spdk_spin_unlock(&module->internal.spinlock); 10015 10016 if (range == NULL) { 10017 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10018 return -EINVAL; 10019 } 10020 10021 quiesce_ctx = range->locked_ctx; 10022 quiesce_ctx->cb_fn = cb_fn; 10023 quiesce_ctx->cb_arg = cb_arg; 10024 10025 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10026 } else { 10027 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10028 if (quiesce_ctx == NULL) { 10029 return -ENOMEM; 10030 } 10031 10032 quiesce_ctx->cb_fn = cb_fn; 10033 quiesce_ctx->cb_arg = cb_arg; 10034 10035 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10036 if (rc != 0) { 10037 free(quiesce_ctx); 10038 } 10039 } 10040 10041 return rc; 10042 } 10043 10044 int 10045 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10046 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10047 { 10048 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10049 } 10050 10051 int 10052 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10053 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10054 { 10055 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10056 } 10057 10058 int 10059 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10060 uint64_t offset, uint64_t length, 10061 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10062 { 10063 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10064 } 10065 10066 int 10067 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10068 uint64_t offset, uint64_t length, 10069 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10070 { 10071 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10072 } 10073 10074 int 10075 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10076 int array_size) 10077 { 10078 if (!bdev) { 10079 return -EINVAL; 10080 } 10081 10082 if (bdev->fn_table->get_memory_domains) { 10083 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10084 } 10085 10086 return 0; 10087 } 10088 10089 struct spdk_bdev_for_each_io_ctx { 10090 void *ctx; 10091 spdk_bdev_io_fn fn; 10092 spdk_bdev_for_each_io_cb cb; 10093 }; 10094 10095 static void 10096 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10097 struct spdk_io_channel *io_ch, void *_ctx) 10098 { 10099 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10100 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10101 struct spdk_bdev_io *bdev_io; 10102 int rc = 0; 10103 10104 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10105 rc = ctx->fn(ctx->ctx, bdev_io); 10106 if (rc != 0) { 10107 break; 10108 } 10109 } 10110 10111 spdk_bdev_for_each_channel_continue(i, rc); 10112 } 10113 10114 static void 10115 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10116 { 10117 struct spdk_bdev_for_each_io_ctx 
*ctx = _ctx; 10118 10119 ctx->cb(ctx->ctx, status); 10120 10121 free(ctx); 10122 } 10123 10124 void 10125 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10126 spdk_bdev_for_each_io_cb cb) 10127 { 10128 struct spdk_bdev_for_each_io_ctx *ctx; 10129 10130 assert(fn != NULL && cb != NULL); 10131 10132 ctx = calloc(1, sizeof(*ctx)); 10133 if (ctx == NULL) { 10134 SPDK_ERRLOG("Failed to allocate context.\n"); 10135 cb(_ctx, -ENOMEM); 10136 return; 10137 } 10138 10139 ctx->ctx = _ctx; 10140 ctx->fn = fn; 10141 ctx->cb = cb; 10142 10143 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10144 bdev_for_each_io_done); 10145 } 10146 10147 void 10148 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10149 { 10150 spdk_for_each_channel_continue(iter->i, status); 10151 } 10152 10153 static struct spdk_bdev * 10154 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10155 { 10156 void *io_device = spdk_io_channel_iter_get_io_device(i); 10157 10158 return __bdev_from_io_dev(io_device); 10159 } 10160 10161 static void 10162 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10163 { 10164 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10165 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10166 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10167 10168 iter->i = i; 10169 iter->fn(iter, bdev, ch, iter->ctx); 10170 } 10171 10172 static void 10173 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10174 { 10175 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10176 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10177 10178 iter->i = i; 10179 iter->cpl(bdev, iter->ctx, status); 10180 10181 free(iter); 10182 } 10183 10184 void 10185 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10186 void *ctx, spdk_bdev_for_each_channel_done cpl) 10187 { 10188 struct spdk_bdev_channel_iter *iter; 10189 10190 assert(bdev != NULL && fn != NULL && ctx != NULL); 10191 10192 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10193 if (iter == NULL) { 10194 SPDK_ERRLOG("Unable to allocate iterator\n"); 10195 assert(false); 10196 return; 10197 } 10198 10199 iter->fn = fn; 10200 iter->cpl = cpl; 10201 iter->ctx = ctx; 10202 10203 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10204 iter, bdev_each_channel_cpl); 10205 } 10206 10207 static void 10208 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10209 { 10210 struct spdk_bdev_io *parent_io = cb_arg; 10211 10212 spdk_bdev_free_io(bdev_io); 10213 10214 /* Check return status of write */ 10215 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10216 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10217 } 10218 10219 static void 10220 bdev_copy_do_write(void *_bdev_io) 10221 { 10222 struct spdk_bdev_io *bdev_io = _bdev_io; 10223 int rc; 10224 10225 /* Write blocks */ 10226 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10227 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10228 bdev_io->u.bdev.iovs[0].iov_base, 10229 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10230 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10231 10232 if (rc == -ENOMEM) { 10233 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10234 } else if (rc != 0) { 10235 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10236 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10237 } 10238 } 10239 10240 static void 10241 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10242 { 10243 struct spdk_bdev_io *parent_io = cb_arg; 10244 10245 spdk_bdev_free_io(bdev_io); 10246 10247 /* Check return status of read */ 10248 if (!success) { 10249 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10250 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10251 return; 10252 } 10253 10254 /* Do write */ 10255 bdev_copy_do_write(parent_io); 10256 } 10257 10258 static void 10259 bdev_copy_do_read(void *_bdev_io) 10260 { 10261 struct spdk_bdev_io *bdev_io = _bdev_io; 10262 int rc; 10263 10264 /* Read blocks */ 10265 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10266 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10267 bdev_io->u.bdev.iovs[0].iov_base, 10268 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10269 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10270 10271 if (rc == -ENOMEM) { 10272 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10273 } else if (rc != 0) { 10274 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10275 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10276 } 10277 } 10278 10279 static void 10280 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10281 { 10282 if (!success) { 10283 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10284 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10285 return; 10286 } 10287 10288 bdev_copy_do_read(bdev_io); 10289 } 10290 10291 int 10292 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10293 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10294 spdk_bdev_io_completion_cb cb, void *cb_arg) 10295 { 10296 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10297 struct spdk_bdev_io *bdev_io; 10298 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10299 10300 if (!desc->write) { 10301 return -EBADF; 10302 } 10303 10304 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10305 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10306 SPDK_DEBUGLOG(bdev, 10307 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10308 dst_offset_blocks, src_offset_blocks, num_blocks); 10309 return -EINVAL; 10310 } 10311 10312 bdev_io = bdev_channel_get_io(channel); 10313 if (!bdev_io) { 10314 return -ENOMEM; 10315 } 10316 10317 bdev_io->internal.ch = channel; 10318 bdev_io->internal.desc = desc; 10319 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10320 10321 
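/* For a copy request, offset_blocks carries the destination offset; the source
 * offset travels separately in u.bdev.copy.src_offset_blocks.
 */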
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10322 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10323 bdev_io->u.bdev.num_blocks = num_blocks; 10324 bdev_io->u.bdev.memory_domain = NULL; 10325 bdev_io->u.bdev.memory_domain_ctx = NULL; 10326 bdev_io->u.bdev.iovs = NULL; 10327 bdev_io->u.bdev.iovcnt = 0; 10328 bdev_io->u.bdev.md_buf = NULL; 10329 bdev_io->u.bdev.accel_sequence = NULL; 10330 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10331 10332 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 10333 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 10334 return 0; 10335 } 10336 10337 10338 /* If the copy size is large and should be split, use the generic split logic 10339 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10340 * 10341 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10342 * emulate it using regular read and write requests otherwise. 10343 */ 10344 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10345 bdev_io->internal.split) { 10346 bdev_io_submit(bdev_io); 10347 return 0; 10348 } 10349 10350 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10351 10352 return 0; 10353 } 10354 10355 SPDK_LOG_REGISTER_COMPONENT(bdev) 10356 10357 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10358 { 10359 struct spdk_trace_tpoint_opts opts[] = { 10360 { 10361 "BDEV_IO_START", TRACE_BDEV_IO_START, 10362 OWNER_BDEV, OBJECT_BDEV_IO, 1, 10363 { 10364 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10365 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10366 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10367 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10368 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 10369 } 10370 }, 10371 { 10372 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10373 OWNER_BDEV, OBJECT_BDEV_IO, 0, 10374 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 10375 }, 10376 { 10377 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10378 OWNER_BDEV, OBJECT_NONE, 1, 10379 { 10380 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 10381 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 10382 } 10383 }, 10384 { 10385 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10386 OWNER_BDEV, OBJECT_NONE, 0, 10387 { 10388 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 10389 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 10390 } 10391 }, 10392 }; 10393 10394 10395 spdk_trace_register_owner(OWNER_BDEV, 'b'); 10396 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10397 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10398 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10399 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10400 } 10401
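/*
 * Illustrative usage sketch, not part of the bdev library and intentionally left
 * uncompiled. It shows how a caller might drive two of the APIs defined above:
 * spdk_bdev_set_qos_rate_limits() and spdk_bdev_copy_blocks(). The example_* names
 * are hypothetical, SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT is assumed to be the public
 * enumerator for the rw_ios_per_sec limit, and SPDK_BDEV_QOS_LIMIT_NOT_DEFINED is
 * local to this file, so code outside of it would pass UINT64_MAX for limits it
 * does not want to change.
 */
#if 0
static void
example_qos_done(void *cb_arg, int status)
{
	SPDK_NOTICELOG("QoS rate limit update finished with status %d\n", status);
}

static void
example_set_rw_iops_limit(struct spdk_bdev *bdev)
{
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
	int i;

	/* Leave every limit we do not want to change undefined. */
	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		limits[i] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
	}

	/* Cap read/write IOPS at 10000; the value must be a multiple of
	 * SPDK_BDEV_QOS_MIN_IOS_PER_SEC or it will be rounded up.
	 */
	limits[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT] = 10000;

	spdk_bdev_set_qos_rate_limits(bdev, limits, example_qos_done, NULL);
}

static void
example_copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	SPDK_NOTICELOG("copy %s\n", success ? "succeeded" : "failed");
	spdk_bdev_free_io(bdev_io);
}

static void
example_copy(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
{
	int rc;

	/* Copy 16 blocks from block 0 to block 1024 on the same bdev. */
	rc = spdk_bdev_copy_blocks(desc, ch, 1024, 0, 16, example_copy_done, NULL);
	if (rc == -ENOMEM) {
		/* A real caller would queue and retry via spdk_bdev_queue_io_wait(). */
	} else if (rc != 0) {
		SPDK_ERRLOG("failed to submit copy: %d\n", rc);
	}
}
#endif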