/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2016 Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *   Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE 256
#define SPDK_BDEV_AUTO_EXAMINE true
#define BUF_SMALL_CACHE_SIZE 128
#define BUF_LARGE_CACHE_SIZE 16
#define NOMEM_THRESHOLD_COUNT 8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024)
#define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC (UINT64_MAX / (1024 * 1024))
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	struct spdk_bdev *bdev;
	uint64_t offset;
	uint64_t length;
	bool quiesce;
	void *locked_ctx;
	struct spdk_thread *owner_thread;
	struct spdk_bdev_channel *owner_ch;
	TAILQ_ENTRY(lba_range) tailq;
	TAILQ_ENTRY(lba_range) tailq_module;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE,
	.iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE,
};

static spdk_bdev_init_cb g_init_cb_fn = NULL;
static void *g_init_cb_arg = NULL;

static spdk_bdev_fini_cb g_fini_cb_fn = NULL;
static void *g_fini_cb_arg = NULL;
static struct spdk_thread *g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO.
	 * If the IO is allowed to pass, the quota will be reduced correspondingly.
	 */
	bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to rewind the quota once the IO was allowed to be sent by this
	 * limit but queued due to one of the further limits.
	 */
	void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one for each type of rate limit. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache. Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * will queue their I/O awaiting retry here. This makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	struct spdk_poller *nomem_poller;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS (1 << 0)
#define BDEV_CH_QOS_ENABLED (1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Accel channel */
	struct spdk_io_channel *accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t io_locked;

	/* List of I/Os with accel sequence being currently executed */
	bdev_io_tailq_t io_accel_exec;

	/* List of I/Os doing memory domain pull/push */
	bdev_io_tailq_t io_memory_domain;

	uint32_t flags;

	/* Counts number of bdev_io in the io_submitted TAILQ */
	uint16_t queue_depth;

	uint16_t trace_id;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t start_tsc;
	uint64_t interval_tsc;
	__itt_string_handle *handle;
	struct spdk_bdev_io_stat *prev_stat;
#endif

	lba_range_tailq_t locked_ranges;

	/** List of I/Os queued by QoS. */
	bdev_io_tailq_t qos_queued_io;
};

struct media_event_entry {
	struct spdk_bdev_media_event event;
	TAILQ_ENTRY(media_event_entry) tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev *bdev;
	bool write;
	bool memory_domains_supported;
	bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
	struct spdk_bdev_open_opts opts;
	struct spdk_thread *thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	} callback;
	bool closed;
	struct spdk_spinlock spinlock;
	uint32_t refs;
	TAILQ_HEAD(, media_event_entry) pending_media_events;
	TAILQ_HEAD(, media_event_entry) free_media_events;
	struct media_event_entry *media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc) link;

	uint64_t timeout_in_sec;
	spdk_bdev_io_timeout_cb cb_fn;
	void *cb_arg;
	struct spdk_poller *io_timeout_poller;
	struct spdk_bdev_module_claim *claim;
};
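
/*
 * Illustrative sketch of how the descriptor above is normally obtained and used.  This is
 * not part of the bdev library logic; it assumes the public open/close helpers declared in
 * include/spdk/bdev.h (spdk_bdev_open_ext(), spdk_bdev_get_io_channel(), spdk_bdev_close()),
 * and the bdev name and callback names are hypothetical:
 *
 *	static void
 *	my_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
 *	{
 *		// e.g. tear down on SPDK_BDEV_EVENT_REMOVE
 *	}
 *
 *	struct spdk_bdev_desc *desc;
 *	struct spdk_io_channel *ch;
 *
 *	if (spdk_bdev_open_ext("Malloc0", true, my_event_cb, NULL, &desc) == 0) {
 *		ch = spdk_bdev_get_io_channel(desc);
 *		// ... submit I/O using desc + ch ...
 *		spdk_put_io_channel(ch);
 *		spdk_bdev_close(desc);
 *	}
 */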
struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	enum spdk_bdev_reset_stat_mode reset_mode;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

struct spdk_bdev_channel_iter {
	spdk_bdev_for_each_channel_msg fn;
	spdk_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

struct spdk_bdev_io_error_stat {
	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
};

enum bdev_io_retry_state {
	BDEV_IO_RETRY_STATE_INVALID,
	BDEV_IO_RETRY_STATE_PULL,
	BDEV_IO_RETRY_STATE_PULL_MD,
	BDEV_IO_RETRY_STATE_SUBMIT,
	BDEV_IO_RETRY_STATE_PUSH,
	BDEV_IO_RETRY_STATE_PUSH_MD,
};

#define __bdev_to_io_dev(bdev) (((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1))
#define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
#define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))

static inline void bdev_io_complete(void *ctx);
static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);

static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				      uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

static bool bdev_io_should_split(struct spdk_bdev_io *bdev_io);

#define bdev_get_ext_io_opt(opts, field, defval) \
	((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval))

static inline void
bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth++;
}

static inline void
bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth--;
}

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	/* Do not remove this statement; always update it when adding a new field,
	 * and do not forget to add the SET_FIELD statement for your added field.
	 */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}
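
/*
 * Illustrative sketch, not compiled as part of this file: applications tune the bdev layer
 * with a get-modify-set pattern using spdk_bdev_get_opts() above and spdk_bdev_set_opts()
 * below, passing sizeof() so the structure can grow without breaking older callers.  The
 * value shown is only an example; it must satisfy the pool/cache check in
 * spdk_bdev_set_opts():
 *
 *	struct spdk_bdev_opts opts;
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_pool_size = 128 * 1024 - 1;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		// incompatible bdev_io_pool_size / bdev_io_cache_size / thread count
 *	}
 *
 * This is typically done before the bdev subsystem is initialized, since g_bdev_opts is
 * consumed at init time.
 */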
int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}

static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = bdev_get_by_name(bdev_name);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return bdev;
}

struct bdev_io_status_string {
	enum spdk_bdev_io_status status;
	const char *str;
};

static const struct bdev_io_status_string bdev_io_status_strings[] = {
	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
};

static const char *
bdev_io_status_get_string(enum spdk_bdev_io_status status)
{
	uint32_t i;

	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
		if (bdev_io_status_strings[i].status == status) {
			return bdev_io_status_strings[i].str;
		}
	}

	return "reserved";
}
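
/*
 * Illustrative sketch, not compiled as part of this file: spdk_bdev_wait_for_examine()
 * (defined below) lets a caller defer work until every module has finished its examine
 * callbacks.  The callback name is hypothetical:
 *
 *	static void
 *	examine_done(void *ctx)
 *	{
 *		// all registered bdev modules have completed examine at this point
 *	}
 *
 *	if (spdk_bdev_wait_for_examine(examine_done, NULL) != 0) {
 *		// the only failure mode of the implementation below is -ENOMEM
 *	}
 */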
struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller *poller;
	spdk_bdev_wait_for_examine_cb cb_fn;
	void *cb_arg;
};

static bool bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_remove(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
			free(item->name);
			free(item);
			break;
		}
	}
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	/* Some bdevs may not support the READ command.
	 * Do not try to examine them.
	 */
	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) {
		return false;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	struct spdk_bdev_module_claim *claim, *tmpclaim;
	uint32_t action;

	if (!bdev_ok_to_examine(bdev)) {
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config) {
			spdk_spin_lock(&module->internal.spinlock);
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call "
					    "spdk_bdev_module_examine_done()\n", module->name);
			}
		}
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* Examine by all bdev modules */
		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (module->examine_disk) {
				spdk_spin_lock(&module->internal.spinlock);
				module->internal.action_in_progress++;
				spdk_spin_unlock(&module->internal.spinlock);
				spdk_spin_unlock(&bdev->internal.spinlock);
				module->examine_disk(bdev);
				spdk_spin_lock(&bdev->internal.spinlock);
			}
		}
		break;
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Examine by the one bdev module with a v1 claim */
		module = bdev->internal.claim.v1.module;
		if (module->examine_disk) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			return;
		}
		break;
	default:
		/* Examine by all bdev modules with a v2 claim */
		assert(claim_type_is_v2(bdev->internal.claim_type));
		/*
		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
		 * list, perhaps accessing freed memory. Without protection, this could happen
		 * while the lock is dropped during the examine callback.
		 */
		bdev->internal.examine_in_progress++;

		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
			module = claim->module;

			if (module == NULL) {
				/* This is a vestigial claim, held by examine_count */
				continue;
			}

			if (module->examine_disk == NULL) {
				continue;
			}

			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);

			/* Call examine_disk without holding internal.spinlock. */
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			spdk_spin_lock(&bdev->internal.spinlock);
		}

		assert(bdev->internal.examine_in_progress > 0);
		bdev->internal.examine_in_progress--;
		if (bdev->internal.examine_in_progress == 0) {
			/* Remove any claims that were released during examine_disk */
			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
				if (claim->desc != NULL) {
					continue;
				}

				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
				free(claim);
			}
			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
				claim_reset(bdev);
			}
		}
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;
	struct spdk_thread *thread = spdk_get_thread();

	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
			    thread ? spdk_thread_get_name(thread) : "null");
		return -EINVAL;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}
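
/*
 * Illustrative sketch, not compiled as part of this file: the iterators around this point
 * (spdk_bdev_first()/spdk_bdev_next() and spdk_bdev_first_leaf()/spdk_bdev_next_leaf())
 * let an application walk all registered bdevs, or only the unclaimed "leaf" bdevs:
 *
 *	struct spdk_bdev *bdev;
 *
 *	for (bdev = spdk_bdev_first_leaf(); bdev != NULL; bdev = spdk_bdev_next_leaf(bdev)) {
 *		printf("unclaimed bdev: %s\n", spdk_bdev_get_name(bdev));
 *	}
 *
 * These walk g_bdev_mgr.bdevs without taking g_bdev_mgr.spinlock, so callers must not race
 * with bdev registration or unregistration.
 */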
struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static inline bool
bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.f.has_memory_domain;
}

static inline bool
bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.f.has_accel_sequence;
}

static inline uint32_t
bdev_desc_get_block_size(struct spdk_bdev_desc *desc)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);

	if (spdk_unlikely(desc->opts.hide_metadata)) {
		return bdev->blocklen - bdev->md_len;
	} else {
		return bdev->blocklen;
	}
}

static inline uint32_t
bdev_io_get_block_size(struct spdk_bdev_io *bdev_io)
{
	return bdev_desc_get_block_size(bdev_io->internal.desc);
}

static inline void
bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
	 * channels we will instead wait for half to complete.
	 */
	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
}

static inline void
bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}
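
/*
 * Illustrative sketch, not compiled as part of this file, of how the buffer helpers above
 * are typically reached: a bdev module that needs a data buffer for a READ calls the public
 * spdk_bdev_io_get_buf() (declared in include/spdk/bdev.h, implemented later in this file),
 * and the bdev layer either keeps the caller's buffer or allocates one from the iobuf pools
 * and installs it via spdk_bdev_io_set_buf().  The callback and function names here are
 * hypothetical; only the spdk_bdev_io_get_buf() signature is taken from the public API:
 *
 *	static void
 *	my_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
 *	{
 *		if (!success) {
 *			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 *			return;
 *		}
 *		// bdev_io->u.bdev.iovs now points at a buffer aligned per spdk_bdev_get_buf_align()
 *	}
 *
 *	spdk_bdev_io_get_buf(bdev_io, my_read_get_buf_cb,
 *			     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 */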
static inline bool
bdev_io_needs_metadata(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	return desc->opts.hide_metadata && bdev_io->bdev->md_len != 0;
}

static inline bool
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io_use_accel_sequence(bdev_io)) {
		return false;
	}

	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
	 * bdev module didn't support accel sequences */
	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split;
}

static inline void
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
}

static inline void
bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static void
bdev_io_submit_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;

	assert(bdev_io_use_accel_sequence(bdev_io));

	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io->internal.f.has_accel_sequence = false;

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	bdev_io_submit(bdev_io);
}

static void
bdev_io_exec_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
	assert(bdev_io_use_accel_sequence(bdev_io));

	/* Since the operations are appended during submission, they're in the opposite order than
	 * how we want to execute them for reads (i.e. we need to execute the most recently added
	 * operation first), so reverse the sequence before executing it.
	 */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
	}

	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_increment_outstanding(ch, ch->shared_resource);
	bdev_io->internal.data_transfer_cpl = cb_fn;

	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
				   bdev_io_exec_sequence_cb, bdev_io);
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf.ptr;
		bdev_io->internal.buf.ptr = NULL;
		bdev_io->internal.f.has_buf = false;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_pull_buffer_cpl(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	bdev_io_get_buf_complete(bdev_io, !rc);
}

static void
bdev_io_pull_md_buf_done(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		assert(bdev_io->internal.f.has_bounce_buf);
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  &bdev_io->internal.bounce_buf.orig_md_iov, 1,
							  &bdev_io->internal.bounce_buf.md_iov, 1,
							  bdev_io_pull_md_buf_done, bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain), rc);
			}
		} else {
			memcpy(bdev_io->internal.bounce_buf.md_iov.iov_base,
			       bdev_io->internal.bounce_buf.orig_md_iov.iov_base,
			       bdev_io->internal.bounce_buf.orig_md_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}
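
/*
 * Note on the bounce-buffer helpers in this area: they are direction-sensitive.  For WRITEs
 * the original payload is "pulled" into the bounce buffer (bdev_io_pull_data() /
 * bdev_io_pull_md_buf()) before the request reaches the bdev module; for READs the data
 * lands in the bounce buffer first and is "pushed" back to the caller's buffers
 * (bdev_io_push_bounce_data() / bdev_io_push_bounce_md_buf()) at completion.  A -ENOMEM
 * from the memory-domain APIs parks the bdev_io on shared_resource->nomem_io with a retry
 * state so the transfer resumes once resources free up.
 */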
static void
_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert(bdev_io->internal.f.has_bounce_buf);

	/* save original md_buf */
	bdev_io->internal.bounce_buf.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.bounce_buf.orig_md_iov.iov_len = len;
	bdev_io->internal.bounce_buf.md_iov.iov_base = md_buf;
	bdev_io->internal.bounce_buf.md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	bdev_io_pull_md_buf(bdev_io);
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		assert(!bdev_io_use_accel_sequence(bdev_io));

		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static inline void
bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	if (rc) {
		SPDK_ERRLOG("Failed to get data buffer\n");
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	_bdev_io_set_md_buf(bdev_io);
}

static void
bdev_io_pull_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_pull_data_done(bdev_io, status);
}

static void
bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	/* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
	 * sequence, append a copy operation making accel change the src/dst buffers of the previous
	 * operation */
	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) ||
	    (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			assert(bdev_io_use_accel_sequence(bdev_io));
			assert(bdev_io->internal.f.has_bounce_buf);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL,
						    bdev_io->internal.bounce_buf.orig_iovs,
						    bdev_io->internal.bounce_buf.orig_iovcnt,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
						    NULL, NULL);
		} else {
			/* We need to reverse the src/dst for reads */
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			assert(bdev_io_use_accel_sequence(bdev_io));
			assert(bdev_io->internal.f.has_bounce_buf);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->internal.bounce_buf.orig_iovs,
						    bdev_io->internal.bounce_buf.orig_iovcnt,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL, NULL, NULL);
		}

		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		/* if this is write path, copy data from original buffer to bounce buffer */
		if (bdev_io_use_memory_domain(bdev_io)) {
			assert(bdev_io->internal.f.has_bounce_buf);
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.bounce_buf.orig_iovs,
							  (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  bdev_io_pull_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			assert(bdev_io->u.bdev.iovcnt == 1);
			assert(bdev_io->internal.f.has_bounce_buf);
			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
					      bdev_io->u.bdev.iovs[0].iov_len,
					      bdev_io->internal.bounce_buf.orig_iovs,
					      bdev_io->internal.bounce_buf.orig_iovcnt);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data_done(bdev_io, rc);
	}
}

static void
_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
			      bdev_copy_bounce_buffer_cpl cpl_cb)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;

	assert(bdev_io->internal.f.has_bounce_buf == false);

	bdev_io->internal.data_transfer_cpl = cpl_cb;
	bdev_io->internal.f.has_bounce_buf = true;
	/* save original iovec */
	bdev_io->internal.bounce_buf.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.bounce_buf.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* zero the other data members */
	bdev_io->internal.bounce_buf.iov.iov_base = NULL;
	bdev_io->internal.bounce_buf.md_iov.iov_base = NULL;
	bdev_io->internal.bounce_buf.orig_md_iov.iov_base = NULL;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_buf.iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;
	/* Now we use 1 iov, the split condition could have been changed */
	bdev_io->internal.f.split = bdev_io_should_split(bdev_io);

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data(bdev_io);
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf.ptr = buf;
	bdev_io->internal.f.has_buf = true;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
		/* Continue in completion callback */
		return;
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}

static inline uint64_t
bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	/* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */
	alignment = spdk_bdev_get_buf_align(bdev) - 1;

	return len + alignment + md_len;
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.f.has_buf);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr, bdev_io->internal.buf.len);
	bdev_io->internal.buf.ptr = NULL;
	bdev_io->internal.f.has_buf = false;
}

SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_put_aux_buf,
			      "spdk_bdev_io_put_aux_buf is deprecated", "v25.01", 0);

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	SPDK_LOG_DEPRECATED(spdk_bdev_io_put_aux_buf);

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
		    struct spdk_bdev_io *bdev_io)
{
	/* After a request is submitted to a bdev module, the ownership of an accel sequence
	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
	 * sequence pointer to make sure we won't touch it anymore.
	 */
	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
		bdev_io->internal.f.has_accel_sequence = false;
	}

	bdev->fn_table->submit_request(ioch, bdev_io);
}

static inline void
bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource);
	bdev_io->internal.error.nvme.cdw0 = 0;
	bdev_io->num_retries++;
	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
}

static void
bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource)
{
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);

		switch (bdev_io->internal.retry_state) {
		case BDEV_IO_RETRY_STATE_SUBMIT:
			bdev_ch_resubmit_io(shared_resource, bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL:
			bdev_io_pull_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL_MD:
			bdev_io_pull_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH:
			bdev_io_push_bounce_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH_MD:
			bdev_io_push_bounce_md_buf(bdev_io);
			break;
		default:
			assert(0 && "invalid retry state");
			break;
		}

		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore. Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
}

static int
bdev_no_mem_poller(void *ctx)
{
	struct spdk_bdev_shared_resource *shared_resource = ctx;

	spdk_poller_unregister(&shared_resource->nomem_poller);

	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_shared_ch_retry_io(shared_resource);
	}
	/* the retry cb may re-register the poller so double check */
	if (!TAILQ_EMPTY(&shared_resource->nomem_io) &&
	    shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) {
		/* No IOs were submitted, try again */
		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
						SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
	}

	return SPDK_POLLER_BUSY;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);

		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
			/* Special case: we have nomem IOs but no outstanding IOs whose completions
			 * could trigger a retry of the queued IOs. Normally, any submitted IO may
			 * trigger a retry when it completes; this poller handles the case when no
			 * new IOs are submitted, e.g. qd==1 */
			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
							SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
		}
		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted. */
		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
			assert(!bdev_io_use_accel_sequence(bdev_io));
			bdev_io->internal.f.has_accel_sequence = true;
			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
		}

		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	/* Continue with IO completion flow */
	bdev_io_complete(bdev_io);
}

static void
bdev_io_push_bounce_md_buf_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);
	bdev_io->internal.f.has_bounce_buf = false;

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
}

static inline void
bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	assert(bdev_io->internal.f.has_bounce_buf);

	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.bounce_buf.orig_md_iov.iov_base != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
			if (bdev_io_use_memory_domain(bdev_io)) {
				TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_increment_outstanding(ch, ch->shared_resource);
				/* If memory domain is used then we need to call async push function */
				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
								  bdev_io->internal.memory_domain_ctx,
								  &bdev_io->internal.bounce_buf.orig_md_iov,
								  (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
								  &bdev_io->internal.bounce_buf.md_iov, 1,
								  bdev_io_push_bounce_md_buf_done,
								  bdev_io);
				if (rc == 0) {
					/* Continue IO completion in async callback */
					return;
				}
				TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_decrement_outstanding(ch, ch->shared_resource);
				if (rc != -ENOMEM) {
					SPDK_ERRLOG("Failed to push md to memory domain %s\n",
						    spdk_memory_domain_get_dma_device_id(
							    bdev_io->internal.memory_domain));
				}
			} else {
				memcpy(bdev_io->internal.bounce_buf.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
				       bdev_io->internal.bounce_buf.orig_md_iov.iov_len);
			}
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.f.has_bounce_buf = false;
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}

static inline void
bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	assert(bdev_io->internal.data_transfer_cpl);
	if (rc) {
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.bounce_buf.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.bounce_buf.orig_iovs;

	/* We don't set bdev_io->internal.f.has_bounce_buf to false here because
	 * we still need to clear the md buf */

	bdev_io_push_bounce_md_buf(bdev_io);
}

static void
bdev_io_push_bounce_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
spdk_bdev_channel *ch = bdev_io->internal.ch; 1773 1774 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1775 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1776 1777 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1778 bdev_ch_retry_io(ch); 1779 } 1780 1781 bdev_io_push_bounce_data_done(bdev_io, status); 1782 } 1783 1784 static inline void 1785 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1786 { 1787 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1788 int rc = 0; 1789 1790 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1791 assert(!bdev_io_use_accel_sequence(bdev_io)); 1792 assert(bdev_io->internal.f.has_bounce_buf); 1793 1794 /* if this is read path, copy data from bounce buffer to original buffer */ 1795 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1796 if (bdev_io_use_memory_domain(bdev_io)) { 1797 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1798 bdev_io_increment_outstanding(ch, ch->shared_resource); 1799 /* If memory domain is used then we need to call async push function */ 1800 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1801 bdev_io->internal.memory_domain_ctx, 1802 bdev_io->internal.bounce_buf.orig_iovs, 1803 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1804 &bdev_io->internal.bounce_buf.iov, 1, 1805 bdev_io_push_bounce_data_done_and_track, 1806 bdev_io); 1807 if (rc == 0) { 1808 /* Continue IO completion in async callback */ 1809 return; 1810 } 1811 1812 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1813 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1814 if (rc != -ENOMEM) { 1815 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1816 spdk_memory_domain_get_dma_device_id( 1817 bdev_io->internal.memory_domain)); 1818 } 1819 } else { 1820 spdk_copy_buf_to_iovs(bdev_io->internal.bounce_buf.orig_iovs, 1821 bdev_io->internal.bounce_buf.orig_iovcnt, 1822 bdev_io->internal.bounce_buf.iov.iov_base, 1823 bdev_io->internal.bounce_buf.iov.iov_len); 1824 } 1825 } 1826 1827 if (spdk_unlikely(rc == -ENOMEM)) { 1828 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1829 } else { 1830 bdev_io_push_bounce_data_done(bdev_io, rc); 1831 } 1832 } 1833 1834 static inline void 1835 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1836 { 1837 bdev_io->internal.data_transfer_cpl = cpl_cb; 1838 bdev_io_push_bounce_data(bdev_io); 1839 } 1840 1841 static void 1842 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1843 { 1844 struct spdk_bdev_io *bdev_io; 1845 1846 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1847 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len); 1848 } 1849 1850 static void 1851 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1852 { 1853 struct spdk_bdev_mgmt_channel *mgmt_ch; 1854 uint64_t max_len; 1855 void *buf; 1856 1857 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1858 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1859 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1860 1861 if (spdk_unlikely(max_len > mgmt_ch->iobuf.cache[0].large.bufsize)) { 1862 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1863 bdev_io_get_buf_complete(bdev_io, false); 1864 return; 1865 } 1866 1867 bdev_io->internal.buf.len = len; 1868 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1869 bdev_io_get_iobuf_cb); 1870 if (buf != NULL) { 1871 
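/* Note (descriptive comment, added for clarity): spdk_iobuf_get() returned a buffer
 * synchronously here; had it returned NULL, the request would have been queued on the
 * iobuf channel and bdev_io_get_iobuf_cb() would supply the buffer once one is freed. */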
_bdev_io_set_buf(bdev_io, buf, len); 1872 } 1873 } 1874 1875 void 1876 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1877 { 1878 struct spdk_bdev *bdev = bdev_io->bdev; 1879 uint64_t alignment; 1880 1881 assert(cb != NULL); 1882 bdev_io->internal.get_buf_cb = cb; 1883 1884 alignment = spdk_bdev_get_buf_align(bdev); 1885 1886 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1887 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1888 /* Buffer already present and aligned */ 1889 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1890 return; 1891 } 1892 1893 bdev_io_get_buf(bdev_io, len); 1894 } 1895 1896 static void 1897 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1898 bool success) 1899 { 1900 if (!success) { 1901 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1902 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1903 bdev_io_complete_unsubmitted(bdev_io); 1904 return; 1905 } 1906 1907 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1908 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1909 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1910 return; 1911 } 1912 /* For reads we'll execute the sequence after the data is read, so, for now, only 1913 * clear out accel_sequence pointer and submit the IO */ 1914 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1915 bdev_io->u.bdev.accel_sequence = NULL; 1916 } 1917 1918 bdev_io_submit(bdev_io); 1919 } 1920 1921 static void 1922 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1923 uint64_t len) 1924 { 1925 assert(cb != NULL); 1926 bdev_io->internal.get_buf_cb = cb; 1927 1928 bdev_io_get_buf(bdev_io, len); 1929 } 1930 1931 1932 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_get_aux_buf, 1933 "spdk_bdev_io_get_aux_buf is deprecated", "v25.01", 0); 1934 1935 void 1936 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1937 { 1938 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1939 1940 SPDK_LOG_DEPRECATED(spdk_bdev_io_get_aux_buf); 1941 1942 assert(cb != NULL); 1943 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1944 bdev_io->internal.get_aux_buf_cb = cb; 1945 bdev_io_get_buf(bdev_io, len); 1946 } 1947 1948 static int 1949 bdev_module_get_max_ctx_size(void) 1950 { 1951 struct spdk_bdev_module *bdev_module; 1952 int max_bdev_module_size = 0; 1953 1954 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1955 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1956 max_bdev_module_size = bdev_module->get_ctx_size(); 1957 } 1958 } 1959 1960 return max_bdev_module_size; 1961 } 1962 1963 static void 1964 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1965 { 1966 if (!bdev->internal.histogram_enabled) { 1967 return; 1968 } 1969 1970 spdk_json_write_object_begin(w); 1971 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 1972 1973 spdk_json_write_named_object_begin(w, "params"); 1974 spdk_json_write_named_string(w, "name", bdev->name); 1975 1976 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1977 1978 if (bdev->internal.histogram_io_type) { 1979 spdk_json_write_named_string(w, "opc", 1980 spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type)); 1981 } 1982 1983 spdk_json_write_object_end(w); 1984 1985 spdk_json_write_object_end(w); 1986 } 1987 
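/*
 * Illustrative sketch of the JSON produced by bdev_enable_histogram_config_json() above,
 * assuming a hypothetical bdev named "Nvme0n1" with a histogram enabled for the "read" opc
 * (names and values here are examples, not taken from this file):
 *
 *   {
 *     "method": "bdev_enable_histogram",
 *     "params": { "name": "Nvme0n1", "enable": true, "opc": "read" }
 *   }
 */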
1988 static void 1989 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1990 { 1991 int i; 1992 struct spdk_bdev_qos *qos = bdev->internal.qos; 1993 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1994 1995 if (!qos) { 1996 return; 1997 } 1998 1999 spdk_bdev_get_qos_rate_limits(bdev, limits); 2000 2001 spdk_json_write_object_begin(w); 2002 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 2003 2004 spdk_json_write_named_object_begin(w, "params"); 2005 spdk_json_write_named_string(w, "name", bdev->name); 2006 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2007 if (limits[i] > 0) { 2008 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 2009 } 2010 } 2011 spdk_json_write_object_end(w); 2012 2013 spdk_json_write_object_end(w); 2014 } 2015 2016 void 2017 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 2018 { 2019 struct spdk_bdev_module *bdev_module; 2020 struct spdk_bdev *bdev; 2021 2022 assert(w != NULL); 2023 2024 spdk_json_write_array_begin(w); 2025 2026 spdk_json_write_object_begin(w); 2027 spdk_json_write_named_string(w, "method", "bdev_set_options"); 2028 spdk_json_write_named_object_begin(w, "params"); 2029 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 2030 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 2031 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 2032 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 2033 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 2034 spdk_json_write_object_end(w); 2035 spdk_json_write_object_end(w); 2036 2037 bdev_examine_allowlist_config_json(w); 2038 2039 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2040 if (bdev_module->config_json) { 2041 bdev_module->config_json(w); 2042 } 2043 } 2044 2045 spdk_spin_lock(&g_bdev_mgr.spinlock); 2046 2047 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 2048 if (bdev->fn_table->write_config_json) { 2049 bdev->fn_table->write_config_json(bdev, w); 2050 } 2051 2052 bdev_qos_config_json(bdev, w); 2053 bdev_enable_histogram_config_json(bdev, w); 2054 } 2055 2056 spdk_spin_unlock(&g_bdev_mgr.spinlock); 2057 2058 /* This has to be last RPC in array to make sure all bdevs finished examine */ 2059 spdk_json_write_object_begin(w); 2060 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 2061 spdk_json_write_object_end(w); 2062 2063 spdk_json_write_array_end(w); 2064 } 2065 2066 static void 2067 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 2068 { 2069 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2070 struct spdk_bdev_io *bdev_io; 2071 2072 spdk_iobuf_channel_fini(&ch->iobuf); 2073 2074 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 2075 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2076 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2077 ch->per_thread_cache_count--; 2078 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2079 } 2080 2081 assert(ch->per_thread_cache_count == 0); 2082 } 2083 2084 static int 2085 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 2086 { 2087 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2088 struct spdk_bdev_io *bdev_io; 2089 uint32_t i; 2090 int rc; 2091 2092 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 2093 g_bdev_opts.iobuf_small_cache_size, 2094 g_bdev_opts.iobuf_large_cache_size); 2095 if (rc != 0) { 2096 
SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 2097 return -1; 2098 } 2099 2100 STAILQ_INIT(&ch->per_thread_cache); 2101 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 2102 2103 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 2104 ch->per_thread_cache_count = 0; 2105 for (i = 0; i < ch->bdev_io_cache_size; i++) { 2106 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2107 if (bdev_io == NULL) { 2108 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 2109 assert(false); 2110 bdev_mgmt_channel_destroy(io_device, ctx_buf); 2111 return -1; 2112 } 2113 ch->per_thread_cache_count++; 2114 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2115 } 2116 2117 TAILQ_INIT(&ch->shared_resources); 2118 TAILQ_INIT(&ch->io_wait_queue); 2119 2120 return 0; 2121 } 2122 2123 static void 2124 bdev_init_complete(int rc) 2125 { 2126 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2127 void *cb_arg = g_init_cb_arg; 2128 struct spdk_bdev_module *m; 2129 2130 g_bdev_mgr.init_complete = true; 2131 g_init_cb_fn = NULL; 2132 g_init_cb_arg = NULL; 2133 2134 /* 2135 * For modules that need to know when subsystem init is complete, 2136 * inform them now. 2137 */ 2138 if (rc == 0) { 2139 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2140 if (m->init_complete) { 2141 m->init_complete(); 2142 } 2143 } 2144 } 2145 2146 cb_fn(cb_arg, rc); 2147 } 2148 2149 static bool 2150 bdev_module_all_actions_completed(void) 2151 { 2152 struct spdk_bdev_module *m; 2153 2154 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2155 if (m->internal.action_in_progress > 0) { 2156 return false; 2157 } 2158 } 2159 return true; 2160 } 2161 2162 static void 2163 bdev_module_action_complete(void) 2164 { 2165 /* 2166 * Don't finish bdev subsystem initialization if 2167 * module pre-initialization is still in progress, or 2168 * the subsystem been already initialized. 2169 */ 2170 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2171 return; 2172 } 2173 2174 /* 2175 * Check all bdev modules for inits/examinations in progress. If any 2176 * exist, return immediately since we cannot finish bdev subsystem 2177 * initialization until all are completed. 2178 */ 2179 if (!bdev_module_all_actions_completed()) { 2180 return; 2181 } 2182 2183 /* 2184 * Modules already finished initialization - now that all 2185 * the bdev modules have finished their asynchronous I/O 2186 * processing, the entire bdev layer can be marked as complete. 
2187 */ 2188 bdev_init_complete(0); 2189 } 2190 2191 static void 2192 bdev_module_action_done(struct spdk_bdev_module *module) 2193 { 2194 spdk_spin_lock(&module->internal.spinlock); 2195 assert(module->internal.action_in_progress > 0); 2196 module->internal.action_in_progress--; 2197 spdk_spin_unlock(&module->internal.spinlock); 2198 bdev_module_action_complete(); 2199 } 2200 2201 void 2202 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2203 { 2204 assert(module->async_init); 2205 bdev_module_action_done(module); 2206 } 2207 2208 void 2209 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2210 { 2211 bdev_module_action_done(module); 2212 } 2213 2214 /** The last initialized bdev module */ 2215 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2216 2217 static void 2218 bdev_init_failed(void *cb_arg) 2219 { 2220 struct spdk_bdev_module *module = cb_arg; 2221 2222 spdk_spin_lock(&module->internal.spinlock); 2223 assert(module->internal.action_in_progress > 0); 2224 module->internal.action_in_progress--; 2225 spdk_spin_unlock(&module->internal.spinlock); 2226 bdev_init_complete(-1); 2227 } 2228 2229 static int 2230 bdev_modules_init(void) 2231 { 2232 struct spdk_bdev_module *module; 2233 int rc = 0; 2234 2235 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2236 g_resume_bdev_module = module; 2237 if (module->async_init) { 2238 spdk_spin_lock(&module->internal.spinlock); 2239 module->internal.action_in_progress = 1; 2240 spdk_spin_unlock(&module->internal.spinlock); 2241 } 2242 rc = module->module_init(); 2243 if (rc != 0) { 2244 /* Bump action_in_progress to prevent other modules from completion of modules_init 2245 * Send message to defer application shutdown until resources are cleaned up */ 2246 spdk_spin_lock(&module->internal.spinlock); 2247 module->internal.action_in_progress = 1; 2248 spdk_spin_unlock(&module->internal.spinlock); 2249 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2250 return rc; 2251 } 2252 } 2253 2254 g_resume_bdev_module = NULL; 2255 return 0; 2256 } 2257 2258 void 2259 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2260 { 2261 int rc = 0; 2262 char mempool_name[32]; 2263 2264 assert(cb_fn != NULL); 2265 2266 g_init_cb_fn = cb_fn; 2267 g_init_cb_arg = cb_arg; 2268 2269 spdk_notify_type_register("bdev_register"); 2270 spdk_notify_type_register("bdev_unregister"); 2271 2272 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2273 2274 rc = spdk_iobuf_register_module("bdev"); 2275 if (rc != 0) { 2276 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2277 bdev_init_complete(-1); 2278 return; 2279 } 2280 2281 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2282 g_bdev_opts.bdev_io_pool_size, 2283 sizeof(struct spdk_bdev_io) + 2284 bdev_module_get_max_ctx_size(), 2285 0, 2286 SPDK_ENV_NUMA_ID_ANY); 2287 2288 if (g_bdev_mgr.bdev_io_pool == NULL) { 2289 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2290 bdev_init_complete(-1); 2291 return; 2292 } 2293 2294 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2295 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2296 if (!g_bdev_mgr.zero_buffer) { 2297 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2298 bdev_init_complete(-1); 2299 return; 2300 } 2301 2302 #ifdef SPDK_CONFIG_VTUNE 2303 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2304 #endif 2305 2306 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2307 
bdev_mgmt_channel_destroy, 2308 sizeof(struct spdk_bdev_mgmt_channel), 2309 "bdev_mgr"); 2310 2311 rc = bdev_modules_init(); 2312 g_bdev_mgr.module_init_complete = true; 2313 if (rc != 0) { 2314 SPDK_ERRLOG("bdev modules init failed\n"); 2315 return; 2316 } 2317 2318 bdev_module_action_complete(); 2319 } 2320 2321 static void 2322 bdev_mgr_unregister_cb(void *io_device) 2323 { 2324 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2325 2326 if (g_bdev_mgr.bdev_io_pool) { 2327 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2328 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2329 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2330 g_bdev_opts.bdev_io_pool_size); 2331 } 2332 2333 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2334 } 2335 2336 spdk_free(g_bdev_mgr.zero_buffer); 2337 2338 bdev_examine_allowlist_free(); 2339 2340 cb_fn(g_fini_cb_arg); 2341 g_fini_cb_fn = NULL; 2342 g_fini_cb_arg = NULL; 2343 g_bdev_mgr.init_complete = false; 2344 g_bdev_mgr.module_init_complete = false; 2345 } 2346 2347 static void 2348 bdev_module_fini_iter(void *arg) 2349 { 2350 struct spdk_bdev_module *bdev_module; 2351 2352 /* FIXME: Handling initialization failures is broken now, 2353 * so we won't even try cleaning up after successfully 2354 * initialized modules. if module_init_complete is false, 2355 * just call spdk_bdev_mgr_unregister_cb 2356 */ 2357 if (!g_bdev_mgr.module_init_complete) { 2358 bdev_mgr_unregister_cb(NULL); 2359 return; 2360 } 2361 2362 /* Start iterating from the last touched module */ 2363 if (!g_resume_bdev_module) { 2364 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2365 } else { 2366 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2367 internal.tailq); 2368 } 2369 2370 while (bdev_module) { 2371 if (bdev_module->async_fini) { 2372 /* Save our place so we can resume later. We must 2373 * save the variable here, before calling module_fini() 2374 * below, because in some cases the module may immediately 2375 * call spdk_bdev_module_fini_done() and re-enter 2376 * this function to continue iterating. */ 2377 g_resume_bdev_module = bdev_module; 2378 } 2379 2380 if (bdev_module->module_fini) { 2381 bdev_module->module_fini(); 2382 } 2383 2384 if (bdev_module->async_fini) { 2385 return; 2386 } 2387 2388 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2389 internal.tailq); 2390 } 2391 2392 g_resume_bdev_module = NULL; 2393 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2394 } 2395 2396 void 2397 spdk_bdev_module_fini_done(void) 2398 { 2399 if (spdk_get_thread() != g_fini_thread) { 2400 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2401 } else { 2402 bdev_module_fini_iter(NULL); 2403 } 2404 } 2405 2406 static void 2407 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2408 { 2409 struct spdk_bdev *bdev = cb_arg; 2410 2411 if (bdeverrno && bdev) { 2412 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2413 bdev->name); 2414 2415 /* 2416 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2417 * bdev; try to continue by manually removing this bdev from the list and continue 2418 * with the next bdev in the list. 
2419 */ 2420 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2421 } 2422 2423 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2424 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2425 /* 2426 * Bdev module finish need to be deferred as we might be in the middle of some context 2427 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2428 * after returning. 2429 */ 2430 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2431 return; 2432 } 2433 2434 /* 2435 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2436 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2437 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2438 * base bdevs. 2439 * 2440 * Also, walk the list in the reverse order. 2441 */ 2442 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2443 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2444 spdk_spin_lock(&bdev->internal.spinlock); 2445 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2446 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2447 spdk_spin_unlock(&bdev->internal.spinlock); 2448 continue; 2449 } 2450 spdk_spin_unlock(&bdev->internal.spinlock); 2451 2452 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2453 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2454 return; 2455 } 2456 2457 /* 2458 * If any bdev fails to unclaim underlying bdev properly, we may face the 2459 * case of bdev list consisting of claimed bdevs only (if claims are managed 2460 * correctly, this would mean there's a loop in the claims graph which is 2461 * clearly impossible). Warn and unregister last bdev on the list then. 2462 */ 2463 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2464 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2465 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2466 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2467 return; 2468 } 2469 } 2470 2471 static void 2472 bdev_module_fini_start_iter(void *arg) 2473 { 2474 struct spdk_bdev_module *bdev_module; 2475 2476 if (!g_resume_bdev_module) { 2477 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2478 } else { 2479 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2480 } 2481 2482 while (bdev_module) { 2483 if (bdev_module->async_fini_start) { 2484 /* Save our place so we can resume later. We must 2485 * save the variable here, before calling fini_start() 2486 * below, because in some cases the module may immediately 2487 * call spdk_bdev_module_fini_start_done() and re-enter 2488 * this function to continue iterating. 
*/ 2489 g_resume_bdev_module = bdev_module; 2490 } 2491 2492 if (bdev_module->fini_start) { 2493 bdev_module->fini_start(); 2494 } 2495 2496 if (bdev_module->async_fini_start) { 2497 return; 2498 } 2499 2500 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2501 } 2502 2503 g_resume_bdev_module = NULL; 2504 2505 bdev_finish_unregister_bdevs_iter(NULL, 0); 2506 } 2507 2508 void 2509 spdk_bdev_module_fini_start_done(void) 2510 { 2511 if (spdk_get_thread() != g_fini_thread) { 2512 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2513 } else { 2514 bdev_module_fini_start_iter(NULL); 2515 } 2516 } 2517 2518 static void 2519 bdev_finish_wait_for_examine_done(void *cb_arg) 2520 { 2521 bdev_module_fini_start_iter(NULL); 2522 } 2523 2524 static void bdev_open_async_fini(void); 2525 2526 void 2527 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2528 { 2529 int rc; 2530 2531 assert(cb_fn != NULL); 2532 2533 g_fini_thread = spdk_get_thread(); 2534 2535 g_fini_cb_fn = cb_fn; 2536 g_fini_cb_arg = cb_arg; 2537 2538 bdev_open_async_fini(); 2539 2540 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2541 if (rc != 0) { 2542 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2543 bdev_finish_wait_for_examine_done(NULL); 2544 } 2545 } 2546 2547 struct spdk_bdev_io * 2548 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2549 { 2550 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2551 struct spdk_bdev_io *bdev_io; 2552 2553 if (ch->per_thread_cache_count > 0) { 2554 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2555 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2556 ch->per_thread_cache_count--; 2557 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2558 /* 2559 * Don't try to look for bdev_ios in the global pool if there are 2560 * waiters on bdev_ios - we don't want this caller to jump the line. 2561 */ 2562 bdev_io = NULL; 2563 } else { 2564 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2565 } 2566 2567 return bdev_io; 2568 } 2569 2570 void 2571 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2572 { 2573 struct spdk_bdev_mgmt_channel *ch; 2574 2575 assert(bdev_io != NULL); 2576 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2577 2578 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2579 2580 if (bdev_io->internal.f.has_buf) { 2581 bdev_io_put_buf(bdev_io); 2582 } 2583 2584 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2585 ch->per_thread_cache_count++; 2586 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2587 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2588 struct spdk_bdev_io_wait_entry *entry; 2589 2590 entry = TAILQ_FIRST(&ch->io_wait_queue); 2591 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2592 entry->cb_fn(entry->cb_arg); 2593 } 2594 } else { 2595 /* We should never have a full cache with entries on the io wait queue. 
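 * (Rationale, added as a descriptive note: waiters are only added when the per-thread
 * cache is empty, and every free that finds room in the cache services the wait queue
 * above before the cache can fill again, so a full cache implies no waiters remain.)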
*/ 2596 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2597 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2598 } 2599 } 2600 2601 static bool 2602 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2603 { 2604 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2605 2606 switch (limit) { 2607 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2608 return true; 2609 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2610 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2611 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2612 return false; 2613 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2614 default: 2615 return false; 2616 } 2617 } 2618 2619 static bool 2620 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2621 { 2622 switch (bdev_io->type) { 2623 case SPDK_BDEV_IO_TYPE_NVME_IO: 2624 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2625 case SPDK_BDEV_IO_TYPE_READ: 2626 case SPDK_BDEV_IO_TYPE_WRITE: 2627 return true; 2628 case SPDK_BDEV_IO_TYPE_ZCOPY: 2629 if (bdev_io->u.bdev.zcopy.start) { 2630 return true; 2631 } else { 2632 return false; 2633 } 2634 default: 2635 return false; 2636 } 2637 } 2638 2639 static bool 2640 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2641 { 2642 switch (bdev_io->type) { 2643 case SPDK_BDEV_IO_TYPE_NVME_IO: 2644 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2645 /* Bit 1 (0x2) set for read operation */ 2646 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2647 return true; 2648 } else { 2649 return false; 2650 } 2651 case SPDK_BDEV_IO_TYPE_READ: 2652 return true; 2653 case SPDK_BDEV_IO_TYPE_ZCOPY: 2654 /* Populate to read from disk */ 2655 if (bdev_io->u.bdev.zcopy.populate) { 2656 return true; 2657 } else { 2658 return false; 2659 } 2660 default: 2661 return false; 2662 } 2663 } 2664 2665 static uint64_t 2666 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2667 { 2668 uint32_t blocklen = bdev_io_get_block_size(bdev_io); 2669 2670 switch (bdev_io->type) { 2671 case SPDK_BDEV_IO_TYPE_NVME_IO: 2672 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2673 return bdev_io->u.nvme_passthru.nbytes; 2674 case SPDK_BDEV_IO_TYPE_READ: 2675 case SPDK_BDEV_IO_TYPE_WRITE: 2676 return bdev_io->u.bdev.num_blocks * blocklen; 2677 case SPDK_BDEV_IO_TYPE_ZCOPY: 2678 /* Track the data in the start phase only */ 2679 if (bdev_io->u.bdev.zcopy.start) { 2680 return bdev_io->u.bdev.num_blocks * blocklen; 2681 } else { 2682 return 0; 2683 } 2684 default: 2685 return 0; 2686 } 2687 } 2688 2689 static inline bool 2690 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2691 { 2692 int64_t remaining_this_timeslice; 2693 2694 if (!limit->max_per_timeslice) { 2695 /* The QoS is disabled */ 2696 return false; 2697 } 2698 2699 remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta, 2700 __ATOMIC_RELAXED); 2701 if (remaining_this_timeslice + (int64_t)delta > 0) { 2702 /* There was still a quota for this delta -> the IO shouldn't be queued 2703 * 2704 * We allow a slight quota overrun here so an IO bigger than the per-timeslice 2705 * quota can be allowed once in a while. Such an overrun is then taken into account in 2706 * the QoS poller, where the next timeslice quota is calculated. 2707 */ 2708 return false; 2709 } 2710 2711 /* There was no quota for this delta -> the IO should be queued 2712 * The remaining_this_timeslice must be rewound so it reflects the real 2713 * amount of IOs or bytes allowed.
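 * As a concrete illustration (hypothetical numbers): with a bytes limit and 1000 bytes
 * left in the timeslice, a 4096-byte I/O drives remaining_this_timeslice to -3096; since
 * -3096 + 4096 = 1000 > 0 the I/O is still admitted and the 3096-byte overrun is charged
 * against the next timeslice. A second 4096-byte I/O would then see -3096 <= 0, get
 * queued, and have its 4096 bytes added back below.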
2714 */ 2715 __atomic_add_fetch( 2716 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2717 return true; 2718 } 2719 2720 static inline void 2721 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2722 { 2723 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2724 } 2725 2726 static bool 2727 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2728 { 2729 return bdev_qos_rw_queue_io(limit, io, 1); 2730 } 2731 2732 static void 2733 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2734 { 2735 bdev_qos_rw_rewind_io(limit, io, 1); 2736 } 2737 2738 static bool 2739 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2740 { 2741 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2742 } 2743 2744 static void 2745 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2746 { 2747 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2748 } 2749 2750 static bool 2751 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2752 { 2753 if (bdev_is_read_io(io) == false) { 2754 return false; 2755 } 2756 2757 return bdev_qos_rw_bps_queue(limit, io); 2758 } 2759 2760 static void 2761 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2762 { 2763 if (bdev_is_read_io(io) != false) { 2764 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2765 } 2766 } 2767 2768 static bool 2769 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2770 { 2771 if (bdev_is_read_io(io) == true) { 2772 return false; 2773 } 2774 2775 return bdev_qos_rw_bps_queue(limit, io); 2776 } 2777 2778 static void 2779 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2780 { 2781 if (bdev_is_read_io(io) != true) { 2782 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2783 } 2784 } 2785 2786 static void 2787 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2788 { 2789 int i; 2790 2791 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2792 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2793 qos->rate_limits[i].queue_io = NULL; 2794 continue; 2795 } 2796 2797 switch (i) { 2798 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2799 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2800 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2801 break; 2802 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2803 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2804 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2805 break; 2806 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2807 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2808 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2809 break; 2810 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2811 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2812 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2813 break; 2814 default: 2815 break; 2816 } 2817 } 2818 } 2819 2820 static void 2821 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2822 struct spdk_bdev_io *bdev_io, 2823 enum spdk_bdev_io_status status) 2824 { 2825 bdev_io->internal.f.in_submit_request = true; 2826 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2827 spdk_bdev_io_complete(bdev_io, status); 2828 bdev_io->internal.f.in_submit_request = false; 
2829 } 2830 2831 static inline void 2832 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2833 { 2834 struct spdk_bdev *bdev = bdev_io->bdev; 2835 struct spdk_io_channel *ch = bdev_ch->channel; 2836 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2837 2838 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2839 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2840 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2841 2842 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2843 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2844 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2845 SPDK_BDEV_IO_STATUS_SUCCESS); 2846 return; 2847 } 2848 } 2849 2850 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2851 bdev_io->bdev->split_on_write_unit && 2852 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2853 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2854 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2855 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2856 return; 2857 } 2858 2859 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2860 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2861 bdev_io->internal.f.in_submit_request = true; 2862 bdev_submit_request(bdev, ch, bdev_io); 2863 bdev_io->internal.f.in_submit_request = false; 2864 } else { 2865 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2866 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2867 /* Special case when we have nomem IOs and no outstanding IOs which completions 2868 * could trigger retry of queued IOs */ 2869 bdev_shared_ch_retry_io(shared_resource); 2870 } 2871 } 2872 } 2873 2874 static bool 2875 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2876 { 2877 int i; 2878 2879 if (bdev_qos_io_to_limit(bdev_io) == true) { 2880 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2881 if (!qos->rate_limits[i].queue_io) { 2882 continue; 2883 } 2884 2885 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2886 bdev_io) == true) { 2887 for (i -= 1; i >= 0 ; i--) { 2888 if (!qos->rate_limits[i].queue_io) { 2889 continue; 2890 } 2891 2892 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2893 } 2894 return true; 2895 } 2896 } 2897 } 2898 2899 return false; 2900 } 2901 2902 static int 2903 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2904 { 2905 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2906 int submitted_ios = 0; 2907 2908 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2909 if (!bdev_qos_queue_io(qos, bdev_io)) { 2910 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 2911 bdev_io_do_submit(ch, bdev_io); 2912 2913 submitted_ios++; 2914 } 2915 } 2916 2917 return submitted_ios; 2918 } 2919 2920 static void 2921 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2922 { 2923 int rc; 2924 2925 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2926 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2927 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2928 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2929 &bdev_io->internal.waitq_entry); 2930 if (rc != 0) { 2931 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2932 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 2933 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2934 } 2935 } 2936 2937 static bool 2938 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2939 { 2940 uint32_t io_boundary; 2941 struct spdk_bdev *bdev = bdev_io->bdev; 2942 uint32_t max_segment_size = bdev->max_segment_size; 2943 uint32_t max_size = bdev->max_rw_size; 2944 int max_segs = bdev->max_num_segments; 2945 2946 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2947 io_boundary = bdev->write_unit_size; 2948 } else if (bdev->split_on_optimal_io_boundary) { 2949 io_boundary = bdev->optimal_io_boundary; 2950 } else { 2951 io_boundary = 0; 2952 } 2953 2954 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 2955 return false; 2956 } 2957 2958 if (io_boundary) { 2959 uint64_t start_stripe, end_stripe; 2960 2961 start_stripe = bdev_io->u.bdev.offset_blocks; 2962 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2963 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2964 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2965 start_stripe >>= spdk_u32log2(io_boundary); 2966 end_stripe >>= spdk_u32log2(io_boundary); 2967 } else { 2968 start_stripe /= io_boundary; 2969 end_stripe /= io_boundary; 2970 } 2971 2972 if (start_stripe != end_stripe) { 2973 return true; 2974 } 2975 } 2976 2977 if (max_segs) { 2978 if (bdev_io->u.bdev.iovcnt > max_segs) { 2979 return true; 2980 } 2981 } 2982 2983 if (max_segment_size) { 2984 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2985 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 2986 return true; 2987 } 2988 } 2989 } 2990 2991 if (max_size) { 2992 if (bdev_io->u.bdev.num_blocks > max_size) { 2993 return true; 2994 } 2995 } 2996 2997 return false; 2998 } 2999 3000 static bool 3001 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 3002 { 3003 uint32_t num_unmap_segments; 3004 3005 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 3006 return false; 3007 } 3008 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 3009 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 3010 return true; 3011 } 3012 3013 return false; 3014 } 3015 3016 static bool 3017 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 3018 { 3019 if (!bdev_io->bdev->max_write_zeroes) { 3020 return false; 3021 } 3022 3023 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 3024 return true; 3025 } 3026 3027 return false; 3028 } 3029 3030 static bool 3031 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 3032 { 3033 if (bdev_io->bdev->max_copy != 0 && 3034 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 3035 return true; 3036 } 3037 3038 return false; 3039 } 3040 3041 static bool 3042 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 3043 { 3044 switch (bdev_io->type) { 3045 case SPDK_BDEV_IO_TYPE_READ: 3046 case SPDK_BDEV_IO_TYPE_WRITE: 3047 return bdev_rw_should_split(bdev_io); 3048 case SPDK_BDEV_IO_TYPE_UNMAP: 3049 return bdev_unmap_should_split(bdev_io); 3050 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3051 return bdev_write_zeroes_should_split(bdev_io); 3052 case SPDK_BDEV_IO_TYPE_COPY: 3053 return bdev_copy_should_split(bdev_io); 3054 default: 3055 return false; 3056 } 3057 } 3058 3059 static uint32_t 3060 _to_next_boundary(uint64_t offset, uint32_t boundary) 3061 { 3062 return (boundary - (offset % boundary)); 3063 } 3064 3065 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 3066 3067 static void _bdev_rw_split(void *_bdev_io); 3068 3069 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 3070 3071 static void 3072 _bdev_unmap_split(void *_bdev_io) 3073 { 3074 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 3075 } 3076 3077 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 3078 3079 static void 3080 _bdev_write_zeroes_split(void *_bdev_io) 3081 { 3082 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 3083 } 3084 3085 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 3086 3087 static void 3088 _bdev_copy_split(void *_bdev_io) 3089 { 3090 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 3091 } 3092 3093 static int 3094 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 3095 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 3096 { 3097 int rc; 3098 uint64_t current_offset, current_remaining, current_src_offset; 3099 spdk_bdev_io_wait_cb io_wait_fn; 3100 3101 current_offset = *offset; 3102 current_remaining = *remaining; 3103 3104 assert(bdev_io->internal.f.split); 3105 3106 bdev_io->internal.split.outstanding++; 3107 3108 io_wait_fn = _bdev_rw_split; 3109 switch (bdev_io->type) { 3110 case SPDK_BDEV_IO_TYPE_READ: 3111 assert(bdev_io->u.bdev.accel_sequence == NULL); 3112 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 3113 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3114 iov, iovcnt, md_buf, current_offset, 3115 num_blocks, 3116 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3117 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 3118 NULL, 3119 bdev_io->u.bdev.dif_check_flags, 3120 bdev_io_split_done, bdev_io); 3121 break; 3122 case SPDK_BDEV_IO_TYPE_WRITE: 3123 assert(bdev_io->u.bdev.accel_sequence == NULL); 3124 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3125 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3126 iov, iovcnt, md_buf, current_offset, 3127 num_blocks, 3128 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3129 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 3130 NULL, 3131 bdev_io->u.bdev.dif_check_flags, 3132 bdev_io->u.bdev.nvme_cdw12.raw, 3133 bdev_io->u.bdev.nvme_cdw13.raw, 3134 bdev_io_split_done, bdev_io); 3135 break; 3136 case SPDK_BDEV_IO_TYPE_UNMAP: 3137 io_wait_fn = _bdev_unmap_split; 3138 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3139 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3140 current_offset, num_blocks, 3141 bdev_io_split_done, bdev_io); 3142 break; 3143 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3144 io_wait_fn = _bdev_write_zeroes_split; 3145 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3146 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3147 current_offset, num_blocks, 3148 bdev_io_split_done, bdev_io); 3149 break; 3150 case SPDK_BDEV_IO_TYPE_COPY: 3151 io_wait_fn = _bdev_copy_split; 3152 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3153 (current_offset - bdev_io->u.bdev.offset_blocks); 3154 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3155 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3156 current_offset, current_src_offset, num_blocks, 3157 bdev_io_split_done, bdev_io); 3158 break; 3159 default: 3160 assert(false); 3161 rc = -EINVAL; 3162 break; 3163 } 3164 3165 if (rc == 0) { 3166 current_offset += num_blocks; 3167 current_remaining -= num_blocks; 3168 bdev_io->internal.split.current_offset_blocks = current_offset; 3169 bdev_io->internal.split.remaining_num_blocks = current_remaining; 3170 *offset = current_offset; 3171 *remaining = current_remaining; 3172 } else { 3173 bdev_io->internal.split.outstanding--; 3174 if (rc == -ENOMEM) { 3175 if (bdev_io->internal.split.outstanding == 0) { 3176 /* No I/O is outstanding. Hence we should wait here. */ 3177 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3178 } 3179 } else { 3180 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3181 if (bdev_io->internal.split.outstanding == 0) { 3182 bdev_ch_remove_from_io_submitted(bdev_io); 3183 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3184 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3185 bdev_io->internal.ch->queue_depth); 3186 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3187 } 3188 } 3189 } 3190 3191 return rc; 3192 } 3193 3194 static void 3195 _bdev_rw_split(void *_bdev_io) 3196 { 3197 struct iovec *parent_iov, *iov; 3198 struct spdk_bdev_io *bdev_io = _bdev_io; 3199 struct spdk_bdev *bdev = bdev_io->bdev; 3200 uint64_t parent_offset, current_offset, remaining; 3201 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3202 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3203 uint32_t iovcnt, iov_len, child_iovsize; 3204 uint32_t blocklen; 3205 uint32_t io_boundary; 3206 uint32_t max_segment_size = bdev->max_segment_size; 3207 uint32_t max_child_iovcnt = bdev->max_num_segments; 3208 uint32_t max_size = bdev->max_rw_size; 3209 void *md_buf = NULL; 3210 int rc; 3211 3212 blocklen = bdev_io_get_block_size(bdev_io); 3213 3214 max_size = max_size ? max_size : UINT32_MAX; 3215 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3216 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3217 SPDK_BDEV_IO_NUM_CHILD_IOV; 3218 3219 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3220 io_boundary = bdev->write_unit_size; 3221 } else if (bdev->split_on_optimal_io_boundary) { 3222 io_boundary = bdev->optimal_io_boundary; 3223 } else { 3224 io_boundary = UINT32_MAX; 3225 } 3226 3227 assert(bdev_io->internal.f.split); 3228 3229 remaining = bdev_io->internal.split.remaining_num_blocks; 3230 current_offset = bdev_io->internal.split.current_offset_blocks; 3231 parent_offset = bdev_io->u.bdev.offset_blocks; 3232 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3233 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3234 3235 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3236 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3237 if (parent_iov_offset < parent_iov->iov_len) { 3238 break; 3239 } 3240 parent_iov_offset -= parent_iov->iov_len; 3241 } 3242 3243 child_iovcnt = 0; 3244 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3245 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3246 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3247 to_next_boundary = spdk_min(remaining, to_next_boundary); 3248 to_next_boundary = spdk_min(max_size, to_next_boundary); 3249 to_next_boundary_bytes = to_next_boundary * blocklen; 3250 3251 iov = &bdev_io->child_iov[child_iovcnt]; 3252 iovcnt = 0; 3253 3254 if (bdev_io->u.bdev.md_buf) { 3255 md_buf = (char *)bdev_io->u.bdev.md_buf + 3256 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3257 } 3258 3259 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3260 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3261 iovcnt < child_iovsize) { 3262 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3263 iov_len = parent_iov->iov_len - parent_iov_offset; 3264 3265 iov_len = spdk_min(iov_len, max_segment_size); 3266 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3267 to_next_boundary_bytes -= iov_len; 3268 3269 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3270 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3271 3272 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3273 parent_iov_offset += iov_len; 3274 } else { 3275 parent_iovpos++; 3276 parent_iov_offset = 0; 3277 } 3278 child_iovcnt++; 3279 iovcnt++; 3280 } 3281 3282 if (to_next_boundary_bytes > 0) { 3283 /* We had to stop this child I/O early because we ran out of 3284 * child_iov space or were limited by max_num_segments. 3285 * Ensure the iovs to be aligned with block size and 3286 * then adjust to_next_boundary before starting the 3287 * child I/O. 
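 * Hypothetical example with a 512-byte block size: if 700 bytes of the intended
 * child range could not be captured, the last block was only partially filled
 * (512 - 700 % 512 = 324 bytes), so those 324 bytes are trimmed from the tail
 * iovs and the child is shortened by 1024 / 512 = 2 blocks so that it ends on a
 * block boundary.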
3288 */ 3289 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 3290 iovcnt == child_iovsize); 3291 to_last_block_bytes = to_next_boundary_bytes % blocklen; 3292 if (to_last_block_bytes != 0) { 3293 uint32_t child_iovpos = child_iovcnt - 1; 3294 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV, 3295 * so the loop will naturally end 3296 */ 3297 3298 to_last_block_bytes = blocklen - to_last_block_bytes; 3299 to_next_boundary_bytes += to_last_block_bytes; 3300 while (to_last_block_bytes > 0 && iovcnt > 0) { 3301 iov_len = spdk_min(to_last_block_bytes, 3302 bdev_io->child_iov[child_iovpos].iov_len); 3303 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 3304 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 3305 child_iovpos--; 3306 if (--iovcnt == 0) { 3307 /* The remaining child IO would be smaller than a block, so just return. 3308 * If it is also the first child IO of this split round, that is an error: 3309 * fail the parent IO before returning. 3310 */ 3311 if (bdev_io->internal.split.outstanding == 0) { 3312 SPDK_ERRLOG("The first child io was less than a block size\n"); 3313 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3314 bdev_ch_remove_from_io_submitted(bdev_io); 3315 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3316 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3317 bdev_io->internal.ch->queue_depth); 3318 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3319 } 3320 3321 return; 3322 } 3323 } 3324 3325 to_last_block_bytes -= iov_len; 3326 3327 if (parent_iov_offset == 0) { 3328 parent_iovpos--; 3329 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 3330 } 3331 parent_iov_offset -= iov_len; 3332 } 3333 3334 assert(to_last_block_bytes == 0); 3335 } 3336 to_next_boundary -= to_next_boundary_bytes / blocklen; 3337 } 3338 3339 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 3340 &current_offset, &remaining); 3341 if (spdk_unlikely(rc)) { 3342 return; 3343 } 3344 } 3345 } 3346 3347 static void 3348 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 3349 { 3350 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 3351 uint32_t num_children_reqs = 0; 3352 int rc; 3353 3354 assert(bdev_io->internal.f.split); 3355 3356 offset = bdev_io->internal.split.current_offset_blocks; 3357 remaining = bdev_io->internal.split.remaining_num_blocks; 3358 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 3359 3360 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3361 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 3362 3363 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 3364 &offset, &remaining); 3365 if (spdk_likely(rc == 0)) { 3366 num_children_reqs++; 3367 } else { 3368 return; 3369 } 3370 } 3371 } 3372 3373 static void 3374 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3375 { 3376 uint64_t offset, write_zeroes_blocks, remaining; 3377 uint32_t num_children_reqs = 0; 3378 int rc; 3379 3380 assert(bdev_io->internal.f.split); 3381 3382 offset = bdev_io->internal.split.current_offset_blocks; 3383 remaining = bdev_io->internal.split.remaining_num_blocks; 3384 3385 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3386 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3387 3388 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3389 &offset, &remaining); 3390 if (spdk_likely(rc == 0)) { 3391 num_children_reqs++; 3392 } else {
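/* Note (descriptive comment, added for clarity): bdev_io_split_submit() already handled
 * the failure here - for -ENOMEM a retry is arranged (via the io_wait queue or when an
 * outstanding child completes), otherwise the parent I/O is failed - so stop issuing
 * further children. */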
3393 return; 3394 } 3395 } 3396 } 3397 3398 static void 3399 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3400 { 3401 uint64_t offset, copy_blocks, remaining; 3402 uint32_t num_children_reqs = 0; 3403 int rc; 3404 3405 assert(bdev_io->internal.f.split); 3406 3407 offset = bdev_io->internal.split.current_offset_blocks; 3408 remaining = bdev_io->internal.split.remaining_num_blocks; 3409 3410 assert(bdev_io->bdev->max_copy != 0); 3411 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3412 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3413 3414 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3415 &offset, &remaining); 3416 if (spdk_likely(rc == 0)) { 3417 num_children_reqs++; 3418 } else { 3419 return; 3420 } 3421 } 3422 } 3423 3424 static void 3425 parent_bdev_io_complete(void *ctx, int rc) 3426 { 3427 struct spdk_bdev_io *parent_io = ctx; 3428 3429 if (rc) { 3430 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3431 } 3432 3433 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3434 parent_io->internal.caller_ctx); 3435 } 3436 3437 static void 3438 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3439 { 3440 struct spdk_bdev_io *bdev_io = ctx; 3441 3442 /* u.bdev.accel_sequence should have already been cleared at this point */ 3443 assert(bdev_io->u.bdev.accel_sequence == NULL); 3444 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3445 bdev_io->internal.f.has_accel_sequence = false; 3446 3447 if (spdk_unlikely(status != 0)) { 3448 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3449 } 3450 3451 parent_bdev_io_complete(bdev_io, status); 3452 } 3453 3454 static void 3455 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3456 { 3457 struct spdk_bdev_io *parent_io = cb_arg; 3458 3459 spdk_bdev_free_io(bdev_io); 3460 3461 assert(parent_io->internal.f.split); 3462 3463 if (!success) { 3464 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3465 /* If any child I/O failed, stop further splitting process. */ 3466 parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks; 3467 parent_io->internal.split.remaining_num_blocks = 0; 3468 } 3469 parent_io->internal.split.outstanding--; 3470 if (parent_io->internal.split.outstanding != 0) { 3471 return; 3472 } 3473 3474 /* 3475 * Parent I/O finishes when all blocks are consumed. 3476 */ 3477 if (parent_io->internal.split.remaining_num_blocks == 0) { 3478 assert(parent_io->internal.cb != bdev_io_split_done); 3479 bdev_ch_remove_from_io_submitted(parent_io); 3480 spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id, 3481 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx, 3482 parent_io->internal.ch->queue_depth); 3483 3484 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3485 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3486 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3487 return; 3488 } else if (parent_io->internal.f.has_bounce_buf && 3489 !bdev_io_use_accel_sequence(bdev_io)) { 3490 /* bdev IO will be completed in the callback */ 3491 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3492 return; 3493 } 3494 } 3495 3496 parent_bdev_io_complete(parent_io, 0); 3497 return; 3498 } 3499 3500 /* 3501 * Continue with the splitting process. This function will complete the parent I/O if the 3502 * splitting is done. 
3503 */ 3504 switch (parent_io->type) { 3505 case SPDK_BDEV_IO_TYPE_READ: 3506 case SPDK_BDEV_IO_TYPE_WRITE: 3507 _bdev_rw_split(parent_io); 3508 break; 3509 case SPDK_BDEV_IO_TYPE_UNMAP: 3510 bdev_unmap_split(parent_io); 3511 break; 3512 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3513 bdev_write_zeroes_split(parent_io); 3514 break; 3515 case SPDK_BDEV_IO_TYPE_COPY: 3516 bdev_copy_split(parent_io); 3517 break; 3518 default: 3519 assert(false); 3520 break; 3521 } 3522 } 3523 3524 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3525 bool success); 3526 3527 static void 3528 bdev_io_split(struct spdk_bdev_io *bdev_io) 3529 { 3530 assert(bdev_io_should_split(bdev_io)); 3531 assert(bdev_io->internal.f.split); 3532 3533 bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3534 bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3535 bdev_io->internal.split.outstanding = 0; 3536 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3537 3538 switch (bdev_io->type) { 3539 case SPDK_BDEV_IO_TYPE_READ: 3540 case SPDK_BDEV_IO_TYPE_WRITE: 3541 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3542 _bdev_rw_split(bdev_io); 3543 } else { 3544 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3545 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3546 bdev_io->u.bdev.num_blocks * bdev_io_get_block_size(bdev_io)); 3547 } 3548 break; 3549 case SPDK_BDEV_IO_TYPE_UNMAP: 3550 bdev_unmap_split(bdev_io); 3551 break; 3552 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3553 bdev_write_zeroes_split(bdev_io); 3554 break; 3555 case SPDK_BDEV_IO_TYPE_COPY: 3556 bdev_copy_split(bdev_io); 3557 break; 3558 default: 3559 assert(false); 3560 break; 3561 } 3562 } 3563 3564 static void 3565 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3566 { 3567 if (!success) { 3568 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3569 return; 3570 } 3571 3572 _bdev_rw_split(bdev_io); 3573 } 3574 3575 static inline void 3576 _bdev_io_submit(struct spdk_bdev_io *bdev_io) 3577 { 3578 struct spdk_bdev *bdev = bdev_io->bdev; 3579 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3580 3581 if (spdk_likely(bdev_ch->flags == 0)) { 3582 bdev_io_do_submit(bdev_ch, bdev_io); 3583 return; 3584 } 3585 3586 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3587 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3588 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3589 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3590 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3591 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3592 } else { 3593 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3594 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3595 } 3596 } else { 3597 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3598 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3599 } 3600 } 3601 3602 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3603 3604 bool 3605 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3606 { 3607 if (range1->length == 0 || range2->length == 0) { 3608 return false; 3609 } 3610 3611 if (range1->offset + range1->length <= range2->offset) { 3612 return false; 3613 } 3614 3615 if (range2->offset + range2->length <= range1->offset) { 3616 return false; 3617 } 3618 
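/* Neither range ends at or before the start of the other, so they overlap. */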
3619 return true; 3620 } 3621 3622 static bool 3623 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3624 { 3625 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3626 struct lba_range r; 3627 3628 switch (bdev_io->type) { 3629 case SPDK_BDEV_IO_TYPE_NVME_IO: 3630 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3631 /* Don't try to decode the NVMe command - just assume worst-case and that 3632 * it overlaps a locked range. 3633 */ 3634 return true; 3635 case SPDK_BDEV_IO_TYPE_READ: 3636 if (!range->quiesce) { 3637 return false; 3638 } 3639 /* fallthrough */ 3640 case SPDK_BDEV_IO_TYPE_WRITE: 3641 case SPDK_BDEV_IO_TYPE_UNMAP: 3642 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3643 case SPDK_BDEV_IO_TYPE_ZCOPY: 3644 case SPDK_BDEV_IO_TYPE_COPY: 3645 r.offset = bdev_io->u.bdev.offset_blocks; 3646 r.length = bdev_io->u.bdev.num_blocks; 3647 if (!bdev_lba_range_overlapped(range, &r)) { 3648 /* This I/O doesn't overlap the specified LBA range. */ 3649 return false; 3650 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3651 /* This I/O overlaps, but the I/O is on the same channel that locked this 3652 * range, and the caller_ctx is the same as the locked_ctx. This means 3653 * that this I/O is associated with the lock, and is allowed to execute. 3654 */ 3655 return false; 3656 } else { 3657 return true; 3658 } 3659 default: 3660 return false; 3661 } 3662 } 3663 3664 void 3665 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3666 { 3667 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3668 3669 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3670 3671 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3672 struct lba_range *range; 3673 3674 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3675 if (bdev_io_range_is_locked(bdev_io, range)) { 3676 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3677 return; 3678 } 3679 } 3680 } 3681 3682 bdev_ch_add_to_io_submitted(bdev_io); 3683 3684 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3685 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 3686 ch->trace_id, bdev_io->u.bdev.num_blocks, 3687 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3688 bdev_io->u.bdev.offset_blocks, ch->queue_depth); 3689 3690 if (bdev_io->internal.f.split) { 3691 bdev_io_split(bdev_io); 3692 return; 3693 } 3694 3695 _bdev_io_submit(bdev_io); 3696 } 3697 3698 static inline int 3699 bdev_io_init_dif_ctx(struct spdk_bdev_io *bdev_io) 3700 { 3701 struct spdk_bdev *bdev = bdev_io->bdev; 3702 struct spdk_dif_ctx_init_ext_opts dif_opts; 3703 3704 memset(&bdev_io->u.bdev.dif_err, 0, sizeof(struct spdk_dif_error)); 3705 3706 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 3707 dif_opts.dif_pi_format = bdev->dif_pi_format; 3708 3709 return spdk_dif_ctx_init(&bdev_io->u.bdev.dif_ctx, 3710 bdev->blocklen, 3711 bdev->md_len, 3712 bdev->md_interleave, 3713 bdev->dif_is_head_of_md, 3714 bdev->dif_type, 3715 bdev_io->u.bdev.dif_check_flags, 3716 bdev_io->u.bdev.offset_blocks & 0xFFFFFFFF, 3717 0xFFFF, 0, 0, 0, &dif_opts); 3718 } 3719 3720 static inline void 3721 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3722 { 3723 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3724 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3725 * For write operation we need to pull buffers from memory domain before submitting IO. 
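	 * Pulling copies the caller's data out of its memory domain into the locally
	 * accessible bounce buffer; pushing copies bounce buffer contents back into
	 * the caller's memory domain buffer.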
3726 * Once read operation completes, we need to use memory_domain push functionality to 3727 * update data in original memory domain IO buffer 3728 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3729 assert(bdev_io->internal.f.has_memory_domain); 3730 bdev_io->u.bdev.memory_domain = NULL; 3731 bdev_io->u.bdev.memory_domain_ctx = NULL; 3732 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3733 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3734 } 3735 3736 static inline void 3737 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3738 { 3739 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3740 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3741 int rc; 3742 3743 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3744 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3745 bdev_io_complete_unsubmitted(bdev_io); 3746 return; 3747 } 3748 3749 if (bdev_io_needs_metadata(desc, bdev_io)) { 3750 rc = bdev_io_init_dif_ctx(bdev_io); 3751 if (spdk_unlikely(rc != 0)) { 3752 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3753 bdev_io_complete_unsubmitted(bdev_io); 3754 return; 3755 } 3756 } 3757 3758 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3759 * support them, but we need to execute an accel sequence and the data buffer is from accel 3760 * memory domain (to avoid doing a push/pull from that domain). 3761 */ 3762 if (bdev_io_use_memory_domain(bdev_io)) { 3763 if (!desc->memory_domains_supported || 3764 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3765 _bdev_io_ext_use_bounce_buffer(bdev_io); 3766 return; 3767 } 3768 } 3769 3770 if (needs_exec) { 3771 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3772 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3773 return; 3774 } 3775 /* For reads we'll execute the sequence after the data is read, so, for now, only 3776 * clear out accel_sequence pointer and submit the IO */ 3777 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3778 bdev_io->u.bdev.accel_sequence = NULL; 3779 } 3780 3781 bdev_io_submit(bdev_io); 3782 } 3783 3784 static void 3785 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3786 { 3787 struct spdk_bdev *bdev = bdev_io->bdev; 3788 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3789 struct spdk_io_channel *ch = bdev_ch->channel; 3790 3791 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3792 3793 bdev_io->internal.f.in_submit_request = true; 3794 bdev_submit_request(bdev, ch, bdev_io); 3795 bdev_io->internal.f.in_submit_request = false; 3796 } 3797 3798 void 3799 bdev_io_init(struct spdk_bdev_io *bdev_io, 3800 struct spdk_bdev *bdev, void *cb_arg, 3801 spdk_bdev_io_completion_cb cb) 3802 { 3803 bdev_io->bdev = bdev; 3804 bdev_io->internal.f.raw = 0; 3805 bdev_io->internal.caller_ctx = cb_arg; 3806 bdev_io->internal.cb = cb; 3807 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3808 bdev_io->internal.f.in_submit_request = false; 3809 bdev_io->internal.error.nvme.cdw0 = 0; 3810 bdev_io->num_retries = 0; 3811 bdev_io->internal.get_buf_cb = NULL; 3812 bdev_io->internal.get_aux_buf_cb = NULL; 3813 bdev_io->internal.data_transfer_cpl = NULL; 3814 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 3815 } 3816 3817 static bool 3818 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3819 { 3820 return 
bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3821 } 3822 3823 bool 3824 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3825 { 3826 bool supported; 3827 3828 supported = bdev_io_type_supported(bdev, io_type); 3829 3830 if (!supported) { 3831 switch (io_type) { 3832 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3833 /* The bdev layer will emulate write zeroes as long as write is supported. */ 3834 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3835 break; 3836 default: 3837 break; 3838 } 3839 } 3840 3841 return supported; 3842 } 3843 3844 static const char *g_io_type_strings[] = { 3845 [SPDK_BDEV_IO_TYPE_READ] = "read", 3846 [SPDK_BDEV_IO_TYPE_WRITE] = "write", 3847 [SPDK_BDEV_IO_TYPE_UNMAP] = "unmap", 3848 [SPDK_BDEV_IO_TYPE_FLUSH] = "flush", 3849 [SPDK_BDEV_IO_TYPE_RESET] = "reset", 3850 [SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin", 3851 [SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io", 3852 [SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md", 3853 [SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes", 3854 [SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy", 3855 [SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info", 3856 [SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management", 3857 [SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append", 3858 [SPDK_BDEV_IO_TYPE_COMPARE] = "compare", 3859 [SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write", 3860 [SPDK_BDEV_IO_TYPE_ABORT] = "abort", 3861 [SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole", 3862 [SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data", 3863 [SPDK_BDEV_IO_TYPE_COPY] = "copy", 3864 [SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md", 3865 }; 3866 3867 const char * 3868 spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type) 3869 { 3870 if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) { 3871 return NULL; 3872 } 3873 3874 return g_io_type_strings[io_type]; 3875 } 3876 3877 int 3878 spdk_bdev_get_io_type(const char *io_type_string) 3879 { 3880 int i; 3881 3882 for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 3883 if (!strcmp(io_type_string, g_io_type_strings[i])) { 3884 return i; 3885 } 3886 } 3887 3888 return -1; 3889 } 3890 3891 uint64_t 3892 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3893 { 3894 return bdev_io->internal.submit_tsc; 3895 } 3896 3897 int 3898 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3899 { 3900 if (bdev->fn_table->dump_info_json) { 3901 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3902 } 3903 3904 return 0; 3905 } 3906 3907 static void 3908 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3909 { 3910 uint32_t max_per_timeslice = 0; 3911 int i; 3912 3913 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3914 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3915 qos->rate_limits[i].max_per_timeslice = 0; 3916 continue; 3917 } 3918 3919 max_per_timeslice = qos->rate_limits[i].limit * 3920 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3921 3922 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3923 qos->rate_limits[i].min_per_timeslice); 3924 3925 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3926 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE); 3927 } 3928 3929 bdev_qos_set_ops(qos); 3930 } 3931 3932 static void 3933 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3934 struct spdk_io_channel *io_ch, void *ctx) 3935 { 3936 struct spdk_bdev_channel *bdev_ch = 
__io_ch_to_bdev_ch(io_ch); 3937 int status; 3938 3939 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3940 3941 /* If all queued I/Os were sent, continue the iteration; otherwise stop it. */ 3942 /* TODO: round robin across channels */ 3943 status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 0 : 1; 3944 3945 spdk_bdev_for_each_channel_continue(i, status); 3946 } 3947 3948 3949 static void 3950 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status) 3951 { 3952 3953 } 3954 3955 static int 3956 bdev_channel_poll_qos(void *arg) 3957 { 3958 struct spdk_bdev *bdev = arg; 3959 struct spdk_bdev_qos *qos = bdev->internal.qos; 3960 uint64_t now = spdk_get_ticks(); 3961 int i; 3962 int64_t remaining_last_timeslice; 3963 3964 if (spdk_unlikely(qos->thread == NULL)) { 3965 /* The old QoS was unbound for removal and a new QoS is not enabled yet. */ 3966 return SPDK_POLLER_IDLE; 3967 } 3968 3969 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3970 /* We received our callback earlier than expected - return 3971 * immediately and wait to do accounting until at least one 3972 * timeslice has actually expired. This should never happen 3973 * with a well-behaved timer implementation. 3974 */ 3975 return SPDK_POLLER_IDLE; 3976 } 3977 3978 /* Reset for next round of rate limiting */ 3979 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3980 /* We may have allowed the IOs or bytes to slightly overrun in the last 3981 * timeslice. remaining_this_timeslice is signed, so if it's negative 3982 * here, we'll account for the overrun so that the next timeslice will 3983 * be appropriately reduced. 3984 */ 3985 remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice, 3986 0, __ATOMIC_RELAXED); 3987 if (remaining_last_timeslice < 0) { 3988 /* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos() 3989 * potentially use 2 atomic ops each, so they can intertwine. 3990 * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage.
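				 * At worst a small amount of quota is accounted against the wrong
				 * timeslice, or not at all.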
3991 */ 3992 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3993 remaining_last_timeslice, __ATOMIC_RELAXED); 3994 } 3995 } 3996 3997 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3998 qos->last_timeslice += qos->timeslice_size; 3999 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4000 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 4001 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 4002 } 4003 } 4004 4005 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 4006 bdev_channel_submit_qos_io_done); 4007 4008 return SPDK_POLLER_BUSY; 4009 } 4010 4011 static void 4012 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 4013 { 4014 struct spdk_bdev_shared_resource *shared_resource; 4015 struct lba_range *range; 4016 4017 bdev_free_io_stat(ch->stat); 4018 #ifdef SPDK_CONFIG_VTUNE 4019 bdev_free_io_stat(ch->prev_stat); 4020 #endif 4021 4022 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 4023 range = TAILQ_FIRST(&ch->locked_ranges); 4024 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 4025 free(range); 4026 } 4027 4028 spdk_put_io_channel(ch->channel); 4029 spdk_put_io_channel(ch->accel_channel); 4030 4031 shared_resource = ch->shared_resource; 4032 4033 assert(TAILQ_EMPTY(&ch->io_locked)); 4034 assert(TAILQ_EMPTY(&ch->io_submitted)); 4035 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 4036 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 4037 assert(ch->io_outstanding == 0); 4038 assert(shared_resource->ref > 0); 4039 shared_resource->ref--; 4040 if (shared_resource->ref == 0) { 4041 assert(shared_resource->io_outstanding == 0); 4042 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 4043 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 4044 spdk_poller_unregister(&shared_resource->nomem_poller); 4045 free(shared_resource); 4046 } 4047 } 4048 4049 static void 4050 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 4051 { 4052 struct spdk_bdev_qos *qos = bdev->internal.qos; 4053 int i; 4054 4055 assert(spdk_spin_held(&bdev->internal.spinlock)); 4056 4057 /* Rate limiting on this bdev enabled */ 4058 if (qos) { 4059 if (qos->ch == NULL) { 4060 struct spdk_io_channel *io_ch; 4061 4062 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 4063 bdev->name, spdk_get_thread()); 4064 4065 /* No qos channel has been selected, so set one up */ 4066 4067 /* Take another reference to ch */ 4068 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4069 assert(io_ch != NULL); 4070 qos->ch = ch; 4071 4072 qos->thread = spdk_io_channel_get_thread(io_ch); 4073 4074 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4075 if (bdev_qos_is_iops_rate_limit(i) == true) { 4076 qos->rate_limits[i].min_per_timeslice = 4077 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 4078 } else { 4079 qos->rate_limits[i].min_per_timeslice = 4080 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 4081 } 4082 4083 if (qos->rate_limits[i].limit == 0) { 4084 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 4085 } 4086 } 4087 bdev_qos_update_max_quota_per_timeslice(qos); 4088 qos->timeslice_size = 4089 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 4090 qos->last_timeslice = spdk_get_ticks(); 4091 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 4092 bdev, 4093 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 4094 } 4095 4096 ch->flags |= BDEV_CH_QOS_ENABLED; 4097 } 4098 } 4099 4100 struct poll_timeout_ctx { 4101 struct spdk_bdev_desc 
*desc; 4102 uint64_t timeout_in_sec; 4103 spdk_bdev_io_timeout_cb cb_fn; 4104 void *cb_arg; 4105 }; 4106 4107 static void 4108 bdev_desc_free(struct spdk_bdev_desc *desc) 4109 { 4110 spdk_spin_destroy(&desc->spinlock); 4111 free(desc->media_events_buffer); 4112 free(desc); 4113 } 4114 4115 static void 4116 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 4117 { 4118 struct poll_timeout_ctx *ctx = _ctx; 4119 struct spdk_bdev_desc *desc = ctx->desc; 4120 4121 free(ctx); 4122 4123 spdk_spin_lock(&desc->spinlock); 4124 desc->refs--; 4125 if (desc->closed == true && desc->refs == 0) { 4126 spdk_spin_unlock(&desc->spinlock); 4127 bdev_desc_free(desc); 4128 return; 4129 } 4130 spdk_spin_unlock(&desc->spinlock); 4131 } 4132 4133 static void 4134 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4135 struct spdk_io_channel *io_ch, void *_ctx) 4136 { 4137 struct poll_timeout_ctx *ctx = _ctx; 4138 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4139 struct spdk_bdev_desc *desc = ctx->desc; 4140 struct spdk_bdev_io *bdev_io; 4141 uint64_t now; 4142 4143 spdk_spin_lock(&desc->spinlock); 4144 if (desc->closed == true) { 4145 spdk_spin_unlock(&desc->spinlock); 4146 spdk_bdev_for_each_channel_continue(i, -1); 4147 return; 4148 } 4149 spdk_spin_unlock(&desc->spinlock); 4150 4151 now = spdk_get_ticks(); 4152 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 4153 /* Exclude any I/O that are generated via splitting. */ 4154 if (bdev_io->internal.cb == bdev_io_split_done) { 4155 continue; 4156 } 4157 4158 /* Once we find an I/O that has not timed out, we can immediately 4159 * exit the loop. 4160 */ 4161 if (now < (bdev_io->internal.submit_tsc + 4162 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 4163 goto end; 4164 } 4165 4166 if (bdev_io->internal.desc == desc) { 4167 ctx->cb_fn(ctx->cb_arg, bdev_io); 4168 } 4169 } 4170 4171 end: 4172 spdk_bdev_for_each_channel_continue(i, 0); 4173 } 4174 4175 static int 4176 bdev_poll_timeout_io(void *arg) 4177 { 4178 struct spdk_bdev_desc *desc = arg; 4179 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4180 struct poll_timeout_ctx *ctx; 4181 4182 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 4183 if (!ctx) { 4184 SPDK_ERRLOG("failed to allocate memory\n"); 4185 return SPDK_POLLER_BUSY; 4186 } 4187 ctx->desc = desc; 4188 ctx->cb_arg = desc->cb_arg; 4189 ctx->cb_fn = desc->cb_fn; 4190 ctx->timeout_in_sec = desc->timeout_in_sec; 4191 4192 /* Take a ref on the descriptor in case it gets closed while we are checking 4193 * all of the channels. 
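	 * The reference is dropped again in bdev_channel_poll_timeout_io_done() once
	 * the iteration over all channels completes.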
4194 */ 4195 spdk_spin_lock(&desc->spinlock); 4196 desc->refs++; 4197 spdk_spin_unlock(&desc->spinlock); 4198 4199 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 4200 bdev_channel_poll_timeout_io_done); 4201 4202 return SPDK_POLLER_BUSY; 4203 } 4204 4205 int 4206 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 4207 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 4208 { 4209 assert(desc->thread == spdk_get_thread()); 4210 4211 spdk_poller_unregister(&desc->io_timeout_poller); 4212 4213 if (timeout_in_sec) { 4214 assert(cb_fn != NULL); 4215 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4216 desc, 4217 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4218 1000); 4219 if (desc->io_timeout_poller == NULL) { 4220 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4221 return -1; 4222 } 4223 } 4224 4225 desc->cb_fn = cb_fn; 4226 desc->cb_arg = cb_arg; 4227 desc->timeout_in_sec = timeout_in_sec; 4228 4229 return 0; 4230 } 4231 4232 static int 4233 bdev_channel_create(void *io_device, void *ctx_buf) 4234 { 4235 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4236 struct spdk_bdev_channel *ch = ctx_buf; 4237 struct spdk_io_channel *mgmt_io_ch; 4238 struct spdk_bdev_mgmt_channel *mgmt_ch; 4239 struct spdk_bdev_shared_resource *shared_resource; 4240 struct lba_range *range; 4241 4242 ch->bdev = bdev; 4243 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4244 if (!ch->channel) { 4245 return -1; 4246 } 4247 4248 ch->accel_channel = spdk_accel_get_io_channel(); 4249 if (!ch->accel_channel) { 4250 spdk_put_io_channel(ch->channel); 4251 return -1; 4252 } 4253 4254 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0, 4255 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4256 4257 assert(ch->histogram == NULL); 4258 if (bdev->internal.histogram_enabled) { 4259 ch->histogram = spdk_histogram_data_alloc(); 4260 if (ch->histogram == NULL) { 4261 SPDK_ERRLOG("Could not allocate histogram\n"); 4262 } 4263 } 4264 4265 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4266 if (!mgmt_io_ch) { 4267 spdk_put_io_channel(ch->channel); 4268 spdk_put_io_channel(ch->accel_channel); 4269 return -1; 4270 } 4271 4272 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4273 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4274 if (shared_resource->shared_ch == ch->channel) { 4275 spdk_put_io_channel(mgmt_io_ch); 4276 shared_resource->ref++; 4277 break; 4278 } 4279 } 4280 4281 if (shared_resource == NULL) { 4282 shared_resource = calloc(1, sizeof(*shared_resource)); 4283 if (shared_resource == NULL) { 4284 spdk_put_io_channel(ch->channel); 4285 spdk_put_io_channel(ch->accel_channel); 4286 spdk_put_io_channel(mgmt_io_ch); 4287 return -1; 4288 } 4289 4290 shared_resource->mgmt_ch = mgmt_ch; 4291 shared_resource->io_outstanding = 0; 4292 TAILQ_INIT(&shared_resource->nomem_io); 4293 shared_resource->nomem_threshold = 0; 4294 shared_resource->shared_ch = ch->channel; 4295 shared_resource->ref = 1; 4296 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4297 } 4298 4299 ch->io_outstanding = 0; 4300 TAILQ_INIT(&ch->locked_ranges); 4301 TAILQ_INIT(&ch->qos_queued_io); 4302 ch->flags = 0; 4303 ch->trace_id = bdev->internal.trace_id; 4304 ch->shared_resource = shared_resource; 4305 4306 TAILQ_INIT(&ch->io_submitted); 4307 TAILQ_INIT(&ch->io_locked); 4308 TAILQ_INIT(&ch->io_accel_exec); 4309 TAILQ_INIT(&ch->io_memory_domain); 4310 4311 ch->stat = bdev_alloc_io_stat(false); 
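	/* The per-channel stat allocated above omits the optional per-error-status
	 * counters (io_error_stat is false), so ch->stat->io_error stays NULL.
	 */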
4312 if (ch->stat == NULL) { 4313 bdev_channel_destroy_resource(ch); 4314 return -1; 4315 } 4316 4317 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4318 4319 #ifdef SPDK_CONFIG_VTUNE 4320 { 4321 char *name; 4322 __itt_init_ittlib(NULL, 0); 4323 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4324 if (!name) { 4325 bdev_channel_destroy_resource(ch); 4326 return -1; 4327 } 4328 ch->handle = __itt_string_handle_create(name); 4329 free(name); 4330 ch->start_tsc = spdk_get_ticks(); 4331 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4332 ch->prev_stat = bdev_alloc_io_stat(false); 4333 if (ch->prev_stat == NULL) { 4334 bdev_channel_destroy_resource(ch); 4335 return -1; 4336 } 4337 } 4338 #endif 4339 4340 spdk_spin_lock(&bdev->internal.spinlock); 4341 bdev_enable_qos(bdev, ch); 4342 4343 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4344 struct lba_range *new_range; 4345 4346 new_range = calloc(1, sizeof(*new_range)); 4347 if (new_range == NULL) { 4348 spdk_spin_unlock(&bdev->internal.spinlock); 4349 bdev_channel_destroy_resource(ch); 4350 return -1; 4351 } 4352 new_range->length = range->length; 4353 new_range->offset = range->offset; 4354 new_range->locked_ctx = range->locked_ctx; 4355 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4356 } 4357 4358 spdk_spin_unlock(&bdev->internal.spinlock); 4359 4360 return 0; 4361 } 4362 4363 static int 4364 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4365 void *cb_ctx) 4366 { 4367 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4368 struct spdk_bdev_io *bdev_io; 4369 uint64_t buf_len; 4370 4371 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4372 if (bdev_io->internal.ch == bdev_ch) { 4373 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4374 spdk_iobuf_entry_abort(ch, entry, buf_len); 4375 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4376 } 4377 4378 return 0; 4379 } 4380 4381 /* 4382 * Abort I/O that are waiting on a data buffer. 4383 */ 4384 static void 4385 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4386 { 4387 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_all_buf_io_cb, ch); 4388 } 4389 4390 /* 4391 * Abort I/O that are queued waiting for submission. These types of I/O are 4392 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4393 */ 4394 static void 4395 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4396 { 4397 struct spdk_bdev_io *bdev_io, *tmp; 4398 4399 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4400 if (bdev_io->internal.ch == ch) { 4401 TAILQ_REMOVE(queue, bdev_io, internal.link); 4402 /* 4403 * spdk_bdev_io_complete() assumes that the completed I/O had 4404 * been submitted to the bdev module. Since in this case it 4405 * hadn't, bump io_outstanding to account for the decrement 4406 * that spdk_bdev_io_complete() will do. 
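			 * Reset I/O are skipped below since they are not counted in io_outstanding.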
4407 */ 4408 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4409 bdev_io_increment_outstanding(ch, ch->shared_resource); 4410 } 4411 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4412 } 4413 } 4414 } 4415 4416 static bool 4417 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4418 { 4419 struct spdk_bdev_io *bdev_io; 4420 4421 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4422 if (bdev_io == bio_to_abort) { 4423 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4424 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4425 return true; 4426 } 4427 } 4428 4429 return false; 4430 } 4431 4432 static int 4433 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4434 { 4435 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4436 uint64_t buf_len; 4437 4438 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4439 if (bdev_io == bio_to_abort) { 4440 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4441 spdk_iobuf_entry_abort(ch, entry, buf_len); 4442 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4443 return 1; 4444 } 4445 4446 return 0; 4447 } 4448 4449 static bool 4450 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4451 { 4452 int rc; 4453 4454 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_buf_io_cb, bio_to_abort); 4455 return rc == 1; 4456 } 4457 4458 static void 4459 bdev_qos_channel_destroy(void *cb_arg) 4460 { 4461 struct spdk_bdev_qos *qos = cb_arg; 4462 4463 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4464 spdk_poller_unregister(&qos->poller); 4465 4466 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4467 4468 free(qos); 4469 } 4470 4471 static int 4472 bdev_qos_destroy(struct spdk_bdev *bdev) 4473 { 4474 int i; 4475 4476 /* 4477 * Cleanly shutting down the QoS poller is tricky, because 4478 * during the asynchronous operation the user could open 4479 * a new descriptor and create a new channel, spawning 4480 * a new QoS poller. 4481 * 4482 * The strategy is to create a new QoS structure here and swap it 4483 * in. The shutdown path then continues to refer to the old one 4484 * until it completes and then releases it. 4485 */ 4486 struct spdk_bdev_qos *new_qos, *old_qos; 4487 4488 old_qos = bdev->internal.qos; 4489 4490 new_qos = calloc(1, sizeof(*new_qos)); 4491 if (!new_qos) { 4492 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4493 return -ENOMEM; 4494 } 4495 4496 /* Copy the old QoS data into the newly allocated structure */ 4497 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4498 4499 /* Zero out the key parts of the QoS structure */ 4500 new_qos->ch = NULL; 4501 new_qos->thread = NULL; 4502 new_qos->poller = NULL; 4503 /* 4504 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4505 * It will be used later for the new QoS structure. 4506 */ 4507 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4508 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4509 new_qos->rate_limits[i].min_per_timeslice = 0; 4510 new_qos->rate_limits[i].max_per_timeslice = 0; 4511 } 4512 4513 bdev->internal.qos = new_qos; 4514 4515 if (old_qos->thread == NULL) { 4516 free(old_qos); 4517 } else { 4518 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4519 } 4520 4521 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4522 * been destroyed yet. 
The destruction path will end up waiting for the final 4523 * channel to be put before it releases resources. */ 4524 4525 return 0; 4526 } 4527 4528 void 4529 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4530 { 4531 total->bytes_read += add->bytes_read; 4532 total->num_read_ops += add->num_read_ops; 4533 total->bytes_written += add->bytes_written; 4534 total->num_write_ops += add->num_write_ops; 4535 total->bytes_unmapped += add->bytes_unmapped; 4536 total->num_unmap_ops += add->num_unmap_ops; 4537 total->bytes_copied += add->bytes_copied; 4538 total->num_copy_ops += add->num_copy_ops; 4539 total->read_latency_ticks += add->read_latency_ticks; 4540 total->write_latency_ticks += add->write_latency_ticks; 4541 total->unmap_latency_ticks += add->unmap_latency_ticks; 4542 total->copy_latency_ticks += add->copy_latency_ticks; 4543 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4544 total->max_read_latency_ticks = add->max_read_latency_ticks; 4545 } 4546 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4547 total->min_read_latency_ticks = add->min_read_latency_ticks; 4548 } 4549 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4550 total->max_write_latency_ticks = add->max_write_latency_ticks; 4551 } 4552 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4553 total->min_write_latency_ticks = add->min_write_latency_ticks; 4554 } 4555 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4556 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4557 } 4558 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4559 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4560 } 4561 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4562 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4563 } 4564 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4565 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4566 } 4567 } 4568 4569 static void 4570 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4571 { 4572 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4573 4574 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4575 memcpy(to_stat->io_error, from_stat->io_error, 4576 sizeof(struct spdk_bdev_io_error_stat)); 4577 } 4578 } 4579 4580 void 4581 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4582 { 4583 if (mode == SPDK_BDEV_RESET_STAT_NONE) { 4584 return; 4585 } 4586 4587 stat->max_read_latency_ticks = 0; 4588 stat->min_read_latency_ticks = UINT64_MAX; 4589 stat->max_write_latency_ticks = 0; 4590 stat->min_write_latency_ticks = UINT64_MAX; 4591 stat->max_unmap_latency_ticks = 0; 4592 stat->min_unmap_latency_ticks = UINT64_MAX; 4593 stat->max_copy_latency_ticks = 0; 4594 stat->min_copy_latency_ticks = UINT64_MAX; 4595 4596 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4597 return; 4598 } 4599 4600 stat->bytes_read = 0; 4601 stat->num_read_ops = 0; 4602 stat->bytes_written = 0; 4603 stat->num_write_ops = 0; 4604 stat->bytes_unmapped = 0; 4605 stat->num_unmap_ops = 0; 4606 stat->bytes_copied = 0; 4607 stat->num_copy_ops = 0; 4608 stat->read_latency_ticks = 0; 4609 stat->write_latency_ticks = 0; 4610 stat->unmap_latency_ticks = 0; 4611 stat->copy_latency_ticks = 0; 4612 4613 if (stat->io_error != NULL) { 4614 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 
4615 } 4616 } 4617 4618 struct spdk_bdev_io_stat * 4619 bdev_alloc_io_stat(bool io_error_stat) 4620 { 4621 struct spdk_bdev_io_stat *stat; 4622 4623 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4624 if (stat == NULL) { 4625 return NULL; 4626 } 4627 4628 if (io_error_stat) { 4629 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4630 if (stat->io_error == NULL) { 4631 free(stat); 4632 return NULL; 4633 } 4634 } else { 4635 stat->io_error = NULL; 4636 } 4637 4638 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4639 4640 return stat; 4641 } 4642 4643 void 4644 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4645 { 4646 if (stat != NULL) { 4647 free(stat->io_error); 4648 free(stat); 4649 } 4650 } 4651 4652 void 4653 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4654 { 4655 int i; 4656 4657 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4658 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4659 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4660 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4661 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4662 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4663 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4664 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4665 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4666 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4667 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4668 stat->min_read_latency_ticks != UINT64_MAX ? 4669 stat->min_read_latency_ticks : 0); 4670 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4671 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4672 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4673 stat->min_write_latency_ticks != UINT64_MAX ? 4674 stat->min_write_latency_ticks : 0); 4675 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4676 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4677 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4678 stat->min_unmap_latency_ticks != UINT64_MAX ? 4679 stat->min_unmap_latency_ticks : 0); 4680 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4681 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4682 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4683 stat->min_copy_latency_ticks != UINT64_MAX ? 
4684 stat->min_copy_latency_ticks : 0); 4685 4686 if (stat->io_error != NULL) { 4687 spdk_json_write_named_object_begin(w, "io_error"); 4688 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4689 if (stat->io_error->error_status[i] != 0) { 4690 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4691 stat->io_error->error_status[i]); 4692 } 4693 } 4694 spdk_json_write_object_end(w); 4695 } 4696 } 4697 4698 static void 4699 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4700 { 4701 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4702 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4703 4704 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4705 bdev_abort_all_buf_io(mgmt_ch, ch); 4706 } 4707 4708 static void 4709 bdev_channel_destroy(void *io_device, void *ctx_buf) 4710 { 4711 struct spdk_bdev_channel *ch = ctx_buf; 4712 4713 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4714 spdk_get_thread()); 4715 4716 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0, 4717 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4718 4719 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4720 spdk_spin_lock(&ch->bdev->internal.spinlock); 4721 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4722 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4723 4724 bdev_channel_abort_queued_ios(ch); 4725 4726 if (ch->histogram) { 4727 spdk_histogram_data_free(ch->histogram); 4728 } 4729 4730 bdev_channel_destroy_resource(ch); 4731 } 4732 4733 /* 4734 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4735 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
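 * RB_INSERT() returns NULL when the node is inserted and a pointer to the already
 * existing node on a name collision, which bdev_name_add() below turns into -EEXIST.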
4736 */ 4737 static int 4738 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4739 { 4740 struct spdk_bdev_name *tmp; 4741 4742 bdev_name->name = strdup(name); 4743 if (bdev_name->name == NULL) { 4744 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4745 return -ENOMEM; 4746 } 4747 4748 bdev_name->bdev = bdev; 4749 4750 spdk_spin_lock(&g_bdev_mgr.spinlock); 4751 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4752 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4753 4754 if (tmp != NULL) { 4755 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4756 free(bdev_name->name); 4757 return -EEXIST; 4758 } 4759 4760 return 0; 4761 } 4762 4763 static void 4764 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4765 { 4766 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4767 free(bdev_name->name); 4768 } 4769 4770 static void 4771 bdev_name_del(struct spdk_bdev_name *bdev_name) 4772 { 4773 spdk_spin_lock(&g_bdev_mgr.spinlock); 4774 bdev_name_del_unsafe(bdev_name); 4775 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4776 } 4777 4778 int 4779 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4780 { 4781 struct spdk_bdev_alias *tmp; 4782 int ret; 4783 4784 if (alias == NULL) { 4785 SPDK_ERRLOG("Empty alias passed\n"); 4786 return -EINVAL; 4787 } 4788 4789 tmp = calloc(1, sizeof(*tmp)); 4790 if (tmp == NULL) { 4791 SPDK_ERRLOG("Unable to allocate alias\n"); 4792 return -ENOMEM; 4793 } 4794 4795 ret = bdev_name_add(&tmp->alias, bdev, alias); 4796 if (ret != 0) { 4797 free(tmp); 4798 return ret; 4799 } 4800 4801 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4802 4803 return 0; 4804 } 4805 4806 static int 4807 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4808 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4809 { 4810 struct spdk_bdev_alias *tmp; 4811 4812 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4813 if (strcmp(alias, tmp->alias.name) == 0) { 4814 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4815 alias_del_fn(&tmp->alias); 4816 free(tmp); 4817 return 0; 4818 } 4819 } 4820 4821 return -ENOENT; 4822 } 4823 4824 int 4825 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4826 { 4827 int rc; 4828 4829 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4830 if (rc == -ENOENT) { 4831 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4832 } 4833 4834 return rc; 4835 } 4836 4837 void 4838 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4839 { 4840 struct spdk_bdev_alias *p, *tmp; 4841 4842 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4843 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4844 bdev_name_del(&p->alias); 4845 free(p); 4846 } 4847 } 4848 4849 struct spdk_io_channel * 4850 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4851 { 4852 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4853 } 4854 4855 void * 4856 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4857 { 4858 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4859 void *ctx = NULL; 4860 4861 if (bdev->fn_table->get_module_ctx) { 4862 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4863 } 4864 4865 return ctx; 4866 } 4867 4868 const char * 4869 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4870 { 4871 return bdev->module->name; 4872 } 4873 4874 const char * 4875 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4876 { 4877 return bdev->name; 4878 } 4879 4880 const char * 4881 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4882 { 4883 return bdev->product_name; 4884 } 4885 4886 
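/* Illustrative sketch (not part of this file): a caller could walk the alias list
 * returned by spdk_bdev_get_aliases() with the queue(3) macros, e.g.
 *
 *	struct spdk_bdev_alias *a;
 *
 *	TAILQ_FOREACH(a, spdk_bdev_get_aliases(bdev), tailq) {
 *		printf("%s\n", a->alias.name);
 *	}
 *
 * where "bdev" is assumed to be a pointer to a registered struct spdk_bdev.
 */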
const struct spdk_bdev_aliases_list * 4887 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4888 { 4889 return &bdev->aliases; 4890 } 4891 4892 uint32_t 4893 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4894 { 4895 return bdev->blocklen; 4896 } 4897 4898 uint32_t 4899 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4900 { 4901 return bdev->write_unit_size; 4902 } 4903 4904 uint64_t 4905 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4906 { 4907 return bdev->blockcnt; 4908 } 4909 4910 const char * 4911 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4912 { 4913 return qos_rpc_type[type]; 4914 } 4915 4916 void 4917 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4918 { 4919 int i; 4920 4921 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4922 4923 spdk_spin_lock(&bdev->internal.spinlock); 4924 if (bdev->internal.qos) { 4925 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4926 if (bdev->internal.qos->rate_limits[i].limit != 4927 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4928 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4929 if (bdev_qos_is_iops_rate_limit(i) == false) { 4930 /* Change from Byte to Megabyte which is user visible. */ 4931 limits[i] = limits[i] / 1024 / 1024; 4932 } 4933 } 4934 } 4935 } 4936 spdk_spin_unlock(&bdev->internal.spinlock); 4937 } 4938 4939 size_t 4940 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4941 { 4942 return 1 << bdev->required_alignment; 4943 } 4944 4945 uint32_t 4946 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4947 { 4948 return bdev->optimal_io_boundary; 4949 } 4950 4951 bool 4952 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4953 { 4954 return bdev->write_cache; 4955 } 4956 4957 const struct spdk_uuid * 4958 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4959 { 4960 return &bdev->uuid; 4961 } 4962 4963 uint16_t 4964 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4965 { 4966 return bdev->acwu; 4967 } 4968 4969 uint32_t 4970 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4971 { 4972 return bdev->md_len; 4973 } 4974 4975 bool 4976 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4977 { 4978 return (bdev->md_len != 0) && bdev->md_interleave; 4979 } 4980 4981 bool 4982 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4983 { 4984 return (bdev->md_len != 0) && !bdev->md_interleave; 4985 } 4986 4987 bool 4988 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4989 { 4990 return bdev->zoned; 4991 } 4992 4993 uint32_t 4994 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4995 { 4996 if (spdk_bdev_is_md_interleaved(bdev)) { 4997 return bdev->blocklen - bdev->md_len; 4998 } else { 4999 return bdev->blocklen; 5000 } 5001 } 5002 5003 uint32_t 5004 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 5005 { 5006 return bdev->phys_blocklen; 5007 } 5008 5009 static uint32_t 5010 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 5011 { 5012 if (!spdk_bdev_is_md_interleaved(bdev)) { 5013 return bdev->blocklen + bdev->md_len; 5014 } else { 5015 return bdev->blocklen; 5016 } 5017 } 5018 5019 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 5020 typedef enum spdk_dif_type spdk_dif_type_t; 5021 typedef enum spdk_dif_pi_format spdk_dif_pi_format_t; 5022 5023 spdk_dif_type_t 5024 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 5025 { 5026 if (bdev->md_len != 0) { 5027 return bdev->dif_type; 5028 } else { 5029 return SPDK_DIF_DISABLE; 5030 } 5031 } 5032 5033 spdk_dif_pi_format_t 5034 spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev) 5035 { 5036 return bdev->dif_pi_format; 5037 } 5038 5039 bool 5040 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 5041 { 5042 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 5043 return bdev->dif_is_head_of_md; 5044 } else { 5045 return false; 5046 } 5047 } 5048 5049 bool 5050 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 5051 enum spdk_dif_check_type check_type) 5052 { 5053 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 5054 return false; 5055 } 5056 5057 switch (check_type) { 5058 case SPDK_DIF_CHECK_TYPE_REFTAG: 5059 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 5060 case SPDK_DIF_CHECK_TYPE_APPTAG: 5061 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 5062 case SPDK_DIF_CHECK_TYPE_GUARD: 5063 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 5064 default: 5065 return false; 5066 } 5067 } 5068 5069 static uint32_t 5070 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 5071 { 5072 uint64_t aligned_length, max_write_blocks; 5073 5074 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 5075 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 5076 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 5077 5078 return max_write_blocks; 5079 } 5080 5081 uint32_t 5082 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 5083 { 5084 return bdev->max_copy; 5085 } 5086 5087 uint64_t 5088 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 5089 { 5090 return bdev->internal.measured_queue_depth; 5091 } 5092 5093 uint64_t 5094 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 5095 { 5096 return bdev->internal.period; 5097 } 5098 5099 uint64_t 5100 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 5101 { 5102 return bdev->internal.weighted_io_time; 5103 } 5104 5105 uint64_t 5106 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 5107 { 5108 return bdev->internal.io_time; 5109 } 5110 5111 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev) 5112 { 5113 return bdev->ctratt; 5114 } 5115 5116 uint32_t 5117 spdk_bdev_get_nvme_nsid(struct spdk_bdev *bdev) 5118 { 5119 return bdev->nsid; 5120 } 5121 5122 uint32_t 5123 spdk_bdev_desc_get_block_size(struct spdk_bdev_desc *desc) 5124 { 5125 struct spdk_bdev *bdev = desc->bdev; 5126 5127 return desc->opts.hide_metadata ? bdev->blocklen - bdev->md_len : bdev->blocklen; 5128 } 5129 5130 uint32_t 5131 spdk_bdev_desc_get_md_size(struct spdk_bdev_desc *desc) 5132 { 5133 struct spdk_bdev *bdev = desc->bdev; 5134 5135 return desc->opts.hide_metadata ? 0 : bdev->md_len; 5136 } 5137 5138 bool 5139 spdk_bdev_desc_is_md_interleaved(struct spdk_bdev_desc *desc) 5140 { 5141 struct spdk_bdev *bdev = desc->bdev; 5142 5143 return desc->opts.hide_metadata ? false : spdk_bdev_is_md_interleaved(bdev); 5144 } 5145 5146 bool 5147 spdk_bdev_desc_is_md_separate(struct spdk_bdev_desc *desc) 5148 { 5149 struct spdk_bdev *bdev = desc->bdev; 5150 5151 return desc->opts.hide_metadata ? 
false : spdk_bdev_is_md_separate(bdev); 5152 } 5153 5154 spdk_dif_type_t 5155 spdk_bdev_desc_get_dif_type(struct spdk_bdev_desc *desc) 5156 { 5157 struct spdk_bdev *bdev = desc->bdev; 5158 5159 return desc->opts.hide_metadata ? SPDK_DIF_DISABLE : spdk_bdev_get_dif_type(bdev); 5160 } 5161 5162 spdk_dif_pi_format_t 5163 spdk_bdev_desc_get_dif_pi_format(struct spdk_bdev_desc *desc) 5164 { 5165 struct spdk_bdev *bdev = desc->bdev; 5166 5167 return desc->opts.hide_metadata ? SPDK_DIF_PI_FORMAT_16 : spdk_bdev_get_dif_pi_format(bdev); 5168 } 5169 5170 bool 5171 spdk_bdev_desc_is_dif_head_of_md(struct spdk_bdev_desc *desc) 5172 { 5173 struct spdk_bdev *bdev = desc->bdev; 5174 5175 return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_head_of_md(bdev); 5176 } 5177 5178 bool 5179 spdk_bdev_desc_is_dif_check_enabled(struct spdk_bdev_desc *desc, 5180 enum spdk_dif_check_type check_type) 5181 { 5182 struct spdk_bdev *bdev = desc->bdev; 5183 5184 return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_check_enabled(bdev, check_type); 5185 } 5186 5187 static void bdev_update_qd_sampling_period(void *ctx); 5188 5189 static void 5190 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 5191 { 5192 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 5193 5194 if (bdev->internal.measured_queue_depth) { 5195 bdev->internal.io_time += bdev->internal.period; 5196 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 5197 } 5198 5199 bdev->internal.qd_poll_in_progress = false; 5200 5201 bdev_update_qd_sampling_period(bdev); 5202 } 5203 5204 static void 5205 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5206 struct spdk_io_channel *io_ch, void *_ctx) 5207 { 5208 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 5209 5210 bdev->internal.temporary_queue_depth += ch->io_outstanding; 5211 spdk_bdev_for_each_channel_continue(i, 0); 5212 } 5213 5214 static int 5215 bdev_calculate_measured_queue_depth(void *ctx) 5216 { 5217 struct spdk_bdev *bdev = ctx; 5218 5219 bdev->internal.qd_poll_in_progress = true; 5220 bdev->internal.temporary_queue_depth = 0; 5221 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 5222 return SPDK_POLLER_BUSY; 5223 } 5224 5225 static void 5226 bdev_update_qd_sampling_period(void *ctx) 5227 { 5228 struct spdk_bdev *bdev = ctx; 5229 5230 if (bdev->internal.period == bdev->internal.new_period) { 5231 return; 5232 } 5233 5234 if (bdev->internal.qd_poll_in_progress) { 5235 return; 5236 } 5237 5238 bdev->internal.period = bdev->internal.new_period; 5239 5240 spdk_poller_unregister(&bdev->internal.qd_poller); 5241 if (bdev->internal.period != 0) { 5242 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5243 bdev, bdev->internal.period); 5244 } else { 5245 spdk_bdev_close(bdev->internal.qd_desc); 5246 bdev->internal.qd_desc = NULL; 5247 } 5248 } 5249 5250 static void 5251 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5252 { 5253 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 5254 } 5255 5256 void 5257 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 5258 { 5259 int rc; 5260 5261 if (bdev->internal.new_period == period) { 5262 return; 5263 } 5264 5265 bdev->internal.new_period = period; 5266 5267 if (bdev->internal.qd_desc != NULL) { 5268 assert(bdev->internal.period != 0); 5269 5270 
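		/* qd_desc was opened on, and its sampling poller runs on, the descriptor's
		 * thread, so hand the period update over to that thread instead of touching
		 * the poller directly from here.
		 */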
spdk_thread_send_msg(bdev->internal.qd_desc->thread, 5271 bdev_update_qd_sampling_period, bdev); 5272 return; 5273 } 5274 5275 assert(bdev->internal.period == 0); 5276 5277 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 5278 NULL, &bdev->internal.qd_desc); 5279 if (rc != 0) { 5280 return; 5281 } 5282 5283 bdev->internal.period = period; 5284 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5285 bdev, period); 5286 } 5287 5288 struct bdev_get_current_qd_ctx { 5289 uint64_t current_qd; 5290 spdk_bdev_get_current_qd_cb cb_fn; 5291 void *cb_arg; 5292 }; 5293 5294 static void 5295 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5296 { 5297 struct bdev_get_current_qd_ctx *ctx = _ctx; 5298 5299 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5300 5301 free(ctx); 5302 } 5303 5304 static void 5305 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5306 struct spdk_io_channel *io_ch, void *_ctx) 5307 { 5308 struct bdev_get_current_qd_ctx *ctx = _ctx; 5309 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5310 5311 ctx->current_qd += bdev_ch->io_outstanding; 5312 5313 spdk_bdev_for_each_channel_continue(i, 0); 5314 } 5315 5316 void 5317 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5318 void *cb_arg) 5319 { 5320 struct bdev_get_current_qd_ctx *ctx; 5321 5322 assert(cb_fn != NULL); 5323 5324 ctx = calloc(1, sizeof(*ctx)); 5325 if (ctx == NULL) { 5326 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5327 return; 5328 } 5329 5330 ctx->cb_fn = cb_fn; 5331 ctx->cb_arg = cb_arg; 5332 5333 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5334 } 5335 5336 static void 5337 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5338 { 5339 assert(desc->thread == spdk_get_thread()); 5340 5341 spdk_spin_lock(&desc->spinlock); 5342 desc->refs--; 5343 if (!desc->closed) { 5344 spdk_spin_unlock(&desc->spinlock); 5345 desc->callback.event_fn(type, 5346 desc->bdev, 5347 desc->callback.ctx); 5348 return; 5349 } else if (desc->refs == 0) { 5350 /* This descriptor was closed after this event_notify message was sent. 5351 * spdk_bdev_close() could not free the descriptor since this message was 5352 * in flight, so we free it now using bdev_desc_free(). 
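		 * The in-flight message holds the extra reference that event_notify() took
		 * right before calling spdk_thread_send_msg().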
5353 */ 5354 spdk_spin_unlock(&desc->spinlock); 5355 bdev_desc_free(desc); 5356 return; 5357 } 5358 spdk_spin_unlock(&desc->spinlock); 5359 } 5360 5361 static void 5362 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5363 { 5364 spdk_spin_lock(&desc->spinlock); 5365 desc->refs++; 5366 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5367 spdk_spin_unlock(&desc->spinlock); 5368 } 5369 5370 static void 5371 _resize_notify(void *ctx) 5372 { 5373 struct spdk_bdev_desc *desc = ctx; 5374 5375 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5376 } 5377 5378 int 5379 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5380 { 5381 struct spdk_bdev_desc *desc; 5382 int ret; 5383 5384 if (size == bdev->blockcnt) { 5385 return 0; 5386 } 5387 5388 spdk_spin_lock(&bdev->internal.spinlock); 5389 5390 /* bdev has open descriptors */ 5391 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5392 bdev->blockcnt > size) { 5393 ret = -EBUSY; 5394 } else { 5395 bdev->blockcnt = size; 5396 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5397 event_notify(desc, _resize_notify); 5398 } 5399 ret = 0; 5400 } 5401 5402 spdk_spin_unlock(&bdev->internal.spinlock); 5403 5404 return ret; 5405 } 5406 5407 /* 5408 * Convert I/O offset and length from bytes to blocks. 5409 * 5410 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5411 */ 5412 static uint64_t 5413 bdev_bytes_to_blocks(struct spdk_bdev_desc *desc, uint64_t offset_bytes, 5414 uint64_t *offset_blocks, uint64_t num_bytes, uint64_t *num_blocks) 5415 { 5416 uint32_t block_size = bdev_desc_get_block_size(desc); 5417 uint8_t shift_cnt; 5418 5419 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
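	 * For example, with a 4096-byte block size the shift count is 12, so a 1 MiB
	 * request at byte offset 8192 becomes offset_blocks = 2 and num_blocks = 256
	 * with a zero remainder.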
*/ 5420 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5421 shift_cnt = spdk_u32log2(block_size); 5422 *offset_blocks = offset_bytes >> shift_cnt; 5423 *num_blocks = num_bytes >> shift_cnt; 5424 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5425 (num_bytes - (*num_blocks << shift_cnt)); 5426 } else { 5427 *offset_blocks = offset_bytes / block_size; 5428 *num_blocks = num_bytes / block_size; 5429 return (offset_bytes % block_size) | (num_bytes % block_size); 5430 } 5431 } 5432 5433 static bool 5434 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5435 { 5436 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5437 * has been an overflow and hence the offset has been wrapped around */ 5438 if (offset_blocks + num_blocks < offset_blocks) { 5439 return false; 5440 } 5441 5442 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5443 if (offset_blocks + num_blocks > bdev->blockcnt) { 5444 return false; 5445 } 5446 5447 return true; 5448 } 5449 5450 static void 5451 bdev_seek_complete_cb(void *ctx) 5452 { 5453 struct spdk_bdev_io *bdev_io = ctx; 5454 5455 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5456 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5457 } 5458 5459 static int 5460 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5461 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5462 spdk_bdev_io_completion_cb cb, void *cb_arg) 5463 { 5464 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5465 struct spdk_bdev_io *bdev_io; 5466 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5467 5468 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5469 5470 /* Check if offset_blocks is valid looking at the validity of one block */ 5471 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5472 return -EINVAL; 5473 } 5474 5475 bdev_io = bdev_channel_get_io(channel); 5476 if (!bdev_io) { 5477 return -ENOMEM; 5478 } 5479 5480 bdev_io->internal.ch = channel; 5481 bdev_io->internal.desc = desc; 5482 bdev_io->type = io_type; 5483 bdev_io->u.bdev.offset_blocks = offset_blocks; 5484 bdev_io->u.bdev.memory_domain = NULL; 5485 bdev_io->u.bdev.memory_domain_ctx = NULL; 5486 bdev_io->u.bdev.accel_sequence = NULL; 5487 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5488 5489 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5490 /* In case bdev doesn't support seek to next data/hole offset, 5491 * it is assumed that only data and no holes are present */ 5492 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5493 bdev_io->u.bdev.seek.offset = offset_blocks; 5494 } else { 5495 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5496 } 5497 5498 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5499 return 0; 5500 } 5501 5502 bdev_io_submit(bdev_io); 5503 return 0; 5504 } 5505 5506 int 5507 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5508 uint64_t offset_blocks, 5509 spdk_bdev_io_completion_cb cb, void *cb_arg) 5510 { 5511 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5512 } 5513 5514 int 5515 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5516 uint64_t offset_blocks, 5517 spdk_bdev_io_completion_cb cb, void *cb_arg) 5518 { 5519 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5520 } 5521 5522 uint64_t 5523 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 5524 { 5525 return bdev_io->u.bdev.seek.offset; 5526 } 5527 5528 static int 5529 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5530 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5531 spdk_bdev_io_completion_cb cb, void *cb_arg) 5532 { 5533 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5534 struct spdk_bdev_io *bdev_io; 5535 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5536 5537 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5538 return -EINVAL; 5539 } 5540 5541 bdev_io = bdev_channel_get_io(channel); 5542 if (!bdev_io) { 5543 return -ENOMEM; 5544 } 5545 5546 bdev_io->internal.ch = channel; 5547 bdev_io->internal.desc = desc; 5548 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5549 bdev_io->u.bdev.iovs = &bdev_io->iov; 5550 bdev_io->u.bdev.iovs[0].iov_base = buf; 5551 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc); 5552 bdev_io->u.bdev.iovcnt = 1; 5553 bdev_io->u.bdev.md_buf = md_buf; 5554 bdev_io->u.bdev.num_blocks = num_blocks; 5555 bdev_io->u.bdev.offset_blocks = offset_blocks; 5556 bdev_io->u.bdev.memory_domain = NULL; 5557 bdev_io->u.bdev.memory_domain_ctx = NULL; 5558 bdev_io->u.bdev.accel_sequence = NULL; 5559 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5560 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5561 5562 bdev_io_submit(bdev_io); 5563 return 0; 5564 } 5565 5566 int 5567 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5568 void *buf, uint64_t offset, uint64_t nbytes, 5569 spdk_bdev_io_completion_cb cb, void *cb_arg) 5570 { 5571 uint64_t offset_blocks, num_blocks; 5572 5573 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5574 return -EINVAL; 5575 } 5576 5577 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5578 } 5579 5580 int 5581 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5582 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5583 spdk_bdev_io_completion_cb cb, void *cb_arg) 5584 { 5585 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5586 } 5587 5588 int 5589 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5590 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5591 spdk_bdev_io_completion_cb cb, void *cb_arg) 5592 { 5593 struct iovec iov = { 5594 .iov_base = buf, 5595 }; 5596 5597 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5598 return -EINVAL; 5599 } 5600 5601 if (md_buf && !_is_buf_allocated(&iov)) { 5602 return -EINVAL; 5603 } 5604 5605 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5606 cb, cb_arg); 5607 } 5608 5609 int 5610 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5611 struct iovec *iov, int iovcnt, 5612 uint64_t offset, uint64_t nbytes, 5613 spdk_bdev_io_completion_cb cb, void *cb_arg) 5614 { 5615 uint64_t offset_blocks, num_blocks; 5616 5617 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5618 return -EINVAL; 5619 } 5620 5621 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5622 } 5623 5624 static int 5625 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5626 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5627 uint64_t num_blocks, struct 
spdk_memory_domain *domain, void *domain_ctx, 5628 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5629 spdk_bdev_io_completion_cb cb, void *cb_arg) 5630 { 5631 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5632 struct spdk_bdev_io *bdev_io; 5633 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5634 5635 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5636 return -EINVAL; 5637 } 5638 5639 bdev_io = bdev_channel_get_io(channel); 5640 if (spdk_unlikely(!bdev_io)) { 5641 return -ENOMEM; 5642 } 5643 5644 bdev_io->internal.ch = channel; 5645 bdev_io->internal.desc = desc; 5646 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5647 bdev_io->u.bdev.iovs = iov; 5648 bdev_io->u.bdev.iovcnt = iovcnt; 5649 bdev_io->u.bdev.md_buf = md_buf; 5650 bdev_io->u.bdev.num_blocks = num_blocks; 5651 bdev_io->u.bdev.offset_blocks = offset_blocks; 5652 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5653 5654 if (seq != NULL) { 5655 bdev_io->internal.f.has_accel_sequence = true; 5656 bdev_io->internal.accel_sequence = seq; 5657 } 5658 5659 if (domain != NULL) { 5660 bdev_io->internal.f.has_memory_domain = true; 5661 bdev_io->internal.memory_domain = domain; 5662 bdev_io->internal.memory_domain_ctx = domain_ctx; 5663 } 5664 5665 bdev_io->u.bdev.memory_domain = domain; 5666 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5667 bdev_io->u.bdev.accel_sequence = seq; 5668 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5669 5670 _bdev_io_submit_ext(desc, bdev_io); 5671 5672 return 0; 5673 } 5674 5675 int 5676 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5677 struct iovec *iov, int iovcnt, 5678 uint64_t offset_blocks, uint64_t num_blocks, 5679 spdk_bdev_io_completion_cb cb, void *cb_arg) 5680 { 5681 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5682 5683 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5684 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5685 } 5686 5687 int 5688 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5689 struct iovec *iov, int iovcnt, void *md_buf, 5690 uint64_t offset_blocks, uint64_t num_blocks, 5691 spdk_bdev_io_completion_cb cb, void *cb_arg) 5692 { 5693 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5694 5695 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5696 return -EINVAL; 5697 } 5698 5699 if (md_buf && !_is_buf_allocated(iov)) { 5700 return -EINVAL; 5701 } 5702 5703 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5704 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5705 } 5706 5707 static inline bool 5708 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5709 { 5710 /* 5711 * We check that the opts size is at least as large as it was when we first 5712 * introduced spdk_bdev_ext_io_opts (ac6f2bdd8d), since access to those members 5713 * is not checked internally.
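* For example, a caller compiled against an older SPDK release passes a smaller opts->size,
* and bdev_get_ext_io_opt() below falls back to its default argument for any field that
* lies beyond the size the caller actually provided.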
5714 */ 5715 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5716 sizeof(opts->metadata) && 5717 opts->size <= sizeof(*opts) && 5718 /* When memory domain is used, the user must provide data buffers */ 5719 (!opts->memory_domain || (iov && iov[0].iov_base)); 5720 } 5721 5722 int 5723 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5724 struct iovec *iov, int iovcnt, 5725 uint64_t offset_blocks, uint64_t num_blocks, 5726 spdk_bdev_io_completion_cb cb, void *cb_arg, 5727 struct spdk_bdev_ext_io_opts *opts) 5728 { 5729 struct spdk_memory_domain *domain = NULL; 5730 struct spdk_accel_sequence *seq = NULL; 5731 void *domain_ctx = NULL, *md = NULL; 5732 uint32_t dif_check_flags = 0; 5733 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5734 5735 if (opts) { 5736 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5737 return -EINVAL; 5738 } 5739 5740 md = opts->metadata; 5741 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5742 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5743 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5744 if (md) { 5745 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5746 return -EINVAL; 5747 } 5748 5749 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5750 return -EINVAL; 5751 } 5752 5753 if (spdk_unlikely(seq != NULL)) { 5754 return -EINVAL; 5755 } 5756 } 5757 } 5758 5759 dif_check_flags = bdev->dif_check_flags & 5760 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5761 5762 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5763 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5764 } 5765 5766 static int 5767 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5768 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5769 spdk_bdev_io_completion_cb cb, void *cb_arg) 5770 { 5771 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5772 struct spdk_bdev_io *bdev_io; 5773 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5774 5775 if (!desc->write) { 5776 return -EBADF; 5777 } 5778 5779 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5780 return -EINVAL; 5781 } 5782 5783 bdev_io = bdev_channel_get_io(channel); 5784 if (!bdev_io) { 5785 return -ENOMEM; 5786 } 5787 5788 bdev_io->internal.ch = channel; 5789 bdev_io->internal.desc = desc; 5790 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5791 bdev_io->u.bdev.iovs = &bdev_io->iov; 5792 bdev_io->u.bdev.iovs[0].iov_base = buf; 5793 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc); 5794 bdev_io->u.bdev.iovcnt = 1; 5795 bdev_io->u.bdev.md_buf = md_buf; 5796 bdev_io->u.bdev.num_blocks = num_blocks; 5797 bdev_io->u.bdev.offset_blocks = offset_blocks; 5798 bdev_io->u.bdev.memory_domain = NULL; 5799 bdev_io->u.bdev.memory_domain_ctx = NULL; 5800 bdev_io->u.bdev.accel_sequence = NULL; 5801 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5802 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5803 5804 bdev_io_submit(bdev_io); 5805 return 0; 5806 } 5807 5808 int 5809 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5810 void *buf, uint64_t offset, uint64_t nbytes, 5811 spdk_bdev_io_completion_cb cb, void *cb_arg) 5812 { 5813 uint64_t offset_blocks, num_blocks; 5814 5815 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5816 return -EINVAL; 5817 } 5818 5819 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, 
num_blocks, cb, cb_arg); 5820 } 5821 5822 int 5823 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5824 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5825 spdk_bdev_io_completion_cb cb, void *cb_arg) 5826 { 5827 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5828 cb, cb_arg); 5829 } 5830 5831 int 5832 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5833 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5834 spdk_bdev_io_completion_cb cb, void *cb_arg) 5835 { 5836 struct iovec iov = { 5837 .iov_base = buf, 5838 }; 5839 5840 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5841 return -EINVAL; 5842 } 5843 5844 if (md_buf && !_is_buf_allocated(&iov)) { 5845 return -EINVAL; 5846 } 5847 5848 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5849 cb, cb_arg); 5850 } 5851 5852 static int 5853 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5854 struct iovec *iov, int iovcnt, void *md_buf, 5855 uint64_t offset_blocks, uint64_t num_blocks, 5856 struct spdk_memory_domain *domain, void *domain_ctx, 5857 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5858 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 5859 spdk_bdev_io_completion_cb cb, void *cb_arg) 5860 { 5861 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5862 struct spdk_bdev_io *bdev_io; 5863 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5864 5865 if (spdk_unlikely(!desc->write)) { 5866 return -EBADF; 5867 } 5868 5869 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5870 return -EINVAL; 5871 } 5872 5873 bdev_io = bdev_channel_get_io(channel); 5874 if (spdk_unlikely(!bdev_io)) { 5875 return -ENOMEM; 5876 } 5877 5878 bdev_io->internal.ch = channel; 5879 bdev_io->internal.desc = desc; 5880 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5881 bdev_io->u.bdev.iovs = iov; 5882 bdev_io->u.bdev.iovcnt = iovcnt; 5883 bdev_io->u.bdev.md_buf = md_buf; 5884 bdev_io->u.bdev.num_blocks = num_blocks; 5885 bdev_io->u.bdev.offset_blocks = offset_blocks; 5886 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5887 if (seq != NULL) { 5888 bdev_io->internal.f.has_accel_sequence = true; 5889 bdev_io->internal.accel_sequence = seq; 5890 } 5891 5892 if (domain != NULL) { 5893 bdev_io->internal.f.has_memory_domain = true; 5894 bdev_io->internal.memory_domain = domain; 5895 bdev_io->internal.memory_domain_ctx = domain_ctx; 5896 } 5897 5898 bdev_io->u.bdev.memory_domain = domain; 5899 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5900 bdev_io->u.bdev.accel_sequence = seq; 5901 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5902 bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw; 5903 bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw; 5904 5905 _bdev_io_submit_ext(desc, bdev_io); 5906 5907 return 0; 5908 } 5909 5910 int 5911 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5912 struct iovec *iov, int iovcnt, 5913 uint64_t offset, uint64_t len, 5914 spdk_bdev_io_completion_cb cb, void *cb_arg) 5915 { 5916 uint64_t offset_blocks, num_blocks; 5917 5918 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) { 5919 return -EINVAL; 5920 } 5921 5922 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5923 } 5924 5925 int 5926 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5927 
struct iovec *iov, int iovcnt, 5928 uint64_t offset_blocks, uint64_t num_blocks, 5929 spdk_bdev_io_completion_cb cb, void *cb_arg) 5930 { 5931 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5932 5933 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5934 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5935 cb, cb_arg); 5936 } 5937 5938 int 5939 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5940 struct iovec *iov, int iovcnt, void *md_buf, 5941 uint64_t offset_blocks, uint64_t num_blocks, 5942 spdk_bdev_io_completion_cb cb, void *cb_arg) 5943 { 5944 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5945 5946 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5947 return -EINVAL; 5948 } 5949 5950 if (md_buf && !_is_buf_allocated(iov)) { 5951 return -EINVAL; 5952 } 5953 5954 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5955 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5956 cb, cb_arg); 5957 } 5958 5959 int 5960 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5961 struct iovec *iov, int iovcnt, 5962 uint64_t offset_blocks, uint64_t num_blocks, 5963 spdk_bdev_io_completion_cb cb, void *cb_arg, 5964 struct spdk_bdev_ext_io_opts *opts) 5965 { 5966 struct spdk_memory_domain *domain = NULL; 5967 struct spdk_accel_sequence *seq = NULL; 5968 void *domain_ctx = NULL, *md = NULL; 5969 uint32_t dif_check_flags = 0; 5970 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5971 uint32_t nvme_cdw12_raw = 0; 5972 uint32_t nvme_cdw13_raw = 0; 5973 5974 if (opts) { 5975 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5976 return -EINVAL; 5977 } 5978 md = opts->metadata; 5979 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5980 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5981 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5982 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 5983 nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0); 5984 if (md) { 5985 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5986 return -EINVAL; 5987 } 5988 5989 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5990 return -EINVAL; 5991 } 5992 5993 if (spdk_unlikely(seq != NULL)) { 5994 return -EINVAL; 5995 } 5996 } 5997 } 5998 5999 dif_check_flags = bdev->dif_check_flags & 6000 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 6001 6002 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 6003 domain, domain_ctx, seq, dif_check_flags, 6004 nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg); 6005 } 6006 6007 static void 6008 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6009 { 6010 struct spdk_bdev_io *parent_io = cb_arg; 6011 struct spdk_bdev *bdev = parent_io->bdev; 6012 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 6013 int i, rc = 0; 6014 6015 if (!success) { 6016 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6017 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6018 spdk_bdev_free_io(bdev_io); 6019 return; 6020 } 6021 6022 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 6023 rc = memcmp(read_buf, 6024 parent_io->u.bdev.iovs[i].iov_base, 6025 parent_io->u.bdev.iovs[i].iov_len); 6026 if (rc) { 6027 break; 6028 } 6029 read_buf += parent_io->u.bdev.iovs[i].iov_len; 6030 } 6031 6032 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 6033 
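/* When metadata is kept in a separate buffer, compare it as well before declaring a match. */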
rc = memcmp(bdev_io->u.bdev.md_buf, 6034 parent_io->u.bdev.md_buf, 6035 spdk_bdev_get_md_size(bdev)); 6036 } 6037 6038 spdk_bdev_free_io(bdev_io); 6039 6040 if (rc == 0) { 6041 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6042 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 6043 } else { 6044 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 6045 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6046 } 6047 } 6048 6049 static void 6050 bdev_compare_do_read(void *_bdev_io) 6051 { 6052 struct spdk_bdev_io *bdev_io = _bdev_io; 6053 int rc; 6054 6055 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 6056 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 6057 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6058 bdev_compare_do_read_done, bdev_io); 6059 6060 if (rc == -ENOMEM) { 6061 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 6062 } else if (rc != 0) { 6063 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6064 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6065 } 6066 } 6067 6068 static int 6069 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6070 struct iovec *iov, int iovcnt, void *md_buf, 6071 uint64_t offset_blocks, uint64_t num_blocks, 6072 spdk_bdev_io_completion_cb cb, void *cb_arg) 6073 { 6074 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6075 struct spdk_bdev_io *bdev_io; 6076 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6077 6078 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6079 return -EINVAL; 6080 } 6081 6082 bdev_io = bdev_channel_get_io(channel); 6083 if (!bdev_io) { 6084 return -ENOMEM; 6085 } 6086 6087 bdev_io->internal.ch = channel; 6088 bdev_io->internal.desc = desc; 6089 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6090 bdev_io->u.bdev.iovs = iov; 6091 bdev_io->u.bdev.iovcnt = iovcnt; 6092 bdev_io->u.bdev.md_buf = md_buf; 6093 bdev_io->u.bdev.num_blocks = num_blocks; 6094 bdev_io->u.bdev.offset_blocks = offset_blocks; 6095 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6096 bdev_io->u.bdev.memory_domain = NULL; 6097 bdev_io->u.bdev.memory_domain_ctx = NULL; 6098 bdev_io->u.bdev.accel_sequence = NULL; 6099 6100 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6101 bdev_io_submit(bdev_io); 6102 return 0; 6103 } 6104 6105 bdev_compare_do_read(bdev_io); 6106 6107 return 0; 6108 } 6109 6110 int 6111 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6112 struct iovec *iov, int iovcnt, 6113 uint64_t offset_blocks, uint64_t num_blocks, 6114 spdk_bdev_io_completion_cb cb, void *cb_arg) 6115 { 6116 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 6117 num_blocks, cb, cb_arg); 6118 } 6119 6120 int 6121 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6122 struct iovec *iov, int iovcnt, void *md_buf, 6123 uint64_t offset_blocks, uint64_t num_blocks, 6124 spdk_bdev_io_completion_cb cb, void *cb_arg) 6125 { 6126 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6127 return -EINVAL; 6128 } 6129 6130 if (md_buf && !_is_buf_allocated(iov)) { 6131 return -EINVAL; 6132 } 6133 6134 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 6135 num_blocks, cb, cb_arg); 6136 } 6137 6138 static int 6139 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6140 void *buf, void *md_buf, uint64_t 
offset_blocks, uint64_t num_blocks, 6141 spdk_bdev_io_completion_cb cb, void *cb_arg) 6142 { 6143 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6144 struct spdk_bdev_io *bdev_io; 6145 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6146 6147 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6148 return -EINVAL; 6149 } 6150 6151 bdev_io = bdev_channel_get_io(channel); 6152 if (!bdev_io) { 6153 return -ENOMEM; 6154 } 6155 6156 bdev_io->internal.ch = channel; 6157 bdev_io->internal.desc = desc; 6158 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6159 bdev_io->u.bdev.iovs = &bdev_io->iov; 6160 bdev_io->u.bdev.iovs[0].iov_base = buf; 6161 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc); 6162 bdev_io->u.bdev.iovcnt = 1; 6163 bdev_io->u.bdev.md_buf = md_buf; 6164 bdev_io->u.bdev.num_blocks = num_blocks; 6165 bdev_io->u.bdev.offset_blocks = offset_blocks; 6166 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6167 bdev_io->u.bdev.memory_domain = NULL; 6168 bdev_io->u.bdev.memory_domain_ctx = NULL; 6169 bdev_io->u.bdev.accel_sequence = NULL; 6170 6171 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6172 bdev_io_submit(bdev_io); 6173 return 0; 6174 } 6175 6176 bdev_compare_do_read(bdev_io); 6177 6178 return 0; 6179 } 6180 6181 int 6182 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6183 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6184 spdk_bdev_io_completion_cb cb, void *cb_arg) 6185 { 6186 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6187 cb, cb_arg); 6188 } 6189 6190 int 6191 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6192 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6193 spdk_bdev_io_completion_cb cb, void *cb_arg) 6194 { 6195 struct iovec iov = { 6196 .iov_base = buf, 6197 }; 6198 6199 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6200 return -EINVAL; 6201 } 6202 6203 if (md_buf && !_is_buf_allocated(&iov)) { 6204 return -EINVAL; 6205 } 6206 6207 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6208 cb, cb_arg); 6209 } 6210 6211 static void 6212 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 6213 { 6214 struct spdk_bdev_io *bdev_io = ctx; 6215 6216 if (unlock_status) { 6217 SPDK_ERRLOG("LBA range unlock failed\n"); 6218 } 6219 6220 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 6221 false, bdev_io->internal.caller_ctx); 6222 } 6223 6224 static void 6225 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 6226 { 6227 bdev_io->internal.status = status; 6228 6229 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 6230 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6231 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 6232 } 6233 6234 static void 6235 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6236 { 6237 struct spdk_bdev_io *parent_io = cb_arg; 6238 6239 if (!success) { 6240 SPDK_ERRLOG("Compare and write operation failed\n"); 6241 } 6242 6243 spdk_bdev_free_io(bdev_io); 6244 6245 bdev_comparev_and_writev_blocks_unlock(parent_io, 6246 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 6247 } 6248 6249 static void 6250 bdev_compare_and_write_do_write(void *_bdev_io) 6251 { 6252 struct spdk_bdev_io *bdev_io = _bdev_io; 6253 int rc; 6254 6255 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 6256 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6257 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 6258 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6259 bdev_compare_and_write_do_write_done, bdev_io); 6260 6261 6262 if (rc == -ENOMEM) { 6263 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 6264 } else if (rc != 0) { 6265 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6266 } 6267 } 6268 6269 static void 6270 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6271 { 6272 struct spdk_bdev_io *parent_io = cb_arg; 6273 6274 spdk_bdev_free_io(bdev_io); 6275 6276 if (!success) { 6277 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 6278 return; 6279 } 6280 6281 bdev_compare_and_write_do_write(parent_io); 6282 } 6283 6284 static void 6285 bdev_compare_and_write_do_compare(void *_bdev_io) 6286 { 6287 struct spdk_bdev_io *bdev_io = _bdev_io; 6288 int rc; 6289 6290 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 6291 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 6292 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6293 bdev_compare_and_write_do_compare_done, bdev_io); 6294 6295 if (rc == -ENOMEM) { 6296 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 6297 } else if (rc != 0) { 6298 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 6299 } 6300 } 6301 6302 static void 6303 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 6304 { 6305 struct spdk_bdev_io *bdev_io = ctx; 6306 6307 if (status) { 6308 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 6309 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6310 return; 6311 } 6312 6313 bdev_compare_and_write_do_compare(bdev_io); 6314 } 6315 6316 int 6317 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6318 struct iovec *compare_iov, int compare_iovcnt, 6319 struct iovec *write_iov, int write_iovcnt, 6320 uint64_t offset_blocks, uint64_t num_blocks, 6321 spdk_bdev_io_completion_cb cb, void *cb_arg) 6322 { 6323 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6324 struct spdk_bdev_io *bdev_io; 6325 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6326 6327 if (!desc->write) { 6328 return -EBADF; 6329 } 6330 6331 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6332 return -EINVAL; 6333 } 6334 6335 if (num_blocks > bdev->acwu) { 6336 return -EINVAL; 6337 } 6338 6339 bdev_io = bdev_channel_get_io(channel); 6340 if (!bdev_io) { 6341 return -ENOMEM; 6342 } 6343 6344 bdev_io->internal.ch = channel; 6345 bdev_io->internal.desc = desc; 6346 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6347 bdev_io->u.bdev.iovs = compare_iov; 6348 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6349 bdev_io->u.bdev.fused_iovs = write_iov; 6350 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6351 bdev_io->u.bdev.md_buf = NULL; 6352 bdev_io->u.bdev.num_blocks = num_blocks; 6353 bdev_io->u.bdev.offset_blocks = offset_blocks; 6354 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6355 
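/* If the module supports COMPARE_AND_WRITE natively, the request is submitted as is below;
 * otherwise it is emulated by locking the LBA range, running the compare, issuing the write
 * only if the compare matches, and unlocking the range when the operation completes
 * (see bdev_comparev_and_writev_blocks_locked()).
 */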
bdev_io->u.bdev.memory_domain = NULL; 6356 bdev_io->u.bdev.memory_domain_ctx = NULL; 6357 bdev_io->u.bdev.accel_sequence = NULL; 6358 6359 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6360 bdev_io_submit(bdev_io); 6361 return 0; 6362 } 6363 6364 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6365 bdev_comparev_and_writev_blocks_locked, bdev_io); 6366 } 6367 6368 int 6369 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6370 struct iovec *iov, int iovcnt, 6371 uint64_t offset_blocks, uint64_t num_blocks, 6372 bool populate, 6373 spdk_bdev_io_completion_cb cb, void *cb_arg) 6374 { 6375 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6376 struct spdk_bdev_io *bdev_io; 6377 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6378 6379 if (!desc->write) { 6380 return -EBADF; 6381 } 6382 6383 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6384 return -EINVAL; 6385 } 6386 6387 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6388 return -ENOTSUP; 6389 } 6390 6391 bdev_io = bdev_channel_get_io(channel); 6392 if (!bdev_io) { 6393 return -ENOMEM; 6394 } 6395 6396 bdev_io->internal.ch = channel; 6397 bdev_io->internal.desc = desc; 6398 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6399 bdev_io->u.bdev.num_blocks = num_blocks; 6400 bdev_io->u.bdev.offset_blocks = offset_blocks; 6401 bdev_io->u.bdev.iovs = iov; 6402 bdev_io->u.bdev.iovcnt = iovcnt; 6403 bdev_io->u.bdev.md_buf = NULL; 6404 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6405 bdev_io->u.bdev.zcopy.commit = 0; 6406 bdev_io->u.bdev.zcopy.start = 1; 6407 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6408 bdev_io->u.bdev.memory_domain = NULL; 6409 bdev_io->u.bdev.memory_domain_ctx = NULL; 6410 bdev_io->u.bdev.accel_sequence = NULL; 6411 6412 bdev_io_submit(bdev_io); 6413 6414 return 0; 6415 } 6416 6417 int 6418 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6419 spdk_bdev_io_completion_cb cb, void *cb_arg) 6420 { 6421 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6422 return -EINVAL; 6423 } 6424 6425 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 6426 bdev_io->u.bdev.zcopy.start = 0; 6427 bdev_io->internal.caller_ctx = cb_arg; 6428 bdev_io->internal.cb = cb; 6429 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 6430 6431 bdev_io_submit(bdev_io); 6432 6433 return 0; 6434 } 6435 6436 int 6437 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6438 uint64_t offset, uint64_t len, 6439 spdk_bdev_io_completion_cb cb, void *cb_arg) 6440 { 6441 uint64_t offset_blocks, num_blocks; 6442 6443 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) { 6444 return -EINVAL; 6445 } 6446 6447 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6448 } 6449 6450 int 6451 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6452 uint64_t offset_blocks, uint64_t num_blocks, 6453 spdk_bdev_io_completion_cb cb, void *cb_arg) 6454 { 6455 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6456 struct spdk_bdev_io *bdev_io; 6457 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6458 6459 if (!desc->write) { 6460 return -EBADF; 6461 } 6462 6463 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6464 return -EINVAL; 6465 } 6466 6467 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 6468 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 6469 return -ENOTSUP; 6470 } 6471 6472 bdev_io = bdev_channel_get_io(channel); 6473 6474 if (!bdev_io) { 6475 return -ENOMEM; 6476 } 6477 6478 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 6479 bdev_io->internal.ch = channel; 6480 bdev_io->internal.desc = desc; 6481 bdev_io->u.bdev.offset_blocks = offset_blocks; 6482 bdev_io->u.bdev.num_blocks = num_blocks; 6483 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6484 bdev_io->u.bdev.memory_domain = NULL; 6485 bdev_io->u.bdev.memory_domain_ctx = NULL; 6486 bdev_io->u.bdev.accel_sequence = NULL; 6487 6488 /* If the write_zeroes size is large and should be split, use the generic split 6489 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not. 6490 * 6491 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 6492 * or emulate it using a regular write request otherwise.
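* The emulated path fills the range from an internal zero buffer, which is why the
* (possibly metadata-extended) block size must not exceed ZERO_BUFFER_SIZE below.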
6493 */ 6494 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6495 bdev_io->internal.f.split) { 6496 bdev_io_submit(bdev_io); 6497 return 0; 6498 } 6499 6500 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6501 6502 return bdev_write_zero_buffer(bdev_io); 6503 } 6504 6505 int 6506 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6507 uint64_t offset, uint64_t nbytes, 6508 spdk_bdev_io_completion_cb cb, void *cb_arg) 6509 { 6510 uint64_t offset_blocks, num_blocks; 6511 6512 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 6513 return -EINVAL; 6514 } 6515 6516 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6517 } 6518 6519 static void 6520 bdev_io_complete_cb(void *ctx) 6521 { 6522 struct spdk_bdev_io *bdev_io = ctx; 6523 6524 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6525 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6526 } 6527 6528 int 6529 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6530 uint64_t offset_blocks, uint64_t num_blocks, 6531 spdk_bdev_io_completion_cb cb, void *cb_arg) 6532 { 6533 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6534 struct spdk_bdev_io *bdev_io; 6535 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6536 6537 if (!desc->write) { 6538 return -EBADF; 6539 } 6540 6541 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6542 return -EINVAL; 6543 } 6544 6545 bdev_io = bdev_channel_get_io(channel); 6546 if (!bdev_io) { 6547 return -ENOMEM; 6548 } 6549 6550 bdev_io->internal.ch = channel; 6551 bdev_io->internal.desc = desc; 6552 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6553 6554 bdev_io->u.bdev.iovs = &bdev_io->iov; 6555 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6556 bdev_io->u.bdev.iovs[0].iov_len = 0; 6557 bdev_io->u.bdev.iovcnt = 1; 6558 6559 bdev_io->u.bdev.offset_blocks = offset_blocks; 6560 bdev_io->u.bdev.num_blocks = num_blocks; 6561 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6562 bdev_io->u.bdev.memory_domain = NULL; 6563 bdev_io->u.bdev.memory_domain_ctx = NULL; 6564 bdev_io->u.bdev.accel_sequence = NULL; 6565 6566 if (num_blocks == 0) { 6567 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6568 return 0; 6569 } 6570 6571 bdev_io_submit(bdev_io); 6572 return 0; 6573 } 6574 6575 int 6576 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6577 uint64_t offset, uint64_t length, 6578 spdk_bdev_io_completion_cb cb, void *cb_arg) 6579 { 6580 uint64_t offset_blocks, num_blocks; 6581 6582 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, length, &num_blocks) != 0) { 6583 return -EINVAL; 6584 } 6585 6586 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6587 } 6588 6589 int 6590 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6591 uint64_t offset_blocks, uint64_t num_blocks, 6592 spdk_bdev_io_completion_cb cb, void *cb_arg) 6593 { 6594 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6595 struct spdk_bdev_io *bdev_io; 6596 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6597 6598 if (!desc->write) { 6599 return -EBADF; 6600 } 6601 6602 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH))) { 6603 return -ENOTSUP; 6604 } 6605 6606 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6607 return -EINVAL; 6608 } 6609 6610 bdev_io = bdev_channel_get_io(channel); 6611 if (!bdev_io) { 6612 return 
-ENOMEM; 6613 } 6614 6615 bdev_io->internal.ch = channel; 6616 bdev_io->internal.desc = desc; 6617 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6618 bdev_io->u.bdev.iovs = NULL; 6619 bdev_io->u.bdev.iovcnt = 0; 6620 bdev_io->u.bdev.offset_blocks = offset_blocks; 6621 bdev_io->u.bdev.num_blocks = num_blocks; 6622 bdev_io->u.bdev.memory_domain = NULL; 6623 bdev_io->u.bdev.memory_domain_ctx = NULL; 6624 bdev_io->u.bdev.accel_sequence = NULL; 6625 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6626 6627 bdev_io_submit(bdev_io); 6628 return 0; 6629 } 6630 6631 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6632 6633 static void 6634 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6635 { 6636 struct spdk_bdev_io *bdev_io = _ctx; 6637 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 6638 6639 if (status == -EBUSY) { 6640 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6641 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6642 bdev_io, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6643 } else { 6644 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6645 /* If outstanding IOs are still present and reset_io_drain_timeout 6646 * seconds passed, start the reset. */ 6647 bdev_io_submit_reset(bdev_io); 6648 } else { 6649 /* We still have in progress memory domain pull/push or we're 6650 * executing accel sequence. Since we cannot abort either of those 6651 * operations, fail the reset request. */ 6652 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6653 } 6654 } 6655 } else { 6656 SPDK_DEBUGLOG(bdev, 6657 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6658 ch->bdev->name); 6659 /* Mark the completion status as a SUCCESS and complete the reset. */ 6660 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6661 } 6662 } 6663 6664 static void 6665 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6666 struct spdk_io_channel *io_ch, void *_ctx) 6667 { 6668 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6669 int status = 0; 6670 6671 if (cur_ch->io_outstanding > 0 || 6672 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6673 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6674 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 6675 * further iteration over the rest of the channels and pass non-zero status 6676 * to the callback function. 
*/ 6677 status = -EBUSY; 6678 } 6679 spdk_bdev_for_each_channel_continue(i, status); 6680 } 6681 6682 static int 6683 bdev_reset_poll_for_outstanding_io(void *ctx) 6684 { 6685 struct spdk_bdev_io *bdev_io = ctx; 6686 6687 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6688 spdk_bdev_for_each_channel(bdev_io->bdev, bdev_reset_check_outstanding_io, bdev_io, 6689 bdev_reset_check_outstanding_io_done); 6690 6691 return SPDK_POLLER_BUSY; 6692 } 6693 6694 static void 6695 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6696 { 6697 struct spdk_bdev_io *bdev_io = _ctx; 6698 6699 if (bdev->reset_io_drain_timeout == 0) { 6700 bdev_io_submit_reset(bdev_io); 6701 return; 6702 } 6703 6704 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6705 (bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6706 6707 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6708 * submit the reset to the underlying module only if outstanding I/O 6709 * remain after reset_io_drain_timeout seconds have passed. */ 6710 spdk_bdev_for_each_channel(bdev, bdev_reset_check_outstanding_io, bdev_io, 6711 bdev_reset_check_outstanding_io_done); 6712 } 6713 6714 static void 6715 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6716 struct spdk_io_channel *ch, void *_ctx) 6717 { 6718 struct spdk_bdev_channel *channel; 6719 struct spdk_bdev_mgmt_channel *mgmt_channel; 6720 struct spdk_bdev_shared_resource *shared_resource; 6721 bdev_io_tailq_t tmp_queued; 6722 6723 TAILQ_INIT(&tmp_queued); 6724 6725 channel = __io_ch_to_bdev_ch(ch); 6726 shared_resource = channel->shared_resource; 6727 mgmt_channel = shared_resource->mgmt_ch; 6728 6729 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6730 6731 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6732 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6733 } 6734 6735 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6736 bdev_abort_all_buf_io(mgmt_channel, channel); 6737 bdev_abort_all_queued_io(&tmp_queued, channel); 6738 6739 spdk_bdev_for_each_channel_continue(i, 0); 6740 } 6741 6742 static void 6743 bdev_start_reset(struct spdk_bdev_io *bdev_io) 6744 { 6745 struct spdk_bdev *bdev = bdev_io->bdev; 6746 bool freeze_channel = false; 6747 6748 bdev_ch_add_to_io_submitted(bdev_io); 6749 6750 /** 6751 * Take a channel reference for the target bdev for the life of this 6752 * reset. This guards against the channel getting destroyed before 6753 * the reset is completed. We will release the reference when this 6754 * reset is completed. 
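* If another reset is already in progress for this bdev, the new request is queued below
* instead of freezing the channels a second time.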
6755 */ 6756 bdev_io->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6757 6758 spdk_spin_lock(&bdev->internal.spinlock); 6759 if (bdev->internal.reset_in_progress == NULL) { 6760 bdev->internal.reset_in_progress = bdev_io; 6761 freeze_channel = true; 6762 } else { 6763 TAILQ_INSERT_TAIL(&bdev->internal.queued_resets, bdev_io, internal.link); 6764 } 6765 spdk_spin_unlock(&bdev->internal.spinlock); 6766 6767 if (freeze_channel) { 6768 spdk_bdev_for_each_channel(bdev, bdev_reset_freeze_channel, bdev_io, 6769 bdev_reset_freeze_channel_done); 6770 } 6771 } 6772 6773 int 6774 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6775 spdk_bdev_io_completion_cb cb, void *cb_arg) 6776 { 6777 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6778 struct spdk_bdev_io *bdev_io; 6779 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6780 6781 bdev_io = bdev_channel_get_io(channel); 6782 if (!bdev_io) { 6783 return -ENOMEM; 6784 } 6785 6786 bdev_io->internal.ch = channel; 6787 bdev_io->internal.desc = desc; 6788 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6789 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6790 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6791 6792 bdev_start_reset(bdev_io); 6793 return 0; 6794 } 6795 6796 void 6797 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6798 struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode reset_mode) 6799 { 6800 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6801 6802 bdev_get_io_stat(stat, channel->stat); 6803 spdk_bdev_reset_io_stat(channel->stat, reset_mode); 6804 } 6805 6806 static void 6807 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6808 { 6809 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6810 6811 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6812 bdev_iostat_ctx->cb_arg, 0); 6813 free(bdev_iostat_ctx); 6814 } 6815 6816 static void 6817 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6818 struct spdk_io_channel *ch, void *_ctx) 6819 { 6820 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6821 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6822 6823 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6824 spdk_bdev_reset_io_stat(channel->stat, bdev_iostat_ctx->reset_mode); 6825 spdk_bdev_for_each_channel_continue(i, 0); 6826 } 6827 6828 void 6829 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6830 enum spdk_bdev_reset_stat_mode reset_mode, spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6831 { 6832 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6833 6834 assert(bdev != NULL); 6835 assert(stat != NULL); 6836 assert(cb != NULL); 6837 6838 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6839 if (bdev_iostat_ctx == NULL) { 6840 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6841 cb(bdev, stat, cb_arg, -ENOMEM); 6842 return; 6843 } 6844 6845 bdev_iostat_ctx->stat = stat; 6846 bdev_iostat_ctx->cb = cb; 6847 bdev_iostat_ctx->cb_arg = cb_arg; 6848 bdev_iostat_ctx->reset_mode = reset_mode; 6849 6850 /* Start with the statistics from previously deleted channels. */ 6851 spdk_spin_lock(&bdev->internal.spinlock); 6852 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6853 spdk_bdev_reset_io_stat(bdev->internal.stat, reset_mode); 6854 spdk_spin_unlock(&bdev->internal.spinlock); 6855 6856 /* Then iterate and add the statistics from each existing channel. 
*/ 6857 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6858 bdev_get_device_stat_done); 6859 } 6860 6861 struct bdev_iostat_reset_ctx { 6862 enum spdk_bdev_reset_stat_mode mode; 6863 bdev_reset_device_stat_cb cb; 6864 void *cb_arg; 6865 }; 6866 6867 static void 6868 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6869 { 6870 struct bdev_iostat_reset_ctx *ctx = _ctx; 6871 6872 ctx->cb(bdev, ctx->cb_arg, 0); 6873 6874 free(ctx); 6875 } 6876 6877 static void 6878 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6879 struct spdk_io_channel *ch, void *_ctx) 6880 { 6881 struct bdev_iostat_reset_ctx *ctx = _ctx; 6882 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6883 6884 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6885 6886 spdk_bdev_for_each_channel_continue(i, 0); 6887 } 6888 6889 void 6890 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6891 bdev_reset_device_stat_cb cb, void *cb_arg) 6892 { 6893 struct bdev_iostat_reset_ctx *ctx; 6894 6895 assert(bdev != NULL); 6896 assert(cb != NULL); 6897 6898 ctx = calloc(1, sizeof(*ctx)); 6899 if (ctx == NULL) { 6900 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6901 cb(bdev, cb_arg, -ENOMEM); 6902 return; 6903 } 6904 6905 ctx->mode = mode; 6906 ctx->cb = cb; 6907 ctx->cb_arg = cb_arg; 6908 6909 spdk_spin_lock(&bdev->internal.spinlock); 6910 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6911 spdk_spin_unlock(&bdev->internal.spinlock); 6912 6913 spdk_bdev_for_each_channel(bdev, 6914 bdev_reset_each_channel_stat, 6915 ctx, 6916 bdev_reset_device_stat_done); 6917 } 6918 6919 int 6920 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6921 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6922 spdk_bdev_io_completion_cb cb, void *cb_arg) 6923 { 6924 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6925 struct spdk_bdev_io *bdev_io; 6926 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6927 6928 if (!desc->write) { 6929 return -EBADF; 6930 } 6931 6932 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6933 return -ENOTSUP; 6934 } 6935 6936 bdev_io = bdev_channel_get_io(channel); 6937 if (!bdev_io) { 6938 return -ENOMEM; 6939 } 6940 6941 bdev_io->internal.ch = channel; 6942 bdev_io->internal.desc = desc; 6943 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6944 bdev_io->u.nvme_passthru.cmd = *cmd; 6945 bdev_io->u.nvme_passthru.buf = buf; 6946 bdev_io->u.nvme_passthru.nbytes = nbytes; 6947 bdev_io->u.nvme_passthru.md_buf = NULL; 6948 bdev_io->u.nvme_passthru.md_len = 0; 6949 6950 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6951 6952 bdev_io_submit(bdev_io); 6953 return 0; 6954 } 6955 6956 int 6957 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6958 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6959 spdk_bdev_io_completion_cb cb, void *cb_arg) 6960 { 6961 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6962 struct spdk_bdev_io *bdev_io; 6963 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6964 6965 if (!desc->write) { 6966 /* 6967 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6968 * to easily determine if the command is a read or write, but for now just 6969 * do not allow io_passthru with a read-only descriptor. 
6970 */ 6971 return -EBADF; 6972 } 6973 6974 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6975 return -ENOTSUP; 6976 } 6977 6978 bdev_io = bdev_channel_get_io(channel); 6979 if (!bdev_io) { 6980 return -ENOMEM; 6981 } 6982 6983 bdev_io->internal.ch = channel; 6984 bdev_io->internal.desc = desc; 6985 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6986 bdev_io->u.nvme_passthru.cmd = *cmd; 6987 bdev_io->u.nvme_passthru.buf = buf; 6988 bdev_io->u.nvme_passthru.nbytes = nbytes; 6989 bdev_io->u.nvme_passthru.md_buf = NULL; 6990 bdev_io->u.nvme_passthru.md_len = 0; 6991 6992 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6993 6994 bdev_io_submit(bdev_io); 6995 return 0; 6996 } 6997 6998 int 6999 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7000 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 7001 spdk_bdev_io_completion_cb cb, void *cb_arg) 7002 { 7003 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7004 struct spdk_bdev_io *bdev_io; 7005 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7006 7007 if (!desc->write) { 7008 /* 7009 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 7010 * to easily determine if the command is a read or write, but for now just 7011 * do not allow io_passthru with a read-only descriptor. 7012 */ 7013 return -EBADF; 7014 } 7015 7016 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 7017 return -ENOTSUP; 7018 } 7019 7020 bdev_io = bdev_channel_get_io(channel); 7021 if (!bdev_io) { 7022 return -ENOMEM; 7023 } 7024 7025 bdev_io->internal.ch = channel; 7026 bdev_io->internal.desc = desc; 7027 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 7028 bdev_io->u.nvme_passthru.cmd = *cmd; 7029 bdev_io->u.nvme_passthru.buf = buf; 7030 bdev_io->u.nvme_passthru.nbytes = nbytes; 7031 bdev_io->u.nvme_passthru.md_buf = md_buf; 7032 bdev_io->u.nvme_passthru.md_len = md_len; 7033 7034 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7035 7036 bdev_io_submit(bdev_io); 7037 return 0; 7038 } 7039 7040 int 7041 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 7042 struct spdk_io_channel *ch, 7043 const struct spdk_nvme_cmd *cmd, 7044 struct iovec *iov, int iovcnt, size_t nbytes, 7045 void *md_buf, size_t md_len, 7046 spdk_bdev_io_completion_cb cb, void *cb_arg) 7047 { 7048 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7049 struct spdk_bdev_io *bdev_io; 7050 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7051 7052 if (!desc->write) { 7053 /* 7054 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 7055 * to easily determine if the command is a read or write, but for now just 7056 * do not allow io_passthru with a read-only descriptor. 
7057 */ 7058 return -EBADF; 7059 } 7060 7061 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 7062 return -ENOTSUP; 7063 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 7064 return -ENOTSUP; 7065 } 7066 7067 bdev_io = bdev_channel_get_io(channel); 7068 if (!bdev_io) { 7069 return -ENOMEM; 7070 } 7071 7072 bdev_io->internal.ch = channel; 7073 bdev_io->internal.desc = desc; 7074 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 7075 bdev_io->u.nvme_passthru.cmd = *cmd; 7076 bdev_io->u.nvme_passthru.iovs = iov; 7077 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 7078 bdev_io->u.nvme_passthru.nbytes = nbytes; 7079 bdev_io->u.nvme_passthru.md_buf = md_buf; 7080 bdev_io->u.nvme_passthru.md_len = md_len; 7081 7082 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7083 7084 bdev_io_submit(bdev_io); 7085 return 0; 7086 } 7087 7088 static void bdev_abort_retry(void *ctx); 7089 static void bdev_abort(struct spdk_bdev_io *parent_io); 7090 7091 static void 7092 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 7093 { 7094 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 7095 struct spdk_bdev_io *parent_io = cb_arg; 7096 struct spdk_bdev_io *bio_to_abort, *tmp_io; 7097 7098 bio_to_abort = bdev_io->u.abort.bio_to_abort; 7099 7100 spdk_bdev_free_io(bdev_io); 7101 7102 if (!success) { 7103 /* Check if the target I/O completed in the meantime. */ 7104 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 7105 if (tmp_io == bio_to_abort) { 7106 break; 7107 } 7108 } 7109 7110 /* If the target I/O still exists, set the parent to failed. */ 7111 if (tmp_io != NULL) { 7112 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7113 } 7114 } 7115 7116 assert(parent_io->internal.f.split); 7117 7118 parent_io->internal.split.outstanding--; 7119 if (parent_io->internal.split.outstanding == 0) { 7120 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7121 bdev_abort_retry(parent_io); 7122 } else { 7123 bdev_io_complete(parent_io); 7124 } 7125 } 7126 } 7127 7128 static int 7129 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 7130 struct spdk_bdev_io *bio_to_abort, 7131 spdk_bdev_io_completion_cb cb, void *cb_arg) 7132 { 7133 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7134 struct spdk_bdev_io *bdev_io; 7135 7136 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 7137 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 7138 /* TODO: Abort reset or abort request. */ 7139 return -ENOTSUP; 7140 } 7141 7142 bdev_io = bdev_channel_get_io(channel); 7143 if (bdev_io == NULL) { 7144 return -ENOMEM; 7145 } 7146 7147 bdev_io->internal.ch = channel; 7148 bdev_io->internal.desc = desc; 7149 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7150 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7151 7152 if (bio_to_abort->internal.f.split) { 7153 assert(bdev_io_should_split(bio_to_abort)); 7154 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 7155 7156 /* Parent abort request is not submitted directly, but to manage its 7157 * execution add it to the submitted list here. 7158 */ 7159 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7160 bdev_ch_add_to_io_submitted(bdev_io); 7161 7162 bdev_abort(bdev_io); 7163 7164 return 0; 7165 } 7166 7167 bdev_io->u.abort.bio_to_abort = bio_to_abort; 7168 7169 /* Submit the abort request to the underlying bdev module. 
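* At this point the target I/O is known not to be split, so a single abort request is
* sufficient.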
*/ 7170 bdev_io_submit(bdev_io); 7171 7172 return 0; 7173 } 7174 7175 static bool 7176 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 7177 { 7178 struct spdk_bdev_io *iter; 7179 7180 TAILQ_FOREACH(iter, tailq, internal.link) { 7181 if (iter == bdev_io) { 7182 return true; 7183 } 7184 } 7185 7186 return false; 7187 } 7188 7189 static uint32_t 7190 _bdev_abort(struct spdk_bdev_io *parent_io) 7191 { 7192 struct spdk_bdev_desc *desc = parent_io->internal.desc; 7193 struct spdk_bdev_channel *channel = parent_io->internal.ch; 7194 void *bio_cb_arg; 7195 struct spdk_bdev_io *bio_to_abort; 7196 uint32_t matched_ios; 7197 int rc; 7198 7199 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 7200 7201 /* matched_ios is returned and will be kept by the caller. 7202 * 7203 * This function will be used for two cases, 1) the same cb_arg is used for 7204 * multiple I/Os, 2) a single large I/O is split into smaller ones. 7205 * Incrementing split_outstanding directly here may confuse readers, especially 7206 * for the 1st case. 7207 * 7208 * Completion of I/O abort is processed after stack unwinding. Hence this trick 7209 * works as expected. 7210 */ 7211 matched_ios = 0; 7212 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7213 7214 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 7215 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 7216 continue; 7217 } 7218 7219 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 7220 /* Any I/O which was submitted after this abort command should be excluded. */ 7221 continue; 7222 } 7223 7224 /* We can't abort a request that's being pushed/pulled or executed by accel */ 7225 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 7226 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 7227 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7228 break; 7229 } 7230 7231 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 7232 if (rc != 0) { 7233 if (rc == -ENOMEM) { 7234 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 7235 } else { 7236 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7237 } 7238 break; 7239 } 7240 matched_ios++; 7241 } 7242 7243 return matched_ios; 7244 } 7245 7246 static void 7247 bdev_abort_retry(void *ctx) 7248 { 7249 struct spdk_bdev_io *parent_io = ctx; 7250 uint32_t matched_ios; 7251 7252 matched_ios = _bdev_abort(parent_io); 7253 7254 if (matched_ios == 0) { 7255 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7256 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7257 } else { 7258 /* For retry, the case where no target I/O was found is a success 7259 * because it means the target I/Os completed in the meantime. 7260 */ 7261 bdev_io_complete(parent_io); 7262 } 7263 return; 7264 } 7265 7266 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7267 parent_io->internal.f.split = true; 7268 parent_io->internal.split.outstanding = matched_ios; 7269 } 7270 7271 static void 7272 bdev_abort(struct spdk_bdev_io *parent_io) 7273 { 7274 uint32_t matched_ios; 7275 7276 matched_ios = _bdev_abort(parent_io); 7277 7278 if (matched_ios == 0) { 7279 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7280 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7281 } else { 7282 /* The case where no target I/O was found is a failure.
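* This differs from bdev_abort_retry(), where finding no match is treated as success
* because the previously matched I/Os may simply have completed in the meantime.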
*/ 7283 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7284 bdev_io_complete(parent_io); 7285 } 7286 return; 7287 } 7288 7289 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7290 parent_io->internal.f.split = true; 7291 parent_io->internal.split.outstanding = matched_ios; 7292 } 7293 7294 int 7295 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7296 void *bio_cb_arg, 7297 spdk_bdev_io_completion_cb cb, void *cb_arg) 7298 { 7299 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7300 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7301 struct spdk_bdev_io *bdev_io; 7302 7303 if (bio_cb_arg == NULL) { 7304 return -EINVAL; 7305 } 7306 7307 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7308 return -ENOTSUP; 7309 } 7310 7311 bdev_io = bdev_channel_get_io(channel); 7312 if (bdev_io == NULL) { 7313 return -ENOMEM; 7314 } 7315 7316 bdev_io->internal.ch = channel; 7317 bdev_io->internal.desc = desc; 7318 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7319 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7320 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7321 7322 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7323 7324 /* Parent abort request is not submitted directly, but to manage its execution, 7325 * add it to the submitted list here. 7326 */ 7327 bdev_ch_add_to_io_submitted(bdev_io); 7328 7329 bdev_abort(bdev_io); 7330 7331 return 0; 7332 } 7333 7334 int 7335 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7336 struct spdk_bdev_io_wait_entry *entry) 7337 { 7338 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7339 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7340 7341 if (bdev != entry->bdev) { 7342 SPDK_ERRLOG("bdevs do not match\n"); 7343 return -EINVAL; 7344 } 7345 7346 if (mgmt_ch->per_thread_cache_count > 0) { 7347 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7348 return -EINVAL; 7349 } 7350 7351 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7352 return 0; 7353 } 7354 7355 static inline void 7356 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7357 { 7358 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7359 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7360 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7361 uint32_t blocklen = bdev_io->bdev->blocklen; 7362 7363 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7364 switch (bdev_io->type) { 7365 case SPDK_BDEV_IO_TYPE_READ: 7366 io_stat->bytes_read += num_blocks * blocklen; 7367 io_stat->num_read_ops++; 7368 io_stat->read_latency_ticks += tsc_diff; 7369 if (io_stat->max_read_latency_ticks < tsc_diff) { 7370 io_stat->max_read_latency_ticks = tsc_diff; 7371 } 7372 if (io_stat->min_read_latency_ticks > tsc_diff) { 7373 io_stat->min_read_latency_ticks = tsc_diff; 7374 } 7375 break; 7376 case SPDK_BDEV_IO_TYPE_WRITE: 7377 io_stat->bytes_written += num_blocks * blocklen; 7378 io_stat->num_write_ops++; 7379 io_stat->write_latency_ticks += tsc_diff; 7380 if (io_stat->max_write_latency_ticks < tsc_diff) { 7381 io_stat->max_write_latency_ticks = tsc_diff; 7382 } 7383 if (io_stat->min_write_latency_ticks > tsc_diff) { 7384 io_stat->min_write_latency_ticks = tsc_diff; 7385 } 7386 break; 7387 case SPDK_BDEV_IO_TYPE_UNMAP: 7388 io_stat->bytes_unmapped += num_blocks * blocklen; 7389 io_stat->num_unmap_ops++; 7390 io_stat->unmap_latency_ticks += tsc_diff; 7391 if 
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7392 io_stat->max_unmap_latency_ticks = tsc_diff; 7393 } 7394 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7395 io_stat->min_unmap_latency_ticks = tsc_diff; 7396 } 7397 break; 7398 case SPDK_BDEV_IO_TYPE_ZCOPY: 7399 /* Track the data in the start phase only */ 7400 if (bdev_io->u.bdev.zcopy.start) { 7401 if (bdev_io->u.bdev.zcopy.populate) { 7402 io_stat->bytes_read += num_blocks * blocklen; 7403 io_stat->num_read_ops++; 7404 io_stat->read_latency_ticks += tsc_diff; 7405 if (io_stat->max_read_latency_ticks < tsc_diff) { 7406 io_stat->max_read_latency_ticks = tsc_diff; 7407 } 7408 if (io_stat->min_read_latency_ticks > tsc_diff) { 7409 io_stat->min_read_latency_ticks = tsc_diff; 7410 } 7411 } else { 7412 io_stat->bytes_written += num_blocks * blocklen; 7413 io_stat->num_write_ops++; 7414 io_stat->write_latency_ticks += tsc_diff; 7415 if (io_stat->max_write_latency_ticks < tsc_diff) { 7416 io_stat->max_write_latency_ticks = tsc_diff; 7417 } 7418 if (io_stat->min_write_latency_ticks > tsc_diff) { 7419 io_stat->min_write_latency_ticks = tsc_diff; 7420 } 7421 } 7422 } 7423 break; 7424 case SPDK_BDEV_IO_TYPE_COPY: 7425 io_stat->bytes_copied += num_blocks * blocklen; 7426 io_stat->num_copy_ops++; 7427 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7428 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7429 io_stat->max_copy_latency_ticks = tsc_diff; 7430 } 7431 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7432 io_stat->min_copy_latency_ticks = tsc_diff; 7433 } 7434 break; 7435 default: 7436 break; 7437 } 7438 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7439 io_stat = bdev_io->bdev->internal.stat; 7440 assert(io_stat->io_error != NULL); 7441 7442 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7443 io_stat->io_error->error_status[-io_status - 1]++; 7444 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7445 } 7446 7447 #ifdef SPDK_CONFIG_VTUNE 7448 uint64_t now_tsc = spdk_get_ticks(); 7449 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7450 uint64_t data[5]; 7451 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7452 7453 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7454 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7455 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7456 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7457 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
7458 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7459 7460 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7461 __itt_metadata_u64, 5, data); 7462 7463 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7464 bdev_io->internal.ch->start_tsc = now_tsc; 7465 } 7466 #endif 7467 } 7468 7469 static inline void 7470 _bdev_io_complete(void *ctx) 7471 { 7472 struct spdk_bdev_io *bdev_io = ctx; 7473 7474 if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) { 7475 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7476 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7477 } 7478 7479 assert(bdev_io->internal.cb != NULL); 7480 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7481 7482 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7483 bdev_io->internal.caller_ctx); 7484 } 7485 7486 static inline void 7487 bdev_io_complete(void *ctx) 7488 { 7489 struct spdk_bdev_io *bdev_io = ctx; 7490 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7491 uint64_t tsc, tsc_diff; 7492 7493 if (spdk_unlikely(bdev_io->internal.f.in_submit_request)) { 7494 /* 7495 * Defer completion to avoid potential infinite recursion if the 7496 * user's completion callback issues a new I/O. 7497 */ 7498 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7499 bdev_io_complete, bdev_io); 7500 return; 7501 } 7502 7503 tsc = spdk_get_ticks(); 7504 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7505 7506 bdev_ch_remove_from_io_submitted(bdev_io); 7507 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io, 7508 bdev_io->internal.caller_ctx, bdev_ch->queue_depth); 7509 7510 if (bdev_ch->histogram) { 7511 if (bdev_io->bdev->internal.histogram_io_type == 0 || 7512 bdev_io->bdev->internal.histogram_io_type == bdev_io->type) { 7513 /* 7514 * Tally all I/O types if the histogram_io_type is set to 0. 7515 */ 7516 spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff); 7517 } 7518 } 7519 7520 bdev_io_update_io_stat(bdev_io, tsc_diff); 7521 _bdev_io_complete(bdev_io); 7522 } 7523 7524 /* The difference between this function and bdev_io_complete() is that this should be called to 7525 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7526 * io_submitted list and don't have submit_tsc updated. 7527 */ 7528 static inline void 7529 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7530 { 7531 /* Since the IO hasn't been submitted it's bound to be failed */ 7532 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7533 7534 /* At this point we don't know if the IO is completed from submission context or not, but, 7535 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7536 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7537 _bdev_io_complete, bdev_io); 7538 } 7539 7540 static void bdev_destroy_cb(void *io_device); 7541 7542 static inline void 7543 _bdev_reset_complete(void *ctx) 7544 { 7545 struct spdk_bdev_io *bdev_io = ctx; 7546 7547 /* Put the channel reference we got in submission. 
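 * Holding that reference keeps the bdev's io_device from being torn down while the
 * reset is still outstanding.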
*/ 7548 assert(bdev_io->u.reset.ch_ref != NULL); 7549 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7550 bdev_io->u.reset.ch_ref = NULL; 7551 7552 bdev_io_complete(bdev_io); 7553 } 7554 7555 static void 7556 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7557 { 7558 struct spdk_bdev_io *bdev_io = _ctx; 7559 bdev_io_tailq_t queued_resets; 7560 struct spdk_bdev_io *queued_reset; 7561 7562 assert(bdev_io == bdev->internal.reset_in_progress); 7563 7564 TAILQ_INIT(&queued_resets); 7565 7566 spdk_spin_lock(&bdev->internal.spinlock); 7567 TAILQ_SWAP(&bdev->internal.queued_resets, &queued_resets, 7568 spdk_bdev_io, internal.link); 7569 bdev->internal.reset_in_progress = NULL; 7570 spdk_spin_unlock(&bdev->internal.spinlock); 7571 7572 while (!TAILQ_EMPTY(&queued_resets)) { 7573 queued_reset = TAILQ_FIRST(&queued_resets); 7574 TAILQ_REMOVE(&queued_resets, queued_reset, internal.link); 7575 queued_reset->internal.status = bdev_io->internal.status; 7576 spdk_thread_send_msg(spdk_bdev_io_get_thread(queued_reset), 7577 _bdev_reset_complete, queued_reset); 7578 } 7579 7580 _bdev_reset_complete(bdev_io); 7581 7582 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7583 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7584 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7585 } 7586 } 7587 7588 static void 7589 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7590 struct spdk_io_channel *_ch, void *_ctx) 7591 { 7592 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7593 7594 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7595 7596 spdk_bdev_for_each_channel_continue(i, 0); 7597 } 7598 7599 static void 7600 bdev_io_complete_sequence_cb(void *ctx, int status) 7601 { 7602 struct spdk_bdev_io *bdev_io = ctx; 7603 7604 /* u.bdev.accel_sequence should have already been cleared at this point */ 7605 assert(bdev_io->u.bdev.accel_sequence == NULL); 7606 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7607 bdev_io->internal.f.has_accel_sequence = false; 7608 7609 if (spdk_unlikely(status != 0)) { 7610 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7611 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7612 } 7613 7614 bdev_io_complete(bdev_io); 7615 } 7616 7617 void 7618 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7619 { 7620 struct spdk_bdev *bdev = bdev_io->bdev; 7621 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7622 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7623 7624 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7625 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7626 spdk_bdev_get_module_name(bdev), 7627 bdev_io_status_get_string(bdev_io->internal.status)); 7628 assert(false); 7629 } 7630 bdev_io->internal.status = status; 7631 7632 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7633 assert(bdev_io == bdev->internal.reset_in_progress); 7634 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7635 bdev_reset_complete); 7636 return; 7637 } else { 7638 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7639 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7640 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7641 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7642 return; 7643 } else if (spdk_unlikely(bdev_io->internal.f.has_bounce_buf && 7644 
!bdev_io_use_accel_sequence(bdev_io))) { 7645 _bdev_io_push_bounce_data_buffer(bdev_io, 7646 _bdev_io_complete_push_bounce_done); 7647 /* bdev IO will be completed in the callback */ 7648 return; 7649 } 7650 } 7651 7652 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7653 return; 7654 } 7655 } 7656 7657 bdev_io_complete(bdev_io); 7658 } 7659 7660 void 7661 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7662 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7663 { 7664 enum spdk_bdev_io_status status; 7665 7666 if (sc == SPDK_SCSI_STATUS_GOOD) { 7667 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7668 } else { 7669 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7670 bdev_io->internal.error.scsi.sc = sc; 7671 bdev_io->internal.error.scsi.sk = sk; 7672 bdev_io->internal.error.scsi.asc = asc; 7673 bdev_io->internal.error.scsi.ascq = ascq; 7674 } 7675 7676 spdk_bdev_io_complete(bdev_io, status); 7677 } 7678 7679 void 7680 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7681 int *sc, int *sk, int *asc, int *ascq) 7682 { 7683 assert(sc != NULL); 7684 assert(sk != NULL); 7685 assert(asc != NULL); 7686 assert(ascq != NULL); 7687 7688 switch (bdev_io->internal.status) { 7689 case SPDK_BDEV_IO_STATUS_SUCCESS: 7690 *sc = SPDK_SCSI_STATUS_GOOD; 7691 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7692 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7693 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7694 break; 7695 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7696 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7697 break; 7698 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7699 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7700 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7701 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7702 *ascq = bdev_io->internal.error.scsi.ascq; 7703 break; 7704 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7705 *sc = bdev_io->internal.error.scsi.sc; 7706 *sk = bdev_io->internal.error.scsi.sk; 7707 *asc = bdev_io->internal.error.scsi.asc; 7708 *ascq = bdev_io->internal.error.scsi.ascq; 7709 break; 7710 default: 7711 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7712 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7713 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7714 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7715 break; 7716 } 7717 } 7718 7719 void 7720 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7721 { 7722 enum spdk_bdev_io_status status; 7723 7724 if (aio_result == 0) { 7725 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7726 } else { 7727 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7728 } 7729 7730 bdev_io->internal.error.aio_result = aio_result; 7731 7732 spdk_bdev_io_complete(bdev_io, status); 7733 } 7734 7735 void 7736 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7737 { 7738 assert(aio_result != NULL); 7739 7740 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7741 *aio_result = bdev_io->internal.error.aio_result; 7742 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7743 *aio_result = 0; 7744 } else { 7745 *aio_result = -EIO; 7746 } 7747 } 7748 7749 void 7750 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7751 { 7752 enum spdk_bdev_io_status status; 7753 7754 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7755 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7756 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7757 status = SPDK_BDEV_IO_STATUS_ABORTED; 7758 
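/* ABORTED is tracked as a distinct status so that spdk_bdev_io_get_nvme_status() can
 * report SPDK_NVME_SC_ABORTED_BY_REQUEST back to the initiator. */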
} else { 7759 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7760 } 7761 7762 bdev_io->internal.error.nvme.cdw0 = cdw0; 7763 bdev_io->internal.error.nvme.sct = sct; 7764 bdev_io->internal.error.nvme.sc = sc; 7765 7766 spdk_bdev_io_complete(bdev_io, status); 7767 } 7768 7769 void 7770 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7771 { 7772 assert(sct != NULL); 7773 assert(sc != NULL); 7774 assert(cdw0 != NULL); 7775 7776 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7777 *sct = SPDK_NVME_SCT_GENERIC; 7778 *sc = SPDK_NVME_SC_SUCCESS; 7779 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7780 *cdw0 = 0; 7781 } else { 7782 *cdw0 = 1U; 7783 } 7784 return; 7785 } 7786 7787 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7788 *sct = SPDK_NVME_SCT_GENERIC; 7789 *sc = SPDK_NVME_SC_SUCCESS; 7790 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7791 *sct = bdev_io->internal.error.nvme.sct; 7792 *sc = bdev_io->internal.error.nvme.sc; 7793 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7794 *sct = SPDK_NVME_SCT_GENERIC; 7795 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7796 } else { 7797 *sct = SPDK_NVME_SCT_GENERIC; 7798 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7799 } 7800 7801 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7802 } 7803 7804 void 7805 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7806 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7807 { 7808 assert(first_sct != NULL); 7809 assert(first_sc != NULL); 7810 assert(second_sct != NULL); 7811 assert(second_sc != NULL); 7812 assert(cdw0 != NULL); 7813 7814 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7815 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7816 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7817 *first_sct = bdev_io->internal.error.nvme.sct; 7818 *first_sc = bdev_io->internal.error.nvme.sc; 7819 *second_sct = SPDK_NVME_SCT_GENERIC; 7820 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7821 } else { 7822 *first_sct = SPDK_NVME_SCT_GENERIC; 7823 *first_sc = SPDK_NVME_SC_SUCCESS; 7824 *second_sct = bdev_io->internal.error.nvme.sct; 7825 *second_sc = bdev_io->internal.error.nvme.sc; 7826 } 7827 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7828 *first_sct = SPDK_NVME_SCT_GENERIC; 7829 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7830 *second_sct = SPDK_NVME_SCT_GENERIC; 7831 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7832 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7833 *first_sct = SPDK_NVME_SCT_GENERIC; 7834 *first_sc = SPDK_NVME_SC_SUCCESS; 7835 *second_sct = SPDK_NVME_SCT_GENERIC; 7836 *second_sc = SPDK_NVME_SC_SUCCESS; 7837 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7838 *first_sct = SPDK_NVME_SCT_GENERIC; 7839 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7840 *second_sct = SPDK_NVME_SCT_GENERIC; 7841 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7842 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7843 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7844 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7845 *second_sct = SPDK_NVME_SCT_GENERIC; 7846 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7847 } else { 7848 *first_sct = SPDK_NVME_SCT_GENERIC; 7849 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7850 *second_sct = SPDK_NVME_SCT_GENERIC; 7851 *second_sc = 
SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7852 } 7853 7854 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7855 } 7856 7857 void 7858 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 7859 const struct spdk_bdev_io *base_io) 7860 { 7861 switch (base_io->internal.status) { 7862 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7863 spdk_bdev_io_complete_nvme_status(bdev_io, 7864 base_io->internal.error.nvme.cdw0, 7865 base_io->internal.error.nvme.sct, 7866 base_io->internal.error.nvme.sc); 7867 break; 7868 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7869 spdk_bdev_io_complete_scsi_status(bdev_io, 7870 base_io->internal.error.scsi.sc, 7871 base_io->internal.error.scsi.sk, 7872 base_io->internal.error.scsi.asc, 7873 base_io->internal.error.scsi.ascq); 7874 break; 7875 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 7876 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 7877 break; 7878 default: 7879 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 7880 break; 7881 } 7882 } 7883 7884 struct spdk_thread * 7885 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7886 { 7887 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7888 } 7889 7890 struct spdk_io_channel * 7891 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7892 { 7893 return bdev_io->internal.ch->channel; 7894 } 7895 7896 static int 7897 bdev_register(struct spdk_bdev *bdev) 7898 { 7899 char *bdev_name; 7900 char uuid[SPDK_UUID_STRING_LEN]; 7901 struct spdk_iobuf_opts iobuf_opts; 7902 int ret; 7903 7904 assert(bdev->module != NULL); 7905 7906 if (!bdev->name) { 7907 SPDK_ERRLOG("Bdev name is NULL\n"); 7908 return -EINVAL; 7909 } 7910 7911 if (!strlen(bdev->name)) { 7912 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7913 return -EINVAL; 7914 } 7915 7916 /* Users often register their own I/O devices using the bdev name. In 7917 * order to avoid conflicts, prepend bdev_. */ 7918 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7919 if (!bdev_name) { 7920 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7921 return -ENOMEM; 7922 } 7923 7924 bdev->internal.stat = bdev_alloc_io_stat(true); 7925 if (!bdev->internal.stat) { 7926 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7927 free(bdev_name); 7928 return -ENOMEM; 7929 } 7930 7931 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7932 bdev->internal.measured_queue_depth = UINT64_MAX; 7933 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7934 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7935 bdev->internal.qd_poller = NULL; 7936 bdev->internal.qos = NULL; 7937 7938 TAILQ_INIT(&bdev->internal.open_descs); 7939 TAILQ_INIT(&bdev->internal.locked_ranges); 7940 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7941 TAILQ_INIT(&bdev->internal.queued_resets); 7942 TAILQ_INIT(&bdev->aliases); 7943 7944 /* UUID may be specified by the user or defined by bdev itself. 7945 * Otherwise it will be generated here, so this field will never be empty. 
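 * The UUID string is also registered as an alias below (when it differs from the name),
 * so the bdev can be looked up by either identifier.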
*/ 7946 if (spdk_uuid_is_null(&bdev->uuid)) { 7947 spdk_uuid_generate(&bdev->uuid); 7948 } 7949 7950 /* Add the UUID alias only if it's different than the name */ 7951 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7952 if (strcmp(bdev->name, uuid) != 0) { 7953 ret = spdk_bdev_alias_add(bdev, uuid); 7954 if (ret != 0) { 7955 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7956 bdev_free_io_stat(bdev->internal.stat); 7957 free(bdev_name); 7958 return ret; 7959 } 7960 } 7961 7962 spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts)); 7963 if (spdk_bdev_get_buf_align(bdev) > 1) { 7964 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 7965 iobuf_opts.large_bufsize / bdev->blocklen); 7966 } 7967 7968 /* If the user didn't specify a write unit size, set it to one. */ 7969 if (bdev->write_unit_size == 0) { 7970 bdev->write_unit_size = 1; 7971 } 7972 7973 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7974 if (bdev->acwu == 0) { 7975 bdev->acwu = bdev->write_unit_size; 7976 } 7977 7978 if (bdev->phys_blocklen == 0) { 7979 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7980 } 7981 7982 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7983 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7984 } 7985 7986 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7987 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7988 } 7989 7990 bdev->internal.reset_in_progress = NULL; 7991 bdev->internal.qd_poll_in_progress = false; 7992 bdev->internal.period = 0; 7993 bdev->internal.new_period = 0; 7994 bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name); 7995 7996 /* 7997 * Initialize spinlock before registering IO device because spinlock is used in 7998 * bdev_channel_create 7999 */ 8000 spdk_spin_init(&bdev->internal.spinlock); 8001 8002 spdk_io_device_register(__bdev_to_io_dev(bdev), 8003 bdev_channel_create, bdev_channel_destroy, 8004 sizeof(struct spdk_bdev_channel), 8005 bdev_name); 8006 8007 /* 8008 * Register bdev name only after the bdev object is ready. 8009 * After bdev_name_add returns, it is possible for other threads to start using the bdev, 8010 * create IO channels... 
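 * If bdev_name_add() fails, the io_device registration and the per-bdev resources
 * allocated above are unwound before returning.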
8011 */ 8012 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 8013 if (ret != 0) { 8014 spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL); 8015 bdev_free_io_stat(bdev->internal.stat); 8016 spdk_spin_destroy(&bdev->internal.spinlock); 8017 free(bdev_name); 8018 return ret; 8019 } 8020 8021 free(bdev_name); 8022 8023 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 8024 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 8025 8026 return 0; 8027 } 8028 8029 static void 8030 bdev_destroy_cb(void *io_device) 8031 { 8032 int rc; 8033 struct spdk_bdev *bdev; 8034 spdk_bdev_unregister_cb cb_fn; 8035 void *cb_arg; 8036 8037 bdev = __bdev_from_io_dev(io_device); 8038 8039 if (bdev->internal.unregister_td != spdk_get_thread()) { 8040 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 8041 return; 8042 } 8043 8044 cb_fn = bdev->internal.unregister_cb; 8045 cb_arg = bdev->internal.unregister_ctx; 8046 8047 spdk_spin_destroy(&bdev->internal.spinlock); 8048 free(bdev->internal.qos); 8049 bdev_free_io_stat(bdev->internal.stat); 8050 spdk_trace_unregister_owner(bdev->internal.trace_id); 8051 8052 rc = bdev->fn_table->destruct(bdev->ctxt); 8053 if (rc < 0) { 8054 SPDK_ERRLOG("destruct failed\n"); 8055 } 8056 if (rc <= 0 && cb_fn != NULL) { 8057 cb_fn(cb_arg, rc); 8058 } 8059 } 8060 8061 void 8062 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 8063 { 8064 if (bdev->internal.unregister_cb != NULL) { 8065 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 8066 } 8067 } 8068 8069 static void 8070 _remove_notify(void *arg) 8071 { 8072 struct spdk_bdev_desc *desc = arg; 8073 8074 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 8075 } 8076 8077 /* returns: 0 - bdev removed and ready to be destructed. 8078 * -EBUSY - bdev can't be destructed yet. */ 8079 static int 8080 bdev_unregister_unsafe(struct spdk_bdev *bdev) 8081 { 8082 struct spdk_bdev_desc *desc, *tmp; 8083 struct spdk_bdev_alias *alias; 8084 int rc = 0; 8085 char uuid[SPDK_UUID_STRING_LEN]; 8086 8087 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 8088 assert(spdk_spin_held(&bdev->internal.spinlock)); 8089 8090 /* Notify each descriptor about hotremoval */ 8091 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 8092 rc = -EBUSY; 8093 /* 8094 * Defer invocation of the event_cb to a separate message that will 8095 * run later on its thread. This ensures this context unwinds and 8096 * we don't recursively unregister this bdev again if the event_cb 8097 * immediately closes its descriptor. 8098 */ 8099 event_notify(desc, _remove_notify); 8100 } 8101 8102 /* If there are no descriptors, proceed removing the bdev */ 8103 if (rc == 0) { 8104 bdev_examine_allowlist_remove(bdev->name); 8105 TAILQ_FOREACH(alias, &bdev->aliases, tailq) { 8106 bdev_examine_allowlist_remove(alias->alias.name); 8107 } 8108 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 8109 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 8110 8111 /* Delete the name and the UUID alias */ 8112 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 8113 bdev_name_del_unsafe(&bdev->internal.bdev_name); 8114 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 8115 8116 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 8117 8118 if (bdev->internal.reset_in_progress != NULL) { 8119 /* If reset is in progress, let the completion callback for reset 8120 * unregister the bdev. 
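 * bdev_reset_complete() performs the final spdk_io_device_unregister() once it sees
 * SPDK_BDEV_STATUS_REMOVING with no remaining open descriptors.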
8121 */ 8122 rc = -EBUSY; 8123 } 8124 } 8125 8126 return rc; 8127 } 8128 8129 static void 8130 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8131 struct spdk_io_channel *io_ch, void *_ctx) 8132 { 8133 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8134 8135 bdev_channel_abort_queued_ios(bdev_ch); 8136 spdk_bdev_for_each_channel_continue(i, 0); 8137 } 8138 8139 static void 8140 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 8141 { 8142 int rc; 8143 8144 spdk_spin_lock(&g_bdev_mgr.spinlock); 8145 spdk_spin_lock(&bdev->internal.spinlock); 8146 /* 8147 * Set the status to REMOVING after completing to abort channels. Otherwise, 8148 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 8149 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 8150 * may fail. 8151 */ 8152 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 8153 rc = bdev_unregister_unsafe(bdev); 8154 spdk_spin_unlock(&bdev->internal.spinlock); 8155 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8156 8157 if (rc == 0) { 8158 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8159 } 8160 } 8161 8162 void 8163 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8164 { 8165 struct spdk_thread *thread; 8166 8167 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 8168 8169 thread = spdk_get_thread(); 8170 if (!thread) { 8171 /* The user called this from a non-SPDK thread. */ 8172 if (cb_fn != NULL) { 8173 cb_fn(cb_arg, -ENOTSUP); 8174 } 8175 return; 8176 } 8177 8178 spdk_spin_lock(&g_bdev_mgr.spinlock); 8179 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8180 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8181 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8182 if (cb_fn) { 8183 cb_fn(cb_arg, -EBUSY); 8184 } 8185 return; 8186 } 8187 8188 spdk_spin_lock(&bdev->internal.spinlock); 8189 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 8190 bdev->internal.unregister_cb = cb_fn; 8191 bdev->internal.unregister_ctx = cb_arg; 8192 bdev->internal.unregister_td = thread; 8193 spdk_spin_unlock(&bdev->internal.spinlock); 8194 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8195 8196 spdk_bdev_set_qd_sampling_period(bdev, 0); 8197 8198 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 8199 bdev_unregister); 8200 } 8201 8202 int 8203 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 8204 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8205 { 8206 struct spdk_bdev_desc *desc; 8207 struct spdk_bdev *bdev; 8208 int rc; 8209 8210 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 8211 if (rc != 0) { 8212 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 8213 return rc; 8214 } 8215 8216 bdev = spdk_bdev_desc_get_bdev(desc); 8217 8218 if (bdev->module != module) { 8219 spdk_bdev_close(desc); 8220 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 8221 bdev_name); 8222 return -ENODEV; 8223 } 8224 8225 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 8226 8227 spdk_bdev_close(desc); 8228 8229 return 0; 8230 } 8231 8232 static int 8233 bdev_start_qos(struct spdk_bdev *bdev) 8234 { 8235 struct set_qos_limit_ctx *ctx; 8236 8237 /* Enable QoS */ 8238 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 8239 ctx = calloc(1, sizeof(*ctx)); 8240 if (ctx == NULL) { 8241 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 8242 return -ENOMEM; 8243 } 8244 
ctx->bdev = bdev; 8245 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 8246 } 8247 8248 return 0; 8249 } 8250 8251 static void 8252 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 8253 struct spdk_bdev *bdev) 8254 { 8255 enum spdk_bdev_claim_type type; 8256 const char *typename, *modname; 8257 extern struct spdk_log_flag SPDK_LOG_bdev; 8258 8259 assert(spdk_spin_held(&bdev->internal.spinlock)); 8260 8261 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 8262 return; 8263 } 8264 8265 type = bdev->internal.claim_type; 8266 typename = spdk_bdev_claim_get_name(type); 8267 8268 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 8269 modname = bdev->internal.claim.v1.module->name; 8270 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8271 bdev->name, detail, typename, modname); 8272 return; 8273 } 8274 8275 if (claim_type_is_v2(type)) { 8276 struct spdk_bdev_module_claim *claim; 8277 8278 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 8279 modname = claim->module->name; 8280 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8281 bdev->name, detail, typename, modname); 8282 } 8283 return; 8284 } 8285 8286 assert(false); 8287 } 8288 8289 static int 8290 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 8291 { 8292 struct spdk_thread *thread; 8293 int rc = 0; 8294 8295 thread = spdk_get_thread(); 8296 if (!thread) { 8297 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 8298 return -ENOTSUP; 8299 } 8300 8301 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8302 spdk_get_thread()); 8303 8304 desc->bdev = bdev; 8305 desc->thread = thread; 8306 desc->write = write; 8307 8308 spdk_spin_lock(&bdev->internal.spinlock); 8309 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8310 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8311 spdk_spin_unlock(&bdev->internal.spinlock); 8312 return -ENODEV; 8313 } 8314 8315 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8316 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8317 spdk_spin_unlock(&bdev->internal.spinlock); 8318 return -EPERM; 8319 } 8320 8321 rc = bdev_start_qos(bdev); 8322 if (rc != 0) { 8323 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 8324 spdk_spin_unlock(&bdev->internal.spinlock); 8325 return rc; 8326 } 8327 8328 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8329 8330 spdk_spin_unlock(&bdev->internal.spinlock); 8331 8332 return 0; 8333 } 8334 8335 static void 8336 bdev_open_opts_get_defaults(struct spdk_bdev_open_opts *opts, size_t opts_size) 8337 { 8338 if (!opts) { 8339 SPDK_ERRLOG("opts should not be NULL.\n"); 8340 return; 8341 } 8342 8343 if (!opts_size) { 8344 SPDK_ERRLOG("opts_size should not be zero.\n"); 8345 return; 8346 } 8347 8348 memset(opts, 0, opts_size); 8349 opts->size = opts_size; 8350 8351 #define FIELD_OK(field) \ 8352 offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size 8353 8354 #define SET_FIELD(field, value) \ 8355 if (FIELD_OK(field)) { \ 8356 opts->field = value; \ 8357 } \ 8358 8359 SET_FIELD(hide_metadata, false); 8360 8361 #undef FIELD_OK 8362 #undef SET_FIELD 8363 } 8364 8365 static void 8366 bdev_open_opts_copy(struct spdk_bdev_open_opts *opts, 8367 const struct spdk_bdev_open_opts *opts_src, size_t opts_size) 8368 { 8369 assert(opts); 8370 assert(opts_src); 8371 8372 #define SET_FIELD(field) \ 8373 if 
(offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size) { \ 8374 opts->field = opts_src->field; \ 8375 } \ 8376 8377 SET_FIELD(hide_metadata); 8378 8379 opts->size = opts_src->size; 8380 8381 /* We should not remove this statement, but need to update the assert statement 8382 * if we add a new field, and also add a corresponding SET_FIELD statement. 8383 */ 8384 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_opts) == 16, "Incorrect size"); 8385 8386 #undef SET_FIELD 8387 } 8388 8389 void 8390 spdk_bdev_open_opts_init(struct spdk_bdev_open_opts *opts, size_t opts_size) 8391 { 8392 struct spdk_bdev_open_opts opts_local; 8393 8394 bdev_open_opts_get_defaults(&opts_local, sizeof(opts_local)); 8395 bdev_open_opts_copy(opts, &opts_local, opts_size); 8396 } 8397 8398 static int 8399 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8400 struct spdk_bdev_open_opts *user_opts, struct spdk_bdev_desc **_desc) 8401 { 8402 struct spdk_bdev_desc *desc; 8403 struct spdk_bdev_open_opts opts; 8404 unsigned int i; 8405 8406 bdev_open_opts_get_defaults(&opts, sizeof(opts)); 8407 if (user_opts != NULL) { 8408 bdev_open_opts_copy(&opts, user_opts, user_opts->size); 8409 } 8410 8411 desc = calloc(1, sizeof(*desc)); 8412 if (desc == NULL) { 8413 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8414 return -ENOMEM; 8415 } 8416 8417 desc->opts = opts; 8418 8419 TAILQ_INIT(&desc->pending_media_events); 8420 TAILQ_INIT(&desc->free_media_events); 8421 8422 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8423 desc->callback.event_fn = event_cb; 8424 desc->callback.ctx = event_ctx; 8425 spdk_spin_init(&desc->spinlock); 8426 8427 if (desc->opts.hide_metadata) { 8428 if (spdk_bdev_is_md_separate(bdev)) { 8429 SPDK_ERRLOG("hide_metadata option is not supported with separate metadata.\n"); 8430 bdev_desc_free(desc); 8431 return -EINVAL; 8432 } 8433 } 8434 8435 if (bdev->media_events) { 8436 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8437 sizeof(*desc->media_events_buffer)); 8438 if (desc->media_events_buffer == NULL) { 8439 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8440 bdev_desc_free(desc); 8441 return -ENOMEM; 8442 } 8443 8444 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8445 TAILQ_INSERT_TAIL(&desc->free_media_events, 8446 &desc->media_events_buffer[i], tailq); 8447 } 8448 } 8449 8450 if (bdev->fn_table->accel_sequence_supported != NULL) { 8451 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8452 desc->accel_sequence_supported[i] = 8453 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8454 (enum spdk_bdev_io_type)i); 8455 } 8456 } 8457 8458 *_desc = desc; 8459 8460 return 0; 8461 } 8462 8463 static int 8464 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8465 void *event_ctx, struct spdk_bdev_open_opts *opts, 8466 struct spdk_bdev_desc **_desc) 8467 { 8468 struct spdk_bdev_desc *desc; 8469 struct spdk_bdev *bdev; 8470 int rc; 8471 8472 bdev = bdev_get_by_name(bdev_name); 8473 8474 if (bdev == NULL) { 8475 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8476 return -ENODEV; 8477 } 8478 8479 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, opts, &desc); 8480 if (rc != 0) { 8481 return rc; 8482 } 8483 8484 rc = bdev_open(bdev, write, desc); 8485 if (rc != 0) { 8486 bdev_desc_free(desc); 8487 desc = NULL; 8488 } 8489 8490 *_desc = desc; 8491 8492 return rc; 8493 } 8494 8495 int 8496 spdk_bdev_open_ext_v2(const char 
*bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8497 void *event_ctx, struct spdk_bdev_open_opts *opts, 8498 struct spdk_bdev_desc **_desc) 8499 { 8500 int rc; 8501 8502 if (event_cb == NULL) { 8503 SPDK_ERRLOG("Missing event callback function\n"); 8504 return -EINVAL; 8505 } 8506 8507 spdk_spin_lock(&g_bdev_mgr.spinlock); 8508 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, opts, _desc); 8509 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8510 8511 return rc; 8512 } 8513 8514 int 8515 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8516 void *event_ctx, struct spdk_bdev_desc **_desc) 8517 { 8518 return spdk_bdev_open_ext_v2(bdev_name, write, event_cb, event_ctx, NULL, _desc); 8519 } 8520 8521 struct spdk_bdev_open_async_ctx { 8522 char *bdev_name; 8523 spdk_bdev_event_cb_t event_cb; 8524 void *event_ctx; 8525 bool write; 8526 int rc; 8527 spdk_bdev_open_async_cb_t cb_fn; 8528 void *cb_arg; 8529 struct spdk_bdev_desc *desc; 8530 struct spdk_bdev_open_async_opts opts; 8531 uint64_t start_ticks; 8532 struct spdk_thread *orig_thread; 8533 struct spdk_poller *poller; 8534 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8535 }; 8536 8537 static void 8538 bdev_open_async_done(void *arg) 8539 { 8540 struct spdk_bdev_open_async_ctx *ctx = arg; 8541 8542 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8543 8544 free(ctx->bdev_name); 8545 free(ctx); 8546 } 8547 8548 static void 8549 bdev_open_async_cancel(void *arg) 8550 { 8551 struct spdk_bdev_open_async_ctx *ctx = arg; 8552 8553 assert(ctx->rc == -ESHUTDOWN); 8554 8555 spdk_poller_unregister(&ctx->poller); 8556 8557 bdev_open_async_done(ctx); 8558 } 8559 8560 /* This is called when the bdev library finishes at shutdown. */ 8561 static void 8562 bdev_open_async_fini(void) 8563 { 8564 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8565 8566 spdk_spin_lock(&g_bdev_mgr.spinlock); 8567 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8568 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8569 /* 8570 * We have to move to ctx->orig_thread to unregister ctx->poller. 8571 * However, there is a chance that ctx->poller is executed before 8572 * message is executed, which could result in bdev_open_async_done() 8573 * being called twice. To avoid such race condition, set ctx->rc to 8574 * -ESHUTDOWN. 8575 */ 8576 ctx->rc = -ESHUTDOWN; 8577 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8578 } 8579 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8580 } 8581 8582 static int bdev_open_async(void *arg); 8583 8584 static void 8585 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8586 { 8587 uint64_t timeout_ticks; 8588 8589 if (ctx->rc == -ESHUTDOWN) { 8590 /* This context is being canceled. Do nothing. */ 8591 return; 8592 } 8593 8594 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8595 NULL, &ctx->desc); 8596 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8597 goto exit; 8598 } 8599 8600 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8601 if (spdk_get_ticks() >= timeout_ticks) { 8602 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8603 ctx->rc = -ETIMEDOUT; 8604 goto exit; 8605 } 8606 8607 return; 8608 8609 exit: 8610 spdk_poller_unregister(&ctx->poller); 8611 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8612 8613 /* Completion callback is processed after stack unwinding. 
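 * This also guarantees that the user callback never runs while g_bdev_mgr.spinlock is
 * held, since both callers of _bdev_open_async() hold that lock.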
*/ 8614 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8615 } 8616 8617 static int 8618 bdev_open_async(void *arg) 8619 { 8620 struct spdk_bdev_open_async_ctx *ctx = arg; 8621 8622 spdk_spin_lock(&g_bdev_mgr.spinlock); 8623 8624 _bdev_open_async(ctx); 8625 8626 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8627 8628 return SPDK_POLLER_BUSY; 8629 } 8630 8631 static void 8632 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8633 struct spdk_bdev_open_async_opts *opts_src, 8634 size_t size) 8635 { 8636 assert(opts); 8637 assert(opts_src); 8638 8639 opts->size = size; 8640 8641 #define SET_FIELD(field) \ 8642 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8643 opts->field = opts_src->field; \ 8644 } \ 8645 8646 SET_FIELD(timeout_ms); 8647 8648 /* Do not remove this statement, you should always update this statement when you adding a new field, 8649 * and do not forget to add the SET_FIELD statement for your added field. */ 8650 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8651 8652 #undef SET_FIELD 8653 } 8654 8655 static void 8656 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8657 { 8658 assert(opts); 8659 8660 opts->size = size; 8661 8662 #define SET_FIELD(field, value) \ 8663 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8664 opts->field = value; \ 8665 } \ 8666 8667 SET_FIELD(timeout_ms, 0); 8668 8669 #undef SET_FIELD 8670 } 8671 8672 int 8673 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8674 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8675 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8676 { 8677 struct spdk_bdev_open_async_ctx *ctx; 8678 8679 if (event_cb == NULL) { 8680 SPDK_ERRLOG("Missing event callback function\n"); 8681 return -EINVAL; 8682 } 8683 8684 if (open_cb == NULL) { 8685 SPDK_ERRLOG("Missing open callback function\n"); 8686 return -EINVAL; 8687 } 8688 8689 if (opts != NULL && opts->size == 0) { 8690 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8691 return -EINVAL; 8692 } 8693 8694 ctx = calloc(1, sizeof(*ctx)); 8695 if (ctx == NULL) { 8696 SPDK_ERRLOG("Failed to allocate open context\n"); 8697 return -ENOMEM; 8698 } 8699 8700 ctx->bdev_name = strdup(bdev_name); 8701 if (ctx->bdev_name == NULL) { 8702 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8703 free(ctx); 8704 return -ENOMEM; 8705 } 8706 8707 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8708 if (ctx->poller == NULL) { 8709 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8710 free(ctx->bdev_name); 8711 free(ctx); 8712 return -ENOMEM; 8713 } 8714 8715 ctx->cb_fn = open_cb; 8716 ctx->cb_arg = open_cb_arg; 8717 ctx->write = write; 8718 ctx->event_cb = event_cb; 8719 ctx->event_ctx = event_ctx; 8720 ctx->orig_thread = spdk_get_thread(); 8721 ctx->start_ticks = spdk_get_ticks(); 8722 8723 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8724 if (opts != NULL) { 8725 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8726 } 8727 8728 spdk_spin_lock(&g_bdev_mgr.spinlock); 8729 8730 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8731 _bdev_open_async(ctx); 8732 8733 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8734 8735 return 0; 8736 } 8737 8738 static void 8739 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8740 { 8741 int rc; 8742 8743 
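/* Called with g_bdev_mgr.spinlock held; the bdev and descriptor spinlocks are taken here.
 *
 * Application-side lifecycle sketch (the bdev name and event callback are hypothetical):
 *
 *     static void
 *     bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
 *     {
 *         // e.g. close the descriptor on SPDK_BDEV_EVENT_REMOVE
 *     }
 *
 *     rc = spdk_bdev_open_ext("Malloc0", false, bdev_event_cb, NULL, &desc);
 *     ...
 *     spdk_bdev_close(desc);
 */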
spdk_spin_lock(&bdev->internal.spinlock); 8744 spdk_spin_lock(&desc->spinlock); 8745 8746 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8747 8748 desc->closed = true; 8749 8750 if (desc->claim != NULL) { 8751 bdev_desc_release_claims(desc); 8752 } 8753 8754 if (0 == desc->refs) { 8755 spdk_spin_unlock(&desc->spinlock); 8756 bdev_desc_free(desc); 8757 } else { 8758 spdk_spin_unlock(&desc->spinlock); 8759 } 8760 8761 /* If no more descriptors, kill QoS channel */ 8762 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8763 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8764 bdev->name, spdk_get_thread()); 8765 8766 if (bdev_qos_destroy(bdev)) { 8767 /* There isn't anything we can do to recover here. Just let the 8768 * old QoS poller keep running. The QoS handling won't change 8769 * cores when the user allocates a new channel, but it won't break. */ 8770 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8771 } 8772 } 8773 8774 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8775 rc = bdev_unregister_unsafe(bdev); 8776 spdk_spin_unlock(&bdev->internal.spinlock); 8777 8778 if (rc == 0) { 8779 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8780 } 8781 } else { 8782 spdk_spin_unlock(&bdev->internal.spinlock); 8783 } 8784 } 8785 8786 void 8787 spdk_bdev_close(struct spdk_bdev_desc *desc) 8788 { 8789 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8790 8791 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8792 spdk_get_thread()); 8793 8794 assert(desc->thread == spdk_get_thread()); 8795 8796 spdk_poller_unregister(&desc->io_timeout_poller); 8797 8798 spdk_spin_lock(&g_bdev_mgr.spinlock); 8799 8800 bdev_close(bdev, desc); 8801 8802 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8803 } 8804 8805 int32_t 8806 spdk_bdev_get_numa_id(struct spdk_bdev *bdev) 8807 { 8808 if (bdev->numa.id_valid) { 8809 return bdev->numa.id; 8810 } else { 8811 return SPDK_ENV_NUMA_ID_ANY; 8812 } 8813 } 8814 8815 static void 8816 bdev_register_finished(void *arg) 8817 { 8818 struct spdk_bdev_desc *desc = arg; 8819 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8820 8821 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8822 8823 spdk_spin_lock(&g_bdev_mgr.spinlock); 8824 8825 bdev_close(bdev, desc); 8826 8827 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8828 } 8829 8830 int 8831 spdk_bdev_register(struct spdk_bdev *bdev) 8832 { 8833 struct spdk_bdev_desc *desc; 8834 struct spdk_thread *thread = spdk_get_thread(); 8835 int rc; 8836 8837 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8838 SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread, 8839 thread ? 
spdk_thread_get_name(thread) : "null"); 8840 return -EINVAL; 8841 } 8842 8843 rc = bdev_register(bdev); 8844 if (rc != 0) { 8845 return rc; 8846 } 8847 8848 /* A descriptor is opened to prevent bdev deletion during examination */ 8849 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 8850 if (rc != 0) { 8851 spdk_bdev_unregister(bdev, NULL, NULL); 8852 return rc; 8853 } 8854 8855 rc = bdev_open(bdev, false, desc); 8856 if (rc != 0) { 8857 bdev_desc_free(desc); 8858 spdk_bdev_unregister(bdev, NULL, NULL); 8859 return rc; 8860 } 8861 8862 /* Examine configuration before initializing I/O */ 8863 bdev_examine(bdev); 8864 8865 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8866 if (rc != 0) { 8867 bdev_close(bdev, desc); 8868 spdk_bdev_unregister(bdev, NULL, NULL); 8869 } 8870 8871 return rc; 8872 } 8873 8874 int 8875 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8876 struct spdk_bdev_module *module) 8877 { 8878 spdk_spin_lock(&bdev->internal.spinlock); 8879 8880 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8881 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8882 spdk_spin_unlock(&bdev->internal.spinlock); 8883 return -EPERM; 8884 } 8885 8886 if (desc && !desc->write) { 8887 desc->write = true; 8888 } 8889 8890 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8891 bdev->internal.claim.v1.module = module; 8892 8893 spdk_spin_unlock(&bdev->internal.spinlock); 8894 return 0; 8895 } 8896 8897 void 8898 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8899 { 8900 spdk_spin_lock(&bdev->internal.spinlock); 8901 8902 assert(bdev->internal.claim.v1.module != NULL); 8903 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8904 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8905 bdev->internal.claim.v1.module = NULL; 8906 8907 spdk_spin_unlock(&bdev->internal.spinlock); 8908 } 8909 8910 /* 8911 * Start claims v2 8912 */ 8913 8914 const char * 8915 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8916 { 8917 switch (type) { 8918 case SPDK_BDEV_CLAIM_NONE: 8919 return "not_claimed"; 8920 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8921 return "exclusive_write"; 8922 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8923 return "read_many_write_one"; 8924 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8925 return "read_many_write_none"; 8926 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8927 return "read_many_write_many"; 8928 default: 8929 break; 8930 } 8931 return "invalid_claim"; 8932 } 8933 8934 static bool 8935 claim_type_is_v2(enum spdk_bdev_claim_type type) 8936 { 8937 switch (type) { 8938 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8939 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8940 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8941 return true; 8942 default: 8943 break; 8944 } 8945 return false; 8946 } 8947 8948 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
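 * These are the claim types whose holder is expected to write through the claiming
 * descriptor (read-many-write-one and read-many-write-shared).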
*/ 8949 static bool 8950 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8951 { 8952 switch (type) { 8953 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8954 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8955 return true; 8956 default: 8957 break; 8958 } 8959 return false; 8960 } 8961 8962 void 8963 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8964 { 8965 if (opts == NULL) { 8966 SPDK_ERRLOG("opts should not be NULL\n"); 8967 assert(opts != NULL); 8968 return; 8969 } 8970 if (size == 0) { 8971 SPDK_ERRLOG("size should not be zero\n"); 8972 assert(size != 0); 8973 return; 8974 } 8975 8976 memset(opts, 0, size); 8977 opts->opts_size = size; 8978 8979 #define FIELD_OK(field) \ 8980 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8981 8982 #define SET_FIELD(field, value) \ 8983 if (FIELD_OK(field)) { \ 8984 opts->field = value; \ 8985 } \ 8986 8987 SET_FIELD(shared_claim_key, 0); 8988 8989 #undef FIELD_OK 8990 #undef SET_FIELD 8991 } 8992 8993 static int 8994 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8995 { 8996 if (src->opts_size == 0) { 8997 SPDK_ERRLOG("size should not be zero\n"); 8998 return -1; 8999 } 9000 9001 memset(dst, 0, sizeof(*dst)); 9002 dst->opts_size = src->opts_size; 9003 9004 #define FIELD_OK(field) \ 9005 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 9006 9007 #define SET_FIELD(field) \ 9008 if (FIELD_OK(field)) { \ 9009 dst->field = src->field; \ 9010 } \ 9011 9012 if (FIELD_OK(name)) { 9013 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 9014 } 9015 9016 SET_FIELD(shared_claim_key); 9017 9018 /* You should not remove this statement, but need to update the assert statement 9019 * if you add a new field, and also add a corresponding SET_FIELD statement */ 9020 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 9021 9022 #undef FIELD_OK 9023 #undef SET_FIELD 9024 return 0; 9025 } 9026 9027 /* Returns 0 if a read-write-once claim can be taken. */ 9028 static int 9029 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9030 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9031 { 9032 struct spdk_bdev *bdev = desc->bdev; 9033 struct spdk_bdev_desc *open_desc; 9034 9035 assert(spdk_spin_held(&bdev->internal.spinlock)); 9036 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 9037 9038 if (opts->shared_claim_key != 0) { 9039 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 9040 bdev->name); 9041 return -EINVAL; 9042 } 9043 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 9044 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9045 return -EPERM; 9046 } 9047 if (desc->claim != NULL) { 9048 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 9049 bdev->name, desc->claim->module->name); 9050 return -EPERM; 9051 } 9052 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 9053 if (desc != open_desc && open_desc->write) { 9054 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 9055 "another descriptor is open for writing\n", 9056 bdev->name); 9057 return -EPERM; 9058 } 9059 } 9060 9061 return 0; 9062 } 9063 9064 /* Returns 0 if a read-only-many claim can be taken. 
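 * The claiming descriptor must be read-only and, if the bdev is not claimed yet, no
 * other open descriptor may be writable.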
*/
9065 static int
9066 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9067 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
9068 {
9069 struct spdk_bdev *bdev = desc->bdev;
9070 struct spdk_bdev_desc *open_desc;
9071 
9072 assert(spdk_spin_held(&bdev->internal.spinlock));
9073 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
9074 assert(desc->claim == NULL);
9075 
9076 if (desc->write) {
9077 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
9078 bdev->name);
9079 return -EINVAL;
9080 }
9081 if (opts->shared_claim_key != 0) {
9082 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
9083 return -EINVAL;
9084 }
9085 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
9086 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
9087 if (open_desc->write) {
9088 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
9089 "another descriptor is open for writing\n",
9090 bdev->name);
9091 return -EPERM;
9092 }
9093 }
9094 }
9095 
9096 return 0;
9097 }
9098 
9099 /* Returns 0 if a read-write-many claim can be taken. */
9100 static int
9101 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9102 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
9103 {
9104 struct spdk_bdev *bdev = desc->bdev;
9105 struct spdk_bdev_desc *open_desc;
9106 
9107 assert(spdk_spin_held(&bdev->internal.spinlock));
9108 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
9109 assert(desc->claim == NULL);
9110 
9111 if (opts->shared_claim_key == 0) {
9112 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
9113 bdev->name);
9114 return -EINVAL;
9115 }
9116 switch (bdev->internal.claim_type) {
9117 case SPDK_BDEV_CLAIM_NONE:
9118 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
9119 if (open_desc == desc) {
9120 continue;
9121 }
9122 if (open_desc->write) {
9123 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
9124 "another descriptor is open for writing without a "
9125 "claim\n", bdev->name);
9126 return -EPERM;
9127 }
9128 }
9129 break;
9130 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9131 if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
9132 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
9133 return -EPERM;
9134 }
9135 break;
9136 default:
9137 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
9138 return -EBUSY;
9139 }
9140 
9141 return 0;
9142 }
9143 
9144 /* Updates desc and its bdev with a v2 claim.
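 * Module-side sketch of taking a v2 claim through the public wrapper
 * spdk_bdev_module_claim_bdev_desc(); the descriptor and module instance
 * (my_bdev_module) are assumed to exist already:
 *
 *     struct spdk_bdev_claim_opts opts;
 *     int rc;
 *
 *     spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *     snprintf(opts.name, sizeof(opts.name), "my_claim");
 *     rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE,
 *                                           &opts, &my_bdev_module);
 *     if (rc != 0) {
 *         // claim refused, e.g. -EPERM when another writer or claim exists
 *     }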
*/ 9145 static int 9146 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9147 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9148 { 9149 struct spdk_bdev *bdev = desc->bdev; 9150 struct spdk_bdev_module_claim *claim; 9151 9152 assert(spdk_spin_held(&bdev->internal.spinlock)); 9153 assert(claim_type_is_v2(type)); 9154 assert(desc->claim == NULL); 9155 9156 claim = calloc(1, sizeof(*desc->claim)); 9157 if (claim == NULL) { 9158 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 9159 return -ENOMEM; 9160 } 9161 claim->module = module; 9162 claim->desc = desc; 9163 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 9164 memcpy(claim->name, opts->name, sizeof(claim->name)); 9165 desc->claim = claim; 9166 9167 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 9168 bdev->internal.claim_type = type; 9169 TAILQ_INIT(&bdev->internal.claim.v2.claims); 9170 bdev->internal.claim.v2.key = opts->shared_claim_key; 9171 } 9172 assert(type == bdev->internal.claim_type); 9173 9174 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 9175 9176 if (!desc->write && claim_type_promotes_to_write(type)) { 9177 desc->write = true; 9178 } 9179 9180 return 0; 9181 } 9182 9183 int 9184 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9185 struct spdk_bdev_claim_opts *_opts, 9186 struct spdk_bdev_module *module) 9187 { 9188 struct spdk_bdev *bdev; 9189 struct spdk_bdev_claim_opts opts; 9190 int rc = 0; 9191 9192 if (desc == NULL) { 9193 SPDK_ERRLOG("descriptor must not be NULL\n"); 9194 return -EINVAL; 9195 } 9196 9197 bdev = desc->bdev; 9198 9199 if (_opts == NULL) { 9200 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 9201 } else if (claim_opts_copy(_opts, &opts) != 0) { 9202 return -EINVAL; 9203 } 9204 9205 spdk_spin_lock(&bdev->internal.spinlock); 9206 9207 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 9208 bdev->internal.claim_type != type) { 9209 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9210 spdk_spin_unlock(&bdev->internal.spinlock); 9211 return -EPERM; 9212 } 9213 9214 if (claim_type_is_v2(type) && desc->claim != NULL) { 9215 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 9216 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 9217 spdk_spin_unlock(&bdev->internal.spinlock); 9218 return -EPERM; 9219 } 9220 9221 switch (type) { 9222 case SPDK_BDEV_CLAIM_EXCL_WRITE: 9223 spdk_spin_unlock(&bdev->internal.spinlock); 9224 return spdk_bdev_module_claim_bdev(bdev, desc, module); 9225 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9226 rc = claim_verify_rwo(desc, type, &opts, module); 9227 break; 9228 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9229 rc = claim_verify_rom(desc, type, &opts, module); 9230 break; 9231 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9232 rc = claim_verify_rwm(desc, type, &opts, module); 9233 break; 9234 default: 9235 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 9236 rc = -ENOTSUP; 9237 } 9238 9239 if (rc == 0) { 9240 rc = claim_bdev(desc, type, &opts, module); 9241 } 9242 9243 spdk_spin_unlock(&bdev->internal.spinlock); 9244 return rc; 9245 } 9246 9247 static void 9248 claim_reset(struct spdk_bdev *bdev) 9249 { 9250 assert(spdk_spin_held(&bdev->internal.spinlock)); 9251 assert(claim_type_is_v2(bdev->internal.claim_type)); 9252 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 9253 9254 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 9255 
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 9256 } 9257 9258 static void 9259 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 9260 { 9261 struct spdk_bdev *bdev = desc->bdev; 9262 9263 assert(spdk_spin_held(&bdev->internal.spinlock)); 9264 assert(claim_type_is_v2(bdev->internal.claim_type)); 9265 9266 if (bdev->internal.examine_in_progress == 0) { 9267 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 9268 free(desc->claim); 9269 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 9270 claim_reset(bdev); 9271 } 9272 } else { 9273 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 9274 desc->claim->module = NULL; 9275 desc->claim->desc = NULL; 9276 } 9277 desc->claim = NULL; 9278 } 9279 9280 /* 9281 * End claims v2 9282 */ 9283 9284 struct spdk_bdev * 9285 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 9286 { 9287 assert(desc != NULL); 9288 return desc->bdev; 9289 } 9290 9291 int 9292 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 9293 { 9294 struct spdk_bdev *bdev, *tmp; 9295 struct spdk_bdev_desc *desc; 9296 int rc = 0; 9297 9298 assert(fn != NULL); 9299 9300 spdk_spin_lock(&g_bdev_mgr.spinlock); 9301 bdev = spdk_bdev_first(); 9302 while (bdev != NULL) { 9303 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9304 if (rc != 0) { 9305 break; 9306 } 9307 rc = bdev_open(bdev, false, desc); 9308 if (rc != 0) { 9309 bdev_desc_free(desc); 9310 if (rc == -ENODEV) { 9311 /* Ignore the error and move to the next bdev. */ 9312 rc = 0; 9313 bdev = spdk_bdev_next(bdev); 9314 continue; 9315 } 9316 break; 9317 } 9318 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9319 9320 rc = fn(ctx, bdev); 9321 9322 spdk_spin_lock(&g_bdev_mgr.spinlock); 9323 tmp = spdk_bdev_next(bdev); 9324 bdev_close(bdev, desc); 9325 if (rc != 0) { 9326 break; 9327 } 9328 bdev = tmp; 9329 } 9330 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9331 9332 return rc; 9333 } 9334 9335 int 9336 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 9337 { 9338 struct spdk_bdev *bdev, *tmp; 9339 struct spdk_bdev_desc *desc; 9340 int rc = 0; 9341 9342 assert(fn != NULL); 9343 9344 spdk_spin_lock(&g_bdev_mgr.spinlock); 9345 bdev = spdk_bdev_first_leaf(); 9346 while (bdev != NULL) { 9347 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9348 if (rc != 0) { 9349 break; 9350 } 9351 rc = bdev_open(bdev, false, desc); 9352 if (rc != 0) { 9353 bdev_desc_free(desc); 9354 if (rc == -ENODEV) { 9355 /* Ignore the error and move to the next bdev. 
*/ 9356 rc = 0; 9357 bdev = spdk_bdev_next_leaf(bdev); 9358 continue; 9359 } 9360 break; 9361 } 9362 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9363 9364 rc = fn(ctx, bdev); 9365 9366 spdk_spin_lock(&g_bdev_mgr.spinlock); 9367 tmp = spdk_bdev_next_leaf(bdev); 9368 bdev_close(bdev, desc); 9369 if (rc != 0) { 9370 break; 9371 } 9372 bdev = tmp; 9373 } 9374 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9375 9376 return rc; 9377 } 9378 9379 void 9380 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 9381 { 9382 struct iovec *iovs; 9383 int iovcnt; 9384 9385 if (bdev_io == NULL) { 9386 return; 9387 } 9388 9389 switch (bdev_io->type) { 9390 case SPDK_BDEV_IO_TYPE_READ: 9391 case SPDK_BDEV_IO_TYPE_WRITE: 9392 case SPDK_BDEV_IO_TYPE_ZCOPY: 9393 iovs = bdev_io->u.bdev.iovs; 9394 iovcnt = bdev_io->u.bdev.iovcnt; 9395 break; 9396 default: 9397 iovs = NULL; 9398 iovcnt = 0; 9399 break; 9400 } 9401 9402 if (iovp) { 9403 *iovp = iovs; 9404 } 9405 if (iovcntp) { 9406 *iovcntp = iovcnt; 9407 } 9408 } 9409 9410 void * 9411 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 9412 { 9413 if (bdev_io == NULL) { 9414 return NULL; 9415 } 9416 9417 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 9418 return NULL; 9419 } 9420 9421 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 9422 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 9423 return bdev_io->u.bdev.md_buf; 9424 } 9425 9426 return NULL; 9427 } 9428 9429 void * 9430 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9431 { 9432 if (bdev_io == NULL) { 9433 assert(false); 9434 return NULL; 9435 } 9436 9437 return bdev_io->internal.caller_ctx; 9438 } 9439 9440 void 9441 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9442 { 9443 9444 if (spdk_bdev_module_list_find(bdev_module->name)) { 9445 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9446 assert(false); 9447 } 9448 9449 spdk_spin_init(&bdev_module->internal.spinlock); 9450 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9451 9452 /* 9453 * Modules with examine callbacks must be initialized first, so they are 9454 * ready to handle examine callbacks from later modules that will 9455 * register physical bdevs. 
9456 */ 9457 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9458 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9459 } else { 9460 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9461 } 9462 } 9463 9464 struct spdk_bdev_module * 9465 spdk_bdev_module_list_find(const char *name) 9466 { 9467 struct spdk_bdev_module *bdev_module; 9468 9469 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9470 if (strcmp(name, bdev_module->name) == 0) { 9471 break; 9472 } 9473 } 9474 9475 return bdev_module; 9476 } 9477 9478 static int 9479 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9480 { 9481 uint64_t num_blocks; 9482 void *md_buf = NULL; 9483 9484 num_blocks = bdev_io->u.bdev.num_blocks; 9485 9486 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9487 md_buf = (char *)g_bdev_mgr.zero_buffer + 9488 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9489 } 9490 9491 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9492 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9493 g_bdev_mgr.zero_buffer, md_buf, 9494 bdev_io->u.bdev.offset_blocks, num_blocks, 9495 bdev_write_zero_buffer_done, bdev_io); 9496 } 9497 9498 static void 9499 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9500 { 9501 struct spdk_bdev_io *parent_io = cb_arg; 9502 9503 spdk_bdev_free_io(bdev_io); 9504 9505 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9506 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9507 } 9508 9509 static void 9510 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9511 { 9512 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9513 ctx->bdev->internal.qos_mod_in_progress = false; 9514 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9515 9516 if (ctx->cb_fn) { 9517 ctx->cb_fn(ctx->cb_arg, status); 9518 } 9519 free(ctx); 9520 } 9521 9522 static void 9523 bdev_disable_qos_done(void *cb_arg) 9524 { 9525 struct set_qos_limit_ctx *ctx = cb_arg; 9526 struct spdk_bdev *bdev = ctx->bdev; 9527 struct spdk_bdev_qos *qos; 9528 9529 spdk_spin_lock(&bdev->internal.spinlock); 9530 qos = bdev->internal.qos; 9531 bdev->internal.qos = NULL; 9532 spdk_spin_unlock(&bdev->internal.spinlock); 9533 9534 if (qos->thread != NULL) { 9535 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9536 spdk_poller_unregister(&qos->poller); 9537 } 9538 9539 free(qos); 9540 9541 bdev_set_qos_limit_done(ctx, 0); 9542 } 9543 9544 static void 9545 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9546 { 9547 struct set_qos_limit_ctx *ctx = _ctx; 9548 struct spdk_thread *thread; 9549 9550 spdk_spin_lock(&bdev->internal.spinlock); 9551 thread = bdev->internal.qos->thread; 9552 spdk_spin_unlock(&bdev->internal.spinlock); 9553 9554 if (thread != NULL) { 9555 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9556 } else { 9557 bdev_disable_qos_done(ctx); 9558 } 9559 } 9560 9561 static void 9562 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9563 struct spdk_io_channel *ch, void *_ctx) 9564 { 9565 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9566 struct spdk_bdev_io *bdev_io; 9567 9568 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9569 9570 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9571 /* Re-submit the queued I/O. 
*/ 9572 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9573 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9574 _bdev_io_submit(bdev_io); 9575 } 9576 9577 spdk_bdev_for_each_channel_continue(i, 0); 9578 } 9579 9580 static void 9581 bdev_update_qos_rate_limit_msg(void *cb_arg) 9582 { 9583 struct set_qos_limit_ctx *ctx = cb_arg; 9584 struct spdk_bdev *bdev = ctx->bdev; 9585 9586 spdk_spin_lock(&bdev->internal.spinlock); 9587 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9588 spdk_spin_unlock(&bdev->internal.spinlock); 9589 9590 bdev_set_qos_limit_done(ctx, 0); 9591 } 9592 9593 static void 9594 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9595 struct spdk_io_channel *ch, void *_ctx) 9596 { 9597 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9598 9599 spdk_spin_lock(&bdev->internal.spinlock); 9600 bdev_enable_qos(bdev, bdev_ch); 9601 spdk_spin_unlock(&bdev->internal.spinlock); 9602 spdk_bdev_for_each_channel_continue(i, 0); 9603 } 9604 9605 static void 9606 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9607 { 9608 struct set_qos_limit_ctx *ctx = _ctx; 9609 9610 bdev_set_qos_limit_done(ctx, status); 9611 } 9612 9613 static void 9614 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9615 { 9616 int i; 9617 9618 assert(bdev->internal.qos != NULL); 9619 9620 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9621 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9622 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9623 9624 if (limits[i] == 0) { 9625 bdev->internal.qos->rate_limits[i].limit = 9626 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9627 } 9628 } 9629 } 9630 } 9631 9632 void 9633 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9634 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9635 { 9636 struct set_qos_limit_ctx *ctx; 9637 uint32_t limit_set_complement; 9638 uint64_t min_limit_per_sec; 9639 int i; 9640 bool disable_rate_limit = true; 9641 9642 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9643 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9644 continue; 9645 } 9646 9647 if (limits[i] > 0) { 9648 disable_rate_limit = false; 9649 } 9650 9651 if (bdev_qos_is_iops_rate_limit(i) == true) { 9652 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9653 } else { 9654 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9655 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9656 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9657 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9658 } 9659 /* Change from megabyte to byte rate limit */ 9660 limits[i] = limits[i] * 1024 * 1024; 9661 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9662 } 9663 9664 limit_set_complement = limits[i] % min_limit_per_sec; 9665 if (limit_set_complement) { 9666 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9667 limits[i], min_limit_per_sec); 9668 limits[i] += min_limit_per_sec - limit_set_complement; 9669 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9670 } 9671 } 9672 9673 ctx = calloc(1, sizeof(*ctx)); 9674 if (ctx == NULL) { 9675 cb_fn(cb_arg, -ENOMEM); 9676 return; 9677 } 9678 9679 ctx->cb_fn = cb_fn; 9680 ctx->cb_arg = cb_arg; 9681 ctx->bdev = bdev; 9682 9683 spdk_spin_lock(&bdev->internal.spinlock); 9684 if (bdev->internal.qos_mod_in_progress) { 9685 spdk_spin_unlock(&bdev->internal.spinlock); 9686 free(ctx); 9687 cb_fn(cb_arg, 
-EAGAIN); 9688 return; 9689 } 9690 bdev->internal.qos_mod_in_progress = true; 9691 9692 if (disable_rate_limit == true && bdev->internal.qos) { 9693 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9694 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9695 (bdev->internal.qos->rate_limits[i].limit > 0 && 9696 bdev->internal.qos->rate_limits[i].limit != 9697 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9698 disable_rate_limit = false; 9699 break; 9700 } 9701 } 9702 } 9703 9704 if (disable_rate_limit == false) { 9705 if (bdev->internal.qos == NULL) { 9706 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9707 if (!bdev->internal.qos) { 9708 spdk_spin_unlock(&bdev->internal.spinlock); 9709 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9710 bdev_set_qos_limit_done(ctx, -ENOMEM); 9711 return; 9712 } 9713 } 9714 9715 if (bdev->internal.qos->thread == NULL) { 9716 /* Enabling */ 9717 bdev_set_qos_rate_limits(bdev, limits); 9718 9719 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9720 bdev_enable_qos_done); 9721 } else { 9722 /* Updating */ 9723 bdev_set_qos_rate_limits(bdev, limits); 9724 9725 spdk_thread_send_msg(bdev->internal.qos->thread, 9726 bdev_update_qos_rate_limit_msg, ctx); 9727 } 9728 } else { 9729 if (bdev->internal.qos != NULL) { 9730 bdev_set_qos_rate_limits(bdev, limits); 9731 9732 /* Disabling */ 9733 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9734 bdev_disable_qos_msg_done); 9735 } else { 9736 spdk_spin_unlock(&bdev->internal.spinlock); 9737 bdev_set_qos_limit_done(ctx, 0); 9738 return; 9739 } 9740 } 9741 9742 spdk_spin_unlock(&bdev->internal.spinlock); 9743 } 9744 9745 struct spdk_bdev_histogram_ctx { 9746 spdk_bdev_histogram_status_cb cb_fn; 9747 void *cb_arg; 9748 struct spdk_bdev *bdev; 9749 int status; 9750 }; 9751 9752 static void 9753 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9754 { 9755 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9756 9757 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9758 ctx->bdev->internal.histogram_in_progress = false; 9759 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9760 ctx->cb_fn(ctx->cb_arg, ctx->status); 9761 free(ctx); 9762 } 9763 9764 static void 9765 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9766 struct spdk_io_channel *_ch, void *_ctx) 9767 { 9768 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9769 9770 if (ch->histogram != NULL) { 9771 spdk_histogram_data_free(ch->histogram); 9772 ch->histogram = NULL; 9773 } 9774 spdk_bdev_for_each_channel_continue(i, 0); 9775 } 9776 9777 static void 9778 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9779 { 9780 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9781 9782 if (status != 0) { 9783 ctx->status = status; 9784 ctx->bdev->internal.histogram_enabled = false; 9785 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9786 bdev_histogram_disable_channel_cb); 9787 } else { 9788 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9789 ctx->bdev->internal.histogram_in_progress = false; 9790 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9791 ctx->cb_fn(ctx->cb_arg, ctx->status); 9792 free(ctx); 9793 } 9794 } 9795 9796 static void 9797 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9798 struct spdk_io_channel *_ch, void *_ctx) 9799 { 9800 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9801 int status = 0; 9802 9803 if (ch->histogram == NULL) { 9804 
ch->histogram = spdk_histogram_data_alloc(); 9805 if (ch->histogram == NULL) { 9806 status = -ENOMEM; 9807 } 9808 } 9809 9810 spdk_bdev_for_each_channel_continue(i, status); 9811 } 9812 9813 void 9814 spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9815 void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts) 9816 { 9817 struct spdk_bdev_histogram_ctx *ctx; 9818 9819 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9820 if (ctx == NULL) { 9821 cb_fn(cb_arg, -ENOMEM); 9822 return; 9823 } 9824 9825 ctx->bdev = bdev; 9826 ctx->status = 0; 9827 ctx->cb_fn = cb_fn; 9828 ctx->cb_arg = cb_arg; 9829 9830 spdk_spin_lock(&bdev->internal.spinlock); 9831 if (bdev->internal.histogram_in_progress) { 9832 spdk_spin_unlock(&bdev->internal.spinlock); 9833 free(ctx); 9834 cb_fn(cb_arg, -EAGAIN); 9835 return; 9836 } 9837 9838 bdev->internal.histogram_in_progress = true; 9839 spdk_spin_unlock(&bdev->internal.spinlock); 9840 9841 bdev->internal.histogram_enabled = enable; 9842 bdev->internal.histogram_io_type = opts->io_type; 9843 9844 if (enable) { 9845 /* Allocate histogram for each channel */ 9846 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9847 bdev_histogram_enable_channel_cb); 9848 } else { 9849 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9850 bdev_histogram_disable_channel_cb); 9851 } 9852 } 9853 9854 void 9855 spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size) 9856 { 9857 if (opts == NULL) { 9858 SPDK_ERRLOG("opts should not be NULL\n"); 9859 assert(opts != NULL); 9860 return; 9861 } 9862 if (size == 0) { 9863 SPDK_ERRLOG("size should not be zero\n"); 9864 assert(size != 0); 9865 return; 9866 } 9867 9868 memset(opts, 0, size); 9869 opts->size = size; 9870 9871 #define FIELD_OK(field) \ 9872 offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size 9873 9874 #define SET_FIELD(field, value) \ 9875 if (FIELD_OK(field)) { \ 9876 opts->field = value; \ 9877 } \ 9878 9879 SET_FIELD(io_type, 0); 9880 9881 /* You should not remove this statement, but need to update the assert statement 9882 * if you add a new field, and also add a corresponding SET_FIELD statement */ 9883 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size"); 9884 9885 #undef FIELD_OK 9886 #undef SET_FIELD 9887 } 9888 9889 void 9890 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9891 void *cb_arg, bool enable) 9892 { 9893 struct spdk_bdev_enable_histogram_opts opts; 9894 9895 spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts)); 9896 spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts); 9897 } 9898 9899 struct spdk_bdev_histogram_data_ctx { 9900 spdk_bdev_histogram_data_cb cb_fn; 9901 void *cb_arg; 9902 struct spdk_bdev *bdev; 9903 /** merged histogram data from all channels */ 9904 struct spdk_histogram_data *histogram; 9905 }; 9906 9907 static void 9908 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9909 { 9910 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9911 9912 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9913 free(ctx); 9914 } 9915 9916 static void 9917 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9918 struct spdk_io_channel *_ch, void *_ctx) 9919 { 9920 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9921 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9922 int 
status = 0; 9923 9924 if (ch->histogram == NULL) { 9925 status = -EFAULT; 9926 } else { 9927 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9928 } 9929 9930 spdk_bdev_for_each_channel_continue(i, status); 9931 } 9932 9933 void 9934 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9935 spdk_bdev_histogram_data_cb cb_fn, 9936 void *cb_arg) 9937 { 9938 struct spdk_bdev_histogram_data_ctx *ctx; 9939 9940 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9941 if (ctx == NULL) { 9942 cb_fn(cb_arg, -ENOMEM, NULL); 9943 return; 9944 } 9945 9946 ctx->bdev = bdev; 9947 ctx->cb_fn = cb_fn; 9948 ctx->cb_arg = cb_arg; 9949 9950 ctx->histogram = histogram; 9951 9952 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9953 bdev_histogram_get_channel_cb); 9954 } 9955 9956 void 9957 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9958 void *cb_arg) 9959 { 9960 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9961 int status = 0; 9962 9963 assert(cb_fn != NULL); 9964 9965 if (bdev_ch->histogram == NULL) { 9966 status = -EFAULT; 9967 } 9968 cb_fn(cb_arg, status, bdev_ch->histogram); 9969 } 9970 9971 size_t 9972 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9973 size_t max_events) 9974 { 9975 struct media_event_entry *entry; 9976 size_t num_events = 0; 9977 9978 for (; num_events < max_events; ++num_events) { 9979 entry = TAILQ_FIRST(&desc->pending_media_events); 9980 if (entry == NULL) { 9981 break; 9982 } 9983 9984 events[num_events] = entry->event; 9985 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9986 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9987 } 9988 9989 return num_events; 9990 } 9991 9992 int 9993 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9994 size_t num_events) 9995 { 9996 struct spdk_bdev_desc *desc; 9997 struct media_event_entry *entry; 9998 size_t event_id; 9999 int rc = 0; 10000 10001 assert(bdev->media_events); 10002 10003 spdk_spin_lock(&bdev->internal.spinlock); 10004 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 10005 if (desc->write) { 10006 break; 10007 } 10008 } 10009 10010 if (desc == NULL || desc->media_events_buffer == NULL) { 10011 rc = -ENODEV; 10012 goto out; 10013 } 10014 10015 for (event_id = 0; event_id < num_events; ++event_id) { 10016 entry = TAILQ_FIRST(&desc->free_media_events); 10017 if (entry == NULL) { 10018 break; 10019 } 10020 10021 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 10022 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 10023 entry->event = events[event_id]; 10024 } 10025 10026 rc = event_id; 10027 out: 10028 spdk_spin_unlock(&bdev->internal.spinlock); 10029 return rc; 10030 } 10031 10032 static void 10033 _media_management_notify(void *arg) 10034 { 10035 struct spdk_bdev_desc *desc = arg; 10036 10037 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 10038 } 10039 10040 void 10041 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 10042 { 10043 struct spdk_bdev_desc *desc; 10044 10045 spdk_spin_lock(&bdev->internal.spinlock); 10046 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 10047 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 10048 event_notify(desc, _media_management_notify); 10049 } 10050 } 10051 spdk_spin_unlock(&bdev->internal.spinlock); 10052 } 10053 10054 struct locked_lba_range_ctx { 10055 struct lba_range range; 10056 struct lba_range 
*current_range; 10057 struct lba_range *owner_range; 10058 struct spdk_poller *poller; 10059 lock_range_cb cb_fn; 10060 void *cb_arg; 10061 }; 10062 10063 static void 10064 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10065 { 10066 struct locked_lba_range_ctx *ctx = _ctx; 10067 10068 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 10069 free(ctx); 10070 } 10071 10072 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 10073 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 10074 10075 static void 10076 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10077 { 10078 struct locked_lba_range_ctx *ctx = _ctx; 10079 10080 if (status == -ENOMEM) { 10081 /* One of the channels could not allocate a range object. 10082 * So we have to go back and clean up any ranges that were 10083 * allocated successfully before we return error status to 10084 * the caller. We can reuse the unlock function to do that 10085 * clean up. 10086 */ 10087 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10088 bdev_lock_error_cleanup_cb); 10089 return; 10090 } 10091 10092 /* All channels have locked this range and no I/O overlapping the range 10093 * are outstanding! Set the owner_ch for the range object for the 10094 * locking channel, so that this channel will know that it is allowed 10095 * to write to this range. 10096 */ 10097 if (ctx->owner_range != NULL) { 10098 ctx->owner_range->owner_ch = ctx->range.owner_ch; 10099 } 10100 10101 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10102 10103 /* Don't free the ctx here. Its range is in the bdev's global list of 10104 * locked ranges still, and will be removed and freed when this range 10105 * is later unlocked. 10106 */ 10107 } 10108 10109 static int 10110 bdev_lock_lba_range_check_io(void *_i) 10111 { 10112 struct spdk_bdev_channel_iter *i = _i; 10113 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 10114 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10115 struct locked_lba_range_ctx *ctx = i->ctx; 10116 struct lba_range *range = ctx->current_range; 10117 struct spdk_bdev_io *bdev_io; 10118 10119 spdk_poller_unregister(&ctx->poller); 10120 10121 /* The range is now in the locked_ranges, so no new IO can be submitted to this 10122 * range. But we need to wait until any outstanding IO overlapping with this range 10123 * are completed. 10124 */ 10125 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 10126 if (bdev_io_range_is_locked(bdev_io, range)) { 10127 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 10128 return SPDK_POLLER_BUSY; 10129 } 10130 } 10131 10132 spdk_bdev_for_each_channel_continue(i, 0); 10133 return SPDK_POLLER_BUSY; 10134 } 10135 10136 static void 10137 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10138 struct spdk_io_channel *_ch, void *_ctx) 10139 { 10140 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10141 struct locked_lba_range_ctx *ctx = _ctx; 10142 struct lba_range *range; 10143 10144 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10145 if (range->length == ctx->range.length && 10146 range->offset == ctx->range.offset && 10147 range->locked_ctx == ctx->range.locked_ctx) { 10148 /* This range already exists on this channel, so don't add 10149 * it again. This can happen when a new channel is created 10150 * while the for_each_channel operation is in progress. 
10151 * Do not check for outstanding I/O in that case, since the 10152 * range was locked before any I/O could be submitted to the 10153 * new channel. 10154 */ 10155 spdk_bdev_for_each_channel_continue(i, 0); 10156 return; 10157 } 10158 } 10159 10160 range = calloc(1, sizeof(*range)); 10161 if (range == NULL) { 10162 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 10163 return; 10164 } 10165 10166 range->length = ctx->range.length; 10167 range->offset = ctx->range.offset; 10168 range->locked_ctx = ctx->range.locked_ctx; 10169 range->quiesce = ctx->range.quiesce; 10170 ctx->current_range = range; 10171 if (ctx->range.owner_ch == ch) { 10172 /* This is the range object for the channel that will hold 10173 * the lock. Store it in the ctx object so that we can easily 10174 * set its owner_ch after the lock is finally acquired. 10175 */ 10176 ctx->owner_range = range; 10177 } 10178 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 10179 bdev_lock_lba_range_check_io(i); 10180 } 10181 10182 static void 10183 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 10184 { 10185 assert(spdk_get_thread() == ctx->range.owner_thread); 10186 assert(ctx->range.owner_ch == NULL || 10187 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 10188 10189 /* We will add a copy of this range to each channel now. */ 10190 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 10191 bdev_lock_lba_range_cb); 10192 } 10193 10194 static bool 10195 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 10196 { 10197 struct lba_range *r; 10198 10199 TAILQ_FOREACH(r, tailq, tailq) { 10200 if (bdev_lba_range_overlapped(range, r)) { 10201 return true; 10202 } 10203 } 10204 return false; 10205 } 10206 10207 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status); 10208 10209 static int 10210 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 10211 uint64_t offset, uint64_t length, 10212 lock_range_cb cb_fn, void *cb_arg) 10213 { 10214 struct locked_lba_range_ctx *ctx; 10215 10216 ctx = calloc(1, sizeof(*ctx)); 10217 if (ctx == NULL) { 10218 return -ENOMEM; 10219 } 10220 10221 ctx->range.offset = offset; 10222 ctx->range.length = length; 10223 ctx->range.owner_thread = spdk_get_thread(); 10224 ctx->range.owner_ch = ch; 10225 ctx->range.locked_ctx = cb_arg; 10226 ctx->range.bdev = bdev; 10227 ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked); 10228 ctx->cb_fn = cb_fn; 10229 ctx->cb_arg = cb_arg; 10230 10231 spdk_spin_lock(&bdev->internal.spinlock); 10232 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 10233 /* There is an active lock overlapping with this range. 10234 * Put it on the pending list until this range no 10235 * longer overlaps with another. 
10236 */ 10237 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 10238 } else { 10239 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 10240 bdev_lock_lba_range_ctx(bdev, ctx); 10241 } 10242 spdk_spin_unlock(&bdev->internal.spinlock); 10243 return 0; 10244 } 10245 10246 static int 10247 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10248 uint64_t offset, uint64_t length, 10249 lock_range_cb cb_fn, void *cb_arg) 10250 { 10251 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10252 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10253 10254 if (cb_arg == NULL) { 10255 SPDK_ERRLOG("cb_arg must not be NULL\n"); 10256 return -EINVAL; 10257 } 10258 10259 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 10260 } 10261 10262 static void 10263 bdev_lock_lba_range_ctx_msg(void *_ctx) 10264 { 10265 struct locked_lba_range_ctx *ctx = _ctx; 10266 10267 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 10268 } 10269 10270 static void 10271 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10272 { 10273 struct locked_lba_range_ctx *ctx = _ctx; 10274 struct locked_lba_range_ctx *pending_ctx; 10275 struct lba_range *range, *tmp; 10276 10277 spdk_spin_lock(&bdev->internal.spinlock); 10278 /* Check if there are any pending locked ranges that overlap with this range 10279 * that was just unlocked. If there are, check that it doesn't overlap with any 10280 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 10281 * the lock process. 10282 */ 10283 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 10284 if (bdev_lba_range_overlapped(range, &ctx->range) && 10285 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 10286 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 10287 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10288 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 10289 spdk_thread_send_msg(pending_ctx->range.owner_thread, 10290 bdev_lock_lba_range_ctx_msg, pending_ctx); 10291 } 10292 } 10293 spdk_spin_unlock(&bdev->internal.spinlock); 10294 10295 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10296 free(ctx); 10297 } 10298 10299 static void 10300 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10301 struct spdk_io_channel *_ch, void *_ctx) 10302 { 10303 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10304 struct locked_lba_range_ctx *ctx = _ctx; 10305 TAILQ_HEAD(, spdk_bdev_io) io_locked; 10306 struct spdk_bdev_io *bdev_io; 10307 struct lba_range *range; 10308 10309 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10310 if (ctx->range.offset == range->offset && 10311 ctx->range.length == range->length && 10312 ctx->range.locked_ctx == range->locked_ctx) { 10313 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 10314 free(range); 10315 break; 10316 } 10317 } 10318 10319 /* Note: we should almost always be able to assert that the range specified 10320 * was found. But there are some very rare corner cases where a new channel 10321 * gets created simultaneously with a range unlock, where this function 10322 * would execute on that new channel and wouldn't have the range. 10323 * We also use this to clean up range allocations when a later allocation 10324 * fails in the locking path. 10325 * So we can't actually assert() here. 
10326 */ 10327 10328 /* Swap the locked IO into a temporary list, and then try to submit them again. 10329 * We could hyper-optimize this to only resubmit locked I/O that overlap 10330 * with the range that was just unlocked, but this isn't a performance path so 10331 * we go for simplicity here. 10332 */ 10333 TAILQ_INIT(&io_locked); 10334 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 10335 while (!TAILQ_EMPTY(&io_locked)) { 10336 bdev_io = TAILQ_FIRST(&io_locked); 10337 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 10338 bdev_io_submit(bdev_io); 10339 } 10340 10341 spdk_bdev_for_each_channel_continue(i, 0); 10342 } 10343 10344 static int 10345 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 10346 lock_range_cb cb_fn, void *cb_arg) 10347 { 10348 struct locked_lba_range_ctx *ctx; 10349 struct lba_range *range; 10350 10351 spdk_spin_lock(&bdev->internal.spinlock); 10352 /* To start the unlock the process, we find the range in the bdev's locked_ranges 10353 * and remove it. This ensures new channels don't inherit the locked range. 10354 * Then we will send a message to each channel to remove the range from its 10355 * per-channel list. 10356 */ 10357 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 10358 if (range->offset == offset && range->length == length && 10359 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 10360 break; 10361 } 10362 } 10363 if (range == NULL) { 10364 assert(false); 10365 spdk_spin_unlock(&bdev->internal.spinlock); 10366 return -EINVAL; 10367 } 10368 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 10369 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10370 spdk_spin_unlock(&bdev->internal.spinlock); 10371 10372 ctx->cb_fn = cb_fn; 10373 ctx->cb_arg = cb_arg; 10374 10375 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10376 bdev_unlock_lba_range_cb); 10377 return 0; 10378 } 10379 10380 static int 10381 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10382 uint64_t offset, uint64_t length, 10383 lock_range_cb cb_fn, void *cb_arg) 10384 { 10385 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10386 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10387 struct lba_range *range; 10388 bool range_found = false; 10389 10390 /* Let's make sure the specified channel actually has a lock on 10391 * the specified range. Note that the range must match exactly. 
10392 */ 10393 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10394 if (range->offset == offset && range->length == length && 10395 range->owner_ch == ch && range->locked_ctx == cb_arg) { 10396 range_found = true; 10397 break; 10398 } 10399 } 10400 10401 if (!range_found) { 10402 return -EINVAL; 10403 } 10404 10405 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 10406 } 10407 10408 struct bdev_quiesce_ctx { 10409 spdk_bdev_quiesce_cb cb_fn; 10410 void *cb_arg; 10411 }; 10412 10413 static void 10414 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 10415 { 10416 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10417 10418 if (quiesce_ctx->cb_fn != NULL) { 10419 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10420 } 10421 10422 free(quiesce_ctx); 10423 } 10424 10425 static void 10426 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 10427 { 10428 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10429 struct spdk_bdev_module *module = range->bdev->module; 10430 10431 if (status != 0) { 10432 if (quiesce_ctx->cb_fn != NULL) { 10433 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10434 } 10435 free(quiesce_ctx); 10436 return; 10437 } 10438 10439 spdk_spin_lock(&module->internal.spinlock); 10440 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 10441 spdk_spin_unlock(&module->internal.spinlock); 10442 10443 if (quiesce_ctx->cb_fn != NULL) { 10444 /* copy the context in case the range is unlocked by the callback */ 10445 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 10446 10447 quiesce_ctx->cb_fn = NULL; 10448 quiesce_ctx->cb_arg = NULL; 10449 10450 tmp.cb_fn(tmp.cb_arg, status); 10451 } 10452 /* quiesce_ctx will be freed on unquiesce */ 10453 } 10454 10455 static int 10456 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10457 uint64_t offset, uint64_t length, 10458 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 10459 bool unquiesce) 10460 { 10461 struct bdev_quiesce_ctx *quiesce_ctx; 10462 int rc; 10463 10464 if (module != bdev->module) { 10465 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 10466 return -EINVAL; 10467 } 10468 10469 if (!bdev_io_valid_blocks(bdev, offset, length)) { 10470 return -EINVAL; 10471 } 10472 10473 if (unquiesce) { 10474 struct lba_range *range; 10475 10476 /* Make sure the specified range is actually quiesced in the specified module and 10477 * then remove it from the list. Note that the range must match exactly. 
10478 */ 10479 spdk_spin_lock(&module->internal.spinlock); 10480 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10481 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10482 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10483 break; 10484 } 10485 } 10486 spdk_spin_unlock(&module->internal.spinlock); 10487 10488 if (range == NULL) { 10489 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10490 return -EINVAL; 10491 } 10492 10493 quiesce_ctx = range->locked_ctx; 10494 quiesce_ctx->cb_fn = cb_fn; 10495 quiesce_ctx->cb_arg = cb_arg; 10496 10497 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10498 } else { 10499 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10500 if (quiesce_ctx == NULL) { 10501 return -ENOMEM; 10502 } 10503 10504 quiesce_ctx->cb_fn = cb_fn; 10505 quiesce_ctx->cb_arg = cb_arg; 10506 10507 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10508 if (rc != 0) { 10509 free(quiesce_ctx); 10510 } 10511 } 10512 10513 return rc; 10514 } 10515 10516 int 10517 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10518 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10519 { 10520 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10521 } 10522 10523 int 10524 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10525 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10526 { 10527 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10528 } 10529 10530 int 10531 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10532 uint64_t offset, uint64_t length, 10533 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10534 { 10535 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10536 } 10537 10538 int 10539 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10540 uint64_t offset, uint64_t length, 10541 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10542 { 10543 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10544 } 10545 10546 int 10547 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10548 int array_size) 10549 { 10550 if (!bdev) { 10551 return -EINVAL; 10552 } 10553 10554 if (bdev->fn_table->get_memory_domains) { 10555 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10556 } 10557 10558 return 0; 10559 } 10560 10561 struct spdk_bdev_for_each_io_ctx { 10562 void *ctx; 10563 spdk_bdev_io_fn fn; 10564 spdk_bdev_for_each_io_cb cb; 10565 }; 10566 10567 static void 10568 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10569 struct spdk_io_channel *io_ch, void *_ctx) 10570 { 10571 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10572 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10573 struct spdk_bdev_io *bdev_io; 10574 int rc = 0; 10575 10576 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10577 rc = ctx->fn(ctx->ctx, bdev_io); 10578 if (rc != 0) { 10579 break; 10580 } 10581 } 10582 10583 spdk_bdev_for_each_channel_continue(i, rc); 10584 } 10585 10586 static void 10587 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10588 { 10589 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10590 10591 ctx->cb(ctx->ctx, status); 10592 10593 free(ctx); 10594 } 10595 10596 void 10597 
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10598 spdk_bdev_for_each_io_cb cb) 10599 { 10600 struct spdk_bdev_for_each_io_ctx *ctx; 10601 10602 assert(fn != NULL && cb != NULL); 10603 10604 ctx = calloc(1, sizeof(*ctx)); 10605 if (ctx == NULL) { 10606 SPDK_ERRLOG("Failed to allocate context.\n"); 10607 cb(_ctx, -ENOMEM); 10608 return; 10609 } 10610 10611 ctx->ctx = _ctx; 10612 ctx->fn = fn; 10613 ctx->cb = cb; 10614 10615 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10616 bdev_for_each_io_done); 10617 } 10618 10619 void 10620 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10621 { 10622 spdk_for_each_channel_continue(iter->i, status); 10623 } 10624 10625 static struct spdk_bdev * 10626 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10627 { 10628 void *io_device = spdk_io_channel_iter_get_io_device(i); 10629 10630 return __bdev_from_io_dev(io_device); 10631 } 10632 10633 static void 10634 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10635 { 10636 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10637 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10638 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10639 10640 iter->i = i; 10641 iter->fn(iter, bdev, ch, iter->ctx); 10642 } 10643 10644 static void 10645 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10646 { 10647 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10648 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10649 10650 iter->i = i; 10651 iter->cpl(bdev, iter->ctx, status); 10652 10653 free(iter); 10654 } 10655 10656 void 10657 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10658 void *ctx, spdk_bdev_for_each_channel_done cpl) 10659 { 10660 struct spdk_bdev_channel_iter *iter; 10661 10662 assert(bdev != NULL && fn != NULL && ctx != NULL); 10663 10664 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10665 if (iter == NULL) { 10666 SPDK_ERRLOG("Unable to allocate iterator\n"); 10667 assert(false); 10668 return; 10669 } 10670 10671 iter->fn = fn; 10672 iter->cpl = cpl; 10673 iter->ctx = ctx; 10674 10675 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10676 iter, bdev_each_channel_cpl); 10677 } 10678 10679 static void 10680 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10681 { 10682 struct spdk_bdev_io *parent_io = cb_arg; 10683 10684 spdk_bdev_free_io(bdev_io); 10685 10686 /* Check return status of write */ 10687 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10688 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10689 } 10690 10691 static void 10692 bdev_copy_do_write(void *_bdev_io) 10693 { 10694 struct spdk_bdev_io *bdev_io = _bdev_io; 10695 int rc; 10696 10697 /* Write blocks */ 10698 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10699 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10700 bdev_io->u.bdev.iovs[0].iov_base, 10701 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10702 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10703 10704 if (rc == -ENOMEM) { 10705 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10706 } else if (rc != 0) { 10707 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10708 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10709 } 10710 } 10711 10712 static void 10713 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10714 { 10715 struct spdk_bdev_io *parent_io = cb_arg; 10716 10717 spdk_bdev_free_io(bdev_io); 10718 10719 /* Check return status of read */ 10720 if (!success) { 10721 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10722 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10723 return; 10724 } 10725 10726 /* Do write */ 10727 bdev_copy_do_write(parent_io); 10728 } 10729 10730 static void 10731 bdev_copy_do_read(void *_bdev_io) 10732 { 10733 struct spdk_bdev_io *bdev_io = _bdev_io; 10734 int rc; 10735 10736 /* Read blocks */ 10737 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10738 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10739 bdev_io->u.bdev.iovs[0].iov_base, 10740 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10741 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10742 10743 if (rc == -ENOMEM) { 10744 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10745 } else if (rc != 0) { 10746 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10747 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10748 } 10749 } 10750 10751 static void 10752 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10753 { 10754 if (!success) { 10755 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10756 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10757 return; 10758 } 10759 10760 bdev_copy_do_read(bdev_io); 10761 } 10762 10763 int 10764 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10765 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10766 spdk_bdev_io_completion_cb cb, void *cb_arg) 10767 { 10768 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10769 struct spdk_bdev_io *bdev_io; 10770 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10771 10772 if (!desc->write) { 10773 return -EBADF; 10774 } 10775 10776 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10777 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10778 SPDK_DEBUGLOG(bdev, 10779 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10780 dst_offset_blocks, src_offset_blocks, num_blocks); 10781 return -EINVAL; 10782 } 10783 10784 bdev_io = bdev_channel_get_io(channel); 10785 if (!bdev_io) { 10786 return -ENOMEM; 10787 } 10788 10789 bdev_io->internal.ch = channel; 10790 bdev_io->internal.desc = desc; 10791 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10792 10793 
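	/* For a copy request, u.bdev.offset_blocks carries the destination offset,
	 * while the source offset is carried separately in u.bdev.copy.src_offset_blocks.
	 */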
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10794 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10795 bdev_io->u.bdev.num_blocks = num_blocks; 10796 bdev_io->u.bdev.memory_domain = NULL; 10797 bdev_io->u.bdev.memory_domain_ctx = NULL; 10798 bdev_io->u.bdev.iovs = NULL; 10799 bdev_io->u.bdev.iovcnt = 0; 10800 bdev_io->u.bdev.md_buf = NULL; 10801 bdev_io->u.bdev.accel_sequence = NULL; 10802 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10803 10804 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 10805 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 10806 return 0; 10807 } 10808 10809 10810 /* If the copy size is large and should be split, use the generic split logic 10811 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10812 * 10813 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10814 * emulate it using regular read and write requests otherwise. 10815 */ 10816 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10817 bdev_io->internal.f.split) { 10818 bdev_io_submit(bdev_io); 10819 return 0; 10820 } 10821 10822 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10823 10824 return 0; 10825 } 10826 10827 SPDK_LOG_REGISTER_COMPONENT(bdev) 10828 10829 static void 10830 bdev_trace(void) 10831 { 10832 struct spdk_trace_tpoint_opts opts[] = { 10833 { 10834 "BDEV_IO_START", TRACE_BDEV_IO_START, 10835 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1, 10836 { 10837 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10838 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10839 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10840 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10841 } 10842 }, 10843 { 10844 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10845 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0, 10846 { 10847 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10848 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10849 } 10850 }, 10851 { 10852 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10853 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10854 { 10855 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10856 } 10857 }, 10858 { 10859 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10860 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10861 { 10862 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10863 } 10864 }, 10865 }; 10866 10867 10868 spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b'); 10869 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10870 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10871 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10872 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10873 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0); 10874 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0); 10875 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_START, OBJECT_BDEV_IO, 0); 10876 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_DONE, OBJECT_BDEV_IO, 0); 10877 } 10878 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10879
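
/*
 * Illustrative usage sketch for the copy API implemented above. This is not
 * part of the bdev library itself; "desc", "ch", "dst_offset_blocks",
 * "src_offset_blocks", "num_blocks", "copy_done_cb" and "cb_arg" are assumed
 * to be provided by the caller:
 *
 *	int rc;
 *
 *	rc = spdk_bdev_copy_blocks(desc, ch, dst_offset_blocks, src_offset_blocks,
 *				   num_blocks, copy_done_cb, cb_arg);
 *	if (rc == -ENOMEM) {
 *		...no spdk_bdev_io was available; the caller may retry later,
 *		for example after queueing with spdk_bdev_queue_io_wait()...
 *	} else if (rc != 0) {
 *		SPDK_ERRLOG("copy submission failed: %s\n", spdk_strerror(-rc));
 *	}
 *
 * If the bdev does not support SPDK_BDEV_IO_TYPE_COPY, the request is emulated
 * above with a read into a bounce buffer followed by a write (see
 * bdev_copy_do_read() and bdev_copy_do_write()).
 */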