/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE 256
#define SPDK_BDEV_AUTO_EXAMINE true
#define BUF_SMALL_CACHE_SIZE 128
#define BUF_LARGE_CACHE_SIZE 16
#define NOMEM_THRESHOLD_COUNT 8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024)
#define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC (UINT64_MAX / (1024 * 1024))
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000

/* The maximum number of child requests for an UNMAP or WRITE ZEROES command
 * that are submitted at a time when splitting.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of child requests for a COPY command
 * that are submitted at a time when splitting.
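 * Like the UNMAP/WRITE ZEROES limit above, this caps how many child requests a
 * split COPY submits at a time.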
62 */ 63 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 64 65 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 66 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 67 #ifdef DEBUG 68 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 69 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 70 #else 71 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 72 #endif 73 74 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 75 const char *detail, struct spdk_bdev *bdev); 76 77 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 78 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 79 }; 80 81 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 82 83 RB_HEAD(bdev_name_tree, spdk_bdev_name); 84 85 static int 86 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 87 { 88 return strcmp(name1->name, name2->name); 89 } 90 91 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 92 93 struct spdk_bdev_mgr { 94 struct spdk_mempool *bdev_io_pool; 95 96 void *zero_buffer; 97 98 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 99 100 struct spdk_bdev_list bdevs; 101 struct bdev_name_tree bdev_names; 102 103 bool init_complete; 104 bool module_init_complete; 105 106 struct spdk_spinlock spinlock; 107 108 TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens; 109 110 #ifdef SPDK_CONFIG_VTUNE 111 __itt_domain *domain; 112 #endif 113 }; 114 115 static struct spdk_bdev_mgr g_bdev_mgr = { 116 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 117 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 118 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 119 .init_complete = false, 120 .module_init_complete = false, 121 .async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens), 122 }; 123 124 static void 125 __attribute__((constructor)) 126 _bdev_init(void) 127 { 128 spdk_spin_init(&g_bdev_mgr.spinlock); 129 } 130 131 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status); 132 133 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 134 135 struct lba_range { 136 struct spdk_bdev *bdev; 137 uint64_t offset; 138 uint64_t length; 139 bool quiesce; 140 void *locked_ctx; 141 struct spdk_thread *owner_thread; 142 struct spdk_bdev_channel *owner_ch; 143 TAILQ_ENTRY(lba_range) tailq; 144 TAILQ_ENTRY(lba_range) tailq_module; 145 }; 146 147 static struct spdk_bdev_opts g_bdev_opts = { 148 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 149 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 150 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 151 .iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE, 152 .iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE, 153 }; 154 155 static spdk_bdev_init_cb g_init_cb_fn = NULL; 156 static void *g_init_cb_arg = NULL; 157 158 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 159 static void *g_fini_cb_arg = NULL; 160 static struct spdk_thread *g_fini_thread = NULL; 161 162 struct spdk_bdev_qos_limit { 163 /** IOs or bytes allowed per second (i.e., 1s). */ 164 uint64_t limit; 165 166 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 167 * For remaining bytes, allowed to run negative if an I/O is submitted when 168 * some bytes are remaining, but the I/O is bigger than that amount. The 169 * excess will be deducted from the next timeslice. 170 */ 171 int64_t remaining_this_timeslice; 172 173 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). 
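	 * For example, with the default 1 ms timeslice (SPDK_BDEV_QOS_TIMESLICE_IN_USEC),
	 * a 1000 IO/s limit works out to 1 IO per timeslice, the same as the
	 * SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE floor.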
	 */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO.
	 * If the IO is allowed to pass, the quota will be reduced correspondingly.
	 */
	bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to rewind the quota once the IO was allowed to be sent by this
	 * limit but queued due to one of the further limits.
	 */
	void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one per rate limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their IO awaiting retry here. This makes it possible to retry sending
 * IO to one bdev after IO from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	struct spdk_poller *nomem_poller;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS (1 << 0)
#define BDEV_CH_QOS_ENABLED (1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Accel channel */
	struct spdk_io_channel *accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
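	 * bdev_io_increment_outstanding() bumps this counter together with
	 * shared_resource->io_outstanding, which aggregates I/O across all bdev
	 * channels that share the same io_device.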
285 */ 286 uint64_t io_outstanding; 287 288 /* 289 * List of all submitted I/Os including I/O that are generated via splitting. 290 */ 291 bdev_io_tailq_t io_submitted; 292 293 /* 294 * List of spdk_bdev_io that are currently queued because they write to a locked 295 * LBA range. 296 */ 297 bdev_io_tailq_t io_locked; 298 299 /* List of I/Os with accel sequence being currently executed */ 300 bdev_io_tailq_t io_accel_exec; 301 302 /* List of I/Os doing memory domain pull/push */ 303 bdev_io_tailq_t io_memory_domain; 304 305 uint32_t flags; 306 307 /* Counts number of bdev_io in the io_submitted TAILQ */ 308 uint16_t queue_depth; 309 310 uint16_t trace_id; 311 312 struct spdk_histogram_data *histogram; 313 314 #ifdef SPDK_CONFIG_VTUNE 315 uint64_t start_tsc; 316 uint64_t interval_tsc; 317 __itt_string_handle *handle; 318 struct spdk_bdev_io_stat *prev_stat; 319 #endif 320 321 lba_range_tailq_t locked_ranges; 322 323 /** List of I/Os queued by QoS. */ 324 bdev_io_tailq_t qos_queued_io; 325 }; 326 327 struct media_event_entry { 328 struct spdk_bdev_media_event event; 329 TAILQ_ENTRY(media_event_entry) tailq; 330 }; 331 332 #define MEDIA_EVENT_POOL_SIZE 64 333 334 struct spdk_bdev_desc { 335 struct spdk_bdev *bdev; 336 bool write; 337 bool memory_domains_supported; 338 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 339 struct spdk_bdev_open_opts opts; 340 struct spdk_thread *thread; 341 struct { 342 spdk_bdev_event_cb_t event_fn; 343 void *ctx; 344 } callback; 345 bool closed; 346 struct spdk_spinlock spinlock; 347 uint32_t refs; 348 TAILQ_HEAD(, media_event_entry) pending_media_events; 349 TAILQ_HEAD(, media_event_entry) free_media_events; 350 struct media_event_entry *media_events_buffer; 351 TAILQ_ENTRY(spdk_bdev_desc) link; 352 353 uint64_t timeout_in_sec; 354 spdk_bdev_io_timeout_cb cb_fn; 355 void *cb_arg; 356 struct spdk_poller *io_timeout_poller; 357 struct spdk_bdev_module_claim *claim; 358 }; 359 360 struct spdk_bdev_iostat_ctx { 361 struct spdk_bdev_io_stat *stat; 362 enum spdk_bdev_reset_stat_mode reset_mode; 363 spdk_bdev_get_device_stat_cb cb; 364 void *cb_arg; 365 }; 366 367 struct set_qos_limit_ctx { 368 void (*cb_fn)(void *cb_arg, int status); 369 void *cb_arg; 370 struct spdk_bdev *bdev; 371 }; 372 373 struct spdk_bdev_channel_iter { 374 spdk_bdev_for_each_channel_msg fn; 375 spdk_bdev_for_each_channel_done cpl; 376 struct spdk_io_channel_iter *i; 377 void *ctx; 378 }; 379 380 struct spdk_bdev_io_error_stat { 381 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 382 }; 383 384 enum bdev_io_retry_state { 385 BDEV_IO_RETRY_STATE_INVALID, 386 BDEV_IO_RETRY_STATE_PULL, 387 BDEV_IO_RETRY_STATE_PULL_MD, 388 BDEV_IO_RETRY_STATE_SUBMIT, 389 BDEV_IO_RETRY_STATE_PUSH, 390 BDEV_IO_RETRY_STATE_PUSH_MD, 391 }; 392 393 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 394 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 395 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 396 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 397 398 static inline void bdev_io_complete(void *ctx); 399 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 400 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io); 401 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io); 402 403 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 404 static int 
bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);

static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				      uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

static bool bdev_io_should_split(struct spdk_bdev_io *bdev_io);

#define bdev_get_ext_io_opt(opts, field, defval) \
	((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval))

static inline void
bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth++;
}

static inline void
bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth--;
}

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	/* Do not remove this statement: whenever you add a new field, update the
	 * SPDK_STATIC_ASSERT below and add a matching SET_FIELD statement for it.
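	 * For example, a hypothetical new uint32_t option would need a SET_FIELD()
	 * call both here and in spdk_bdev_set_opts(), plus an updated expected size
	 * in the assert below.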
*/ 488 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 489 490 #undef SET_FIELD 491 } 492 493 int 494 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 495 { 496 uint32_t min_pool_size; 497 498 if (!opts) { 499 SPDK_ERRLOG("opts cannot be NULL\n"); 500 return -1; 501 } 502 503 if (!opts->opts_size) { 504 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 505 return -1; 506 } 507 508 /* 509 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 510 * initialization. A second mgmt_ch will be created on the same thread when the application starts 511 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 512 */ 513 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 514 if (opts->bdev_io_pool_size < min_pool_size) { 515 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 516 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 517 spdk_thread_get_count()); 518 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 519 return -1; 520 } 521 522 #define SET_FIELD(field) \ 523 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 524 g_bdev_opts.field = opts->field; \ 525 } \ 526 527 SET_FIELD(bdev_io_pool_size); 528 SET_FIELD(bdev_io_cache_size); 529 SET_FIELD(bdev_auto_examine); 530 SET_FIELD(iobuf_small_cache_size); 531 SET_FIELD(iobuf_large_cache_size); 532 533 g_bdev_opts.opts_size = opts->opts_size; 534 535 #undef SET_FIELD 536 537 return 0; 538 } 539 540 static struct spdk_bdev * 541 bdev_get_by_name(const char *bdev_name) 542 { 543 struct spdk_bdev_name find; 544 struct spdk_bdev_name *res; 545 546 find.name = (char *)bdev_name; 547 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 548 if (res != NULL) { 549 return res->bdev; 550 } 551 552 return NULL; 553 } 554 555 struct spdk_bdev * 556 spdk_bdev_get_by_name(const char *bdev_name) 557 { 558 struct spdk_bdev *bdev; 559 560 spdk_spin_lock(&g_bdev_mgr.spinlock); 561 bdev = bdev_get_by_name(bdev_name); 562 spdk_spin_unlock(&g_bdev_mgr.spinlock); 563 564 return bdev; 565 } 566 567 struct bdev_io_status_string { 568 enum spdk_bdev_io_status status; 569 const char *str; 570 }; 571 572 static const struct bdev_io_status_string bdev_io_status_strings[] = { 573 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 574 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 575 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 576 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 577 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 578 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 579 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 580 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 581 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 582 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 583 }; 584 585 static const char * 586 bdev_io_status_get_string(enum spdk_bdev_io_status status) 587 { 588 uint32_t i; 589 590 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 591 if (bdev_io_status_strings[i].status == status) { 592 return bdev_io_status_strings[i].str; 593 } 594 } 595 596 return "reserved"; 597 } 598 599 struct spdk_bdev_wait_for_examine_ctx { 600 struct spdk_poller *poller; 601 spdk_bdev_wait_for_examine_cb cb_fn; 602 void *cb_arg; 603 }; 604 605 static bool bdev_module_all_actions_completed(void); 606 607 static int 608 bdev_wait_for_examine_cb(void *arg) 609 { 610 struct 
spdk_bdev_wait_for_examine_ctx *ctx = arg; 611 612 if (!bdev_module_all_actions_completed()) { 613 return SPDK_POLLER_IDLE; 614 } 615 616 spdk_poller_unregister(&ctx->poller); 617 ctx->cb_fn(ctx->cb_arg); 618 free(ctx); 619 620 return SPDK_POLLER_BUSY; 621 } 622 623 int 624 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 625 { 626 struct spdk_bdev_wait_for_examine_ctx *ctx; 627 628 ctx = calloc(1, sizeof(*ctx)); 629 if (ctx == NULL) { 630 return -ENOMEM; 631 } 632 ctx->cb_fn = cb_fn; 633 ctx->cb_arg = cb_arg; 634 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 635 636 return 0; 637 } 638 639 struct spdk_bdev_examine_item { 640 char *name; 641 TAILQ_ENTRY(spdk_bdev_examine_item) link; 642 }; 643 644 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 645 646 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 647 g_bdev_examine_allowlist); 648 649 static inline bool 650 bdev_examine_allowlist_check(const char *name) 651 { 652 struct spdk_bdev_examine_item *item; 653 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 654 if (strcmp(name, item->name) == 0) { 655 return true; 656 } 657 } 658 return false; 659 } 660 661 static inline void 662 bdev_examine_allowlist_remove(const char *name) 663 { 664 struct spdk_bdev_examine_item *item; 665 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 666 if (strcmp(name, item->name) == 0) { 667 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 668 free(item->name); 669 free(item); 670 break; 671 } 672 } 673 } 674 675 static inline void 676 bdev_examine_allowlist_free(void) 677 { 678 struct spdk_bdev_examine_item *item; 679 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 680 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 681 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 682 free(item->name); 683 free(item); 684 } 685 } 686 687 static inline bool 688 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 689 { 690 struct spdk_bdev_alias *tmp; 691 if (bdev_examine_allowlist_check(bdev->name)) { 692 return true; 693 } 694 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 695 if (bdev_examine_allowlist_check(tmp->alias.name)) { 696 return true; 697 } 698 } 699 return false; 700 } 701 702 static inline bool 703 bdev_ok_to_examine(struct spdk_bdev *bdev) 704 { 705 /* Some bdevs may not support the READ command. 706 * Do not try to examine them. 
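	 * Examine callbacks generally need to read metadata from the device, so there
	 * is nothing useful they could do with a bdev that cannot service reads.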
707 */ 708 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) { 709 return false; 710 } 711 712 if (g_bdev_opts.bdev_auto_examine) { 713 return true; 714 } else { 715 return bdev_in_examine_allowlist(bdev); 716 } 717 } 718 719 static void 720 bdev_examine(struct spdk_bdev *bdev) 721 { 722 struct spdk_bdev_module *module; 723 struct spdk_bdev_module_claim *claim, *tmpclaim; 724 uint32_t action; 725 726 if (!bdev_ok_to_examine(bdev)) { 727 return; 728 } 729 730 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 731 if (module->examine_config) { 732 spdk_spin_lock(&module->internal.spinlock); 733 action = module->internal.action_in_progress; 734 module->internal.action_in_progress++; 735 spdk_spin_unlock(&module->internal.spinlock); 736 module->examine_config(bdev); 737 if (action != module->internal.action_in_progress) { 738 SPDK_ERRLOG("examine_config for module %s did not call " 739 "spdk_bdev_module_examine_done()\n", module->name); 740 } 741 } 742 } 743 744 spdk_spin_lock(&bdev->internal.spinlock); 745 746 switch (bdev->internal.claim_type) { 747 case SPDK_BDEV_CLAIM_NONE: 748 /* Examine by all bdev modules */ 749 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 750 if (module->examine_disk) { 751 spdk_spin_lock(&module->internal.spinlock); 752 module->internal.action_in_progress++; 753 spdk_spin_unlock(&module->internal.spinlock); 754 spdk_spin_unlock(&bdev->internal.spinlock); 755 module->examine_disk(bdev); 756 spdk_spin_lock(&bdev->internal.spinlock); 757 } 758 } 759 break; 760 case SPDK_BDEV_CLAIM_EXCL_WRITE: 761 /* Examine by the one bdev module with a v1 claim */ 762 module = bdev->internal.claim.v1.module; 763 if (module->examine_disk) { 764 spdk_spin_lock(&module->internal.spinlock); 765 module->internal.action_in_progress++; 766 spdk_spin_unlock(&module->internal.spinlock); 767 spdk_spin_unlock(&bdev->internal.spinlock); 768 module->examine_disk(bdev); 769 return; 770 } 771 break; 772 default: 773 /* Examine by all bdev modules with a v2 claim */ 774 assert(claim_type_is_v2(bdev->internal.claim_type)); 775 /* 776 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 777 * list, perhaps accessing freed memory. Without protection, this could happen 778 * while the lock is dropped during the examine callback. 779 */ 780 bdev->internal.examine_in_progress++; 781 782 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 783 module = claim->module; 784 785 if (module == NULL) { 786 /* This is a vestigial claim, held by examine_count */ 787 continue; 788 } 789 790 if (module->examine_disk == NULL) { 791 continue; 792 } 793 794 spdk_spin_lock(&module->internal.spinlock); 795 module->internal.action_in_progress++; 796 spdk_spin_unlock(&module->internal.spinlock); 797 798 /* Call examine_disk without holding internal.spinlock. 
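		 * examine_disk() may call back into the bdev layer (e.g. to open the bdev
		 * or take a claim), which would need this spinlock and could deadlock.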
*/ 799 spdk_spin_unlock(&bdev->internal.spinlock); 800 module->examine_disk(bdev); 801 spdk_spin_lock(&bdev->internal.spinlock); 802 } 803 804 assert(bdev->internal.examine_in_progress > 0); 805 bdev->internal.examine_in_progress--; 806 if (bdev->internal.examine_in_progress == 0) { 807 /* Remove any claims that were released during examine_disk */ 808 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 809 if (claim->desc != NULL) { 810 continue; 811 } 812 813 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 814 free(claim); 815 } 816 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 817 claim_reset(bdev); 818 } 819 } 820 } 821 822 spdk_spin_unlock(&bdev->internal.spinlock); 823 } 824 825 int 826 spdk_bdev_examine(const char *name) 827 { 828 struct spdk_bdev *bdev; 829 struct spdk_bdev_examine_item *item; 830 struct spdk_thread *thread = spdk_get_thread(); 831 832 if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) { 833 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 834 thread ? spdk_thread_get_name(thread) : "null"); 835 return -EINVAL; 836 } 837 838 if (g_bdev_opts.bdev_auto_examine) { 839 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n"); 840 return -EINVAL; 841 } 842 843 if (bdev_examine_allowlist_check(name)) { 844 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 845 return -EEXIST; 846 } 847 848 item = calloc(1, sizeof(*item)); 849 if (!item) { 850 return -ENOMEM; 851 } 852 item->name = strdup(name); 853 if (!item->name) { 854 free(item); 855 return -ENOMEM; 856 } 857 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 858 859 bdev = spdk_bdev_get_by_name(name); 860 if (bdev) { 861 bdev_examine(bdev); 862 } 863 return 0; 864 } 865 866 static inline void 867 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 868 { 869 struct spdk_bdev_examine_item *item; 870 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 871 spdk_json_write_object_begin(w); 872 spdk_json_write_named_string(w, "method", "bdev_examine"); 873 spdk_json_write_named_object_begin(w, "params"); 874 spdk_json_write_named_string(w, "name", item->name); 875 spdk_json_write_object_end(w); 876 spdk_json_write_object_end(w); 877 } 878 } 879 880 struct spdk_bdev * 881 spdk_bdev_first(void) 882 { 883 struct spdk_bdev *bdev; 884 885 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 886 if (bdev) { 887 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 888 } 889 890 return bdev; 891 } 892 893 struct spdk_bdev * 894 spdk_bdev_next(struct spdk_bdev *prev) 895 { 896 struct spdk_bdev *bdev; 897 898 bdev = TAILQ_NEXT(prev, internal.link); 899 if (bdev) { 900 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 901 } 902 903 return bdev; 904 } 905 906 static struct spdk_bdev * 907 _bdev_next_leaf(struct spdk_bdev *bdev) 908 { 909 while (bdev != NULL) { 910 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 911 return bdev; 912 } else { 913 bdev = TAILQ_NEXT(bdev, internal.link); 914 } 915 } 916 917 return bdev; 918 } 919 920 struct spdk_bdev * 921 spdk_bdev_first_leaf(void) 922 { 923 struct spdk_bdev *bdev; 924 925 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 926 927 if (bdev) { 928 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 929 } 930 931 return bdev; 932 } 933 934 struct spdk_bdev * 935 spdk_bdev_next_leaf(struct spdk_bdev *prev) 936 { 937 struct spdk_bdev *bdev; 938 939 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 940 941 if 
(bdev) { 942 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 943 } 944 945 return bdev; 946 } 947 948 static inline bool 949 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 950 { 951 return bdev_io->internal.f.has_memory_domain; 952 } 953 954 static inline bool 955 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 956 { 957 return bdev_io->internal.f.has_accel_sequence; 958 } 959 960 static inline uint32_t 961 bdev_desc_get_block_size(struct spdk_bdev_desc *desc) 962 { 963 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 964 965 if (spdk_unlikely(desc->opts.hide_metadata)) { 966 return bdev->blocklen - bdev->md_len; 967 } else { 968 return bdev->blocklen; 969 } 970 } 971 972 static inline uint32_t 973 bdev_io_get_block_size(struct spdk_bdev_io *bdev_io) 974 { 975 return bdev_desc_get_block_size(bdev_io->internal.desc); 976 } 977 978 static inline void 979 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource, 980 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 981 { 982 /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io. 983 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth 984 * channels we will instead wait for half to complete. 985 */ 986 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 987 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 988 989 assert(state != BDEV_IO_RETRY_STATE_INVALID); 990 bdev_io->internal.retry_state = state; 991 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 992 } 993 994 static inline void 995 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource, 996 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 997 { 998 /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while 999 * the queue isn't empty, so we don't need to update the nomem_threshold here */ 1000 assert(!TAILQ_EMPTY(&shared_resource->nomem_io)); 1001 1002 assert(state != BDEV_IO_RETRY_STATE_INVALID); 1003 bdev_io->internal.retry_state = state; 1004 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 1005 } 1006 1007 void 1008 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 1009 { 1010 struct iovec *iovs; 1011 1012 if (bdev_io->u.bdev.iovs == NULL) { 1013 bdev_io->u.bdev.iovs = &bdev_io->iov; 1014 bdev_io->u.bdev.iovcnt = 1; 1015 } 1016 1017 iovs = bdev_io->u.bdev.iovs; 1018 1019 assert(iovs != NULL); 1020 assert(bdev_io->u.bdev.iovcnt >= 1); 1021 1022 iovs[0].iov_base = buf; 1023 iovs[0].iov_len = len; 1024 } 1025 1026 void 1027 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1028 { 1029 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 1030 bdev_io->u.bdev.md_buf = md_buf; 1031 } 1032 1033 static bool 1034 _is_buf_allocated(const struct iovec *iovs) 1035 { 1036 if (iovs == NULL) { 1037 return false; 1038 } 1039 1040 return iovs[0].iov_base != NULL; 1041 } 1042 1043 static bool 1044 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 1045 { 1046 int i; 1047 uintptr_t iov_base; 1048 1049 if (spdk_likely(alignment == 1)) { 1050 return true; 1051 } 1052 1053 for (i = 0; i < iovcnt; i++) { 1054 iov_base = (uintptr_t)iovs[i].iov_base; 1055 if ((iov_base & (alignment - 1)) != 0) { 1056 return false; 1057 } 1058 } 1059 1060 return true; 1061 } 1062 1063 static inline bool 1064 
bdev_io_needs_metadata(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	return desc->opts.hide_metadata && bdev_io->bdev->md_len != 0;
}

static inline bool
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io_use_accel_sequence(bdev_io)) {
		return false;
	}

	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
	 * the bdev module didn't support accel sequences. */
	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split;
}

static inline void
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
}

static inline void
bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static void
bdev_io_submit_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;

	assert(bdev_io_use_accel_sequence(bdev_io));

	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io->internal.f.has_accel_sequence = false;

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	bdev_io_submit(bdev_io);
}

static void
bdev_io_exec_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
	assert(bdev_io_use_accel_sequence(bdev_io));

	/* Since the operations are appended during submission, they're in the opposite order than
	 * how we want to execute them for reads (i.e. we need to execute the most recently added
	 * operation first), so reverse the sequence before executing it.
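	 * Writes are left in appended order, since their transforms need to run before
	 * the data is handed to the bdev module.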
1147 */ 1148 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1149 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1150 } 1151 1152 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1153 bdev_io_increment_outstanding(ch, ch->shared_resource); 1154 bdev_io->internal.data_transfer_cpl = cb_fn; 1155 1156 spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, 1157 bdev_io_exec_sequence_cb, bdev_io); 1158 } 1159 1160 static void 1161 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1162 { 1163 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1164 void *buf; 1165 1166 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1167 buf = bdev_io->internal.buf.ptr; 1168 bdev_io->internal.buf.ptr = NULL; 1169 bdev_io->internal.f.has_buf = false; 1170 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1171 bdev_io->internal.get_aux_buf_cb = NULL; 1172 } else { 1173 assert(bdev_io->internal.get_buf_cb != NULL); 1174 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1175 bdev_io->internal.get_buf_cb = NULL; 1176 } 1177 } 1178 1179 static void 1180 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1181 { 1182 struct spdk_bdev_io *bdev_io = ctx; 1183 1184 if (rc) { 1185 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1186 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1187 } 1188 bdev_io_get_buf_complete(bdev_io, !rc); 1189 } 1190 1191 static void 1192 bdev_io_pull_md_buf_done(void *ctx, int status) 1193 { 1194 struct spdk_bdev_io *bdev_io = ctx; 1195 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1196 1197 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1198 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1199 1200 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1201 bdev_ch_retry_io(ch); 1202 } 1203 1204 assert(bdev_io->internal.data_transfer_cpl); 1205 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1206 } 1207 1208 static void 1209 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io) 1210 { 1211 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1212 int rc = 0; 1213 1214 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1215 assert(bdev_io->internal.f.has_bounce_buf); 1216 if (bdev_io_use_memory_domain(bdev_io)) { 1217 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1218 bdev_io_increment_outstanding(ch, ch->shared_resource); 1219 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1220 bdev_io->internal.memory_domain_ctx, 1221 &bdev_io->internal.bounce_buf.orig_md_iov, 1, 1222 &bdev_io->internal.bounce_buf.md_iov, 1, 1223 bdev_io_pull_md_buf_done, bdev_io); 1224 if (rc == 0) { 1225 /* Continue to submit IO in completion callback */ 1226 return; 1227 } 1228 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1229 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1230 if (rc != -ENOMEM) { 1231 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1232 spdk_memory_domain_get_dma_device_id( 1233 bdev_io->internal.memory_domain), rc); 1234 } 1235 } else { 1236 memcpy(bdev_io->internal.bounce_buf.md_iov.iov_base, 1237 bdev_io->internal.bounce_buf.orig_md_iov.iov_base, 1238 bdev_io->internal.bounce_buf.orig_md_iov.iov_len); 1239 } 1240 } 1241 1242 if (spdk_unlikely(rc == -ENOMEM)) { 1243 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD); 1244 } else { 1245 assert(bdev_io->internal.data_transfer_cpl); 1246 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1247 } 
1248 } 1249 1250 static void 1251 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1252 { 1253 assert(bdev_io->internal.f.has_bounce_buf); 1254 1255 /* save original md_buf */ 1256 bdev_io->internal.bounce_buf.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1257 bdev_io->internal.bounce_buf.orig_md_iov.iov_len = len; 1258 bdev_io->internal.bounce_buf.md_iov.iov_base = md_buf; 1259 bdev_io->internal.bounce_buf.md_iov.iov_len = len; 1260 /* set bounce md_buf */ 1261 bdev_io->u.bdev.md_buf = md_buf; 1262 1263 bdev_io_pull_md_buf(bdev_io); 1264 } 1265 1266 static void 1267 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1268 { 1269 struct spdk_bdev *bdev = bdev_io->bdev; 1270 uint64_t md_len; 1271 void *buf; 1272 1273 if (spdk_bdev_is_md_separate(bdev)) { 1274 assert(!bdev_io_use_accel_sequence(bdev_io)); 1275 1276 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1277 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1278 1279 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1280 1281 if (bdev_io->u.bdev.md_buf != NULL) { 1282 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1283 return; 1284 } else { 1285 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1286 } 1287 } 1288 1289 bdev_io_get_buf_complete(bdev_io, true); 1290 } 1291 1292 static inline void 1293 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc) 1294 { 1295 if (rc) { 1296 SPDK_ERRLOG("Failed to get data buffer\n"); 1297 assert(bdev_io->internal.data_transfer_cpl); 1298 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1299 return; 1300 } 1301 1302 _bdev_io_set_md_buf(bdev_io); 1303 } 1304 1305 static void 1306 bdev_io_pull_data_done_and_track(void *ctx, int status) 1307 { 1308 struct spdk_bdev_io *bdev_io = ctx; 1309 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1310 1311 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1312 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1313 1314 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1315 bdev_ch_retry_io(ch); 1316 } 1317 1318 bdev_io_pull_data_done(bdev_io, status); 1319 } 1320 1321 static void 1322 bdev_io_pull_data(struct spdk_bdev_io *bdev_io) 1323 { 1324 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1325 int rc = 0; 1326 1327 assert(bdev_io->internal.f.has_bounce_buf); 1328 1329 /* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a 1330 * sequence, append a copy operation making accel change the src/dst buffers of the previous 1331 * operation */ 1332 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) || 1333 (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) { 1334 assert(bdev_io_use_accel_sequence(bdev_io)); 1335 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1336 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1337 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1338 NULL, NULL, 1339 bdev_io->internal.bounce_buf.orig_iovs, 1340 bdev_io->internal.bounce_buf.orig_iovcnt, 1341 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 1342 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 1343 NULL, NULL); 1344 } else { 1345 /* We need to reverse the src/dst for reads */ 1346 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1347 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1348 bdev_io->internal.bounce_buf.orig_iovs, 1349 bdev_io->internal.bounce_buf.orig_iovcnt, 1350 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 1351 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 1352 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1353 NULL, NULL, NULL, NULL); 1354 } 1355 1356 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 1357 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1358 bdev_io->internal.accel_sequence); 1359 } 1360 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1361 /* if this is write path, copy data from original buffer to bounce buffer */ 1362 if (bdev_io_use_memory_domain(bdev_io)) { 1363 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1364 bdev_io_increment_outstanding(ch, ch->shared_resource); 1365 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1366 bdev_io->internal.memory_domain_ctx, 1367 bdev_io->internal.bounce_buf.orig_iovs, 1368 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1369 bdev_io->u.bdev.iovs, 1, 1370 bdev_io_pull_data_done_and_track, 1371 bdev_io); 1372 if (rc == 0) { 1373 /* Continue to submit IO in completion callback */ 1374 return; 1375 } 1376 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1377 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1378 if (rc != -ENOMEM) { 1379 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1380 spdk_memory_domain_get_dma_device_id( 1381 bdev_io->internal.memory_domain)); 1382 } 1383 } else { 1384 assert(bdev_io->u.bdev.iovcnt == 1); 1385 spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base, 1386 bdev_io->u.bdev.iovs[0].iov_len, 1387 bdev_io->internal.bounce_buf.orig_iovs, 1388 bdev_io->internal.bounce_buf.orig_iovcnt); 1389 } 1390 } 1391 1392 if (spdk_unlikely(rc == -ENOMEM)) { 1393 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1394 } else { 1395 bdev_io_pull_data_done(bdev_io, rc); 1396 } 1397 } 1398 1399 static void 1400 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1401 bdev_copy_bounce_buffer_cpl cpl_cb) 1402 { 1403 struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource; 1404 1405 assert(bdev_io->internal.f.has_bounce_buf == false); 1406 1407 bdev_io->internal.data_transfer_cpl = cpl_cb; 1408 bdev_io->internal.f.has_bounce_buf = true; 1409 /* save original iovec */ 1410 bdev_io->internal.bounce_buf.orig_iovs = bdev_io->u.bdev.iovs; 1411 bdev_io->internal.bounce_buf.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1412 /* zero the other data members */ 1413 bdev_io->internal.bounce_buf.iov.iov_base = NULL; 1414 bdev_io->internal.bounce_buf.md_iov.iov_base = NULL; 1415 bdev_io->internal.bounce_buf.orig_md_iov.iov_base = NULL; 1416 /* set bounce iov */ 1417 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_buf.iov; 1418 bdev_io->u.bdev.iovcnt = 1; 1419 /* set bounce buffer for this operation */ 1420 bdev_io->u.bdev.iovs[0].iov_base = buf; 1421 bdev_io->u.bdev.iovs[0].iov_len = len; 1422 /* Now we use 1 iov, the split condition could have been changed */ 1423 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 1424 1425 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 
1426 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1427 } else { 1428 bdev_io_pull_data(bdev_io); 1429 } 1430 } 1431 1432 static void 1433 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1434 { 1435 struct spdk_bdev *bdev = bdev_io->bdev; 1436 bool buf_allocated; 1437 uint64_t alignment; 1438 void *aligned_buf; 1439 1440 bdev_io->internal.buf.ptr = buf; 1441 bdev_io->internal.f.has_buf = true; 1442 1443 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1444 bdev_io_get_buf_complete(bdev_io, true); 1445 return; 1446 } 1447 1448 alignment = spdk_bdev_get_buf_align(bdev); 1449 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1450 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1451 1452 if (buf_allocated) { 1453 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1454 /* Continue in completion callback */ 1455 return; 1456 } else { 1457 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1458 } 1459 1460 _bdev_io_set_md_buf(bdev_io); 1461 } 1462 1463 static inline uint64_t 1464 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1465 { 1466 struct spdk_bdev *bdev = bdev_io->bdev; 1467 uint64_t md_len, alignment; 1468 1469 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1470 1471 /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */ 1472 alignment = spdk_bdev_get_buf_align(bdev) - 1; 1473 1474 return len + alignment + md_len; 1475 } 1476 1477 static void 1478 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1479 { 1480 struct spdk_bdev_mgmt_channel *ch; 1481 1482 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1483 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1484 } 1485 1486 static void 1487 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1488 { 1489 assert(bdev_io->internal.f.has_buf); 1490 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr, bdev_io->internal.buf.len); 1491 bdev_io->internal.buf.ptr = NULL; 1492 bdev_io->internal.f.has_buf = false; 1493 } 1494 1495 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_put_aux_buf, 1496 "spdk_bdev_io_put_aux_buf is deprecated", "v25.01", 0); 1497 1498 void 1499 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1500 { 1501 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1502 1503 SPDK_LOG_DEPRECATED(spdk_bdev_io_put_aux_buf); 1504 1505 assert(buf != NULL); 1506 _bdev_io_put_buf(bdev_io, buf, len); 1507 } 1508 1509 static inline void 1510 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, 1511 struct spdk_bdev_io *bdev_io) 1512 { 1513 /* After a request is submitted to a bdev module, the ownership of an accel sequence 1514 * associated with that bdev_io is transferred to the bdev module. So, clear the internal 1515 * sequence pointer to make sure we won't touch it anymore. 
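	 * If the module later completes this I/O with SPDK_BDEV_IO_STATUS_NOMEM, ownership
	 * moves back to the bdev layer and the flag is restored in _bdev_io_handle_no_mem().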
*/ 1516 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || 1517 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { 1518 assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1519 bdev_io->internal.f.has_accel_sequence = false; 1520 } 1521 1522 bdev->fn_table->submit_request(ioch, bdev_io); 1523 } 1524 1525 static inline void 1526 bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io) 1527 { 1528 struct spdk_bdev *bdev = bdev_io->bdev; 1529 1530 bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource); 1531 bdev_io->internal.error.nvme.cdw0 = 0; 1532 bdev_io->num_retries++; 1533 bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1534 } 1535 1536 static void 1537 bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource) 1538 { 1539 struct spdk_bdev_io *bdev_io; 1540 1541 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1542 /* 1543 * Allow some more I/O to complete before retrying the nomem_io queue. 1544 * Some drivers (such as nvme) cannot immediately take a new I/O in 1545 * the context of a completion, because the resources for the I/O are 1546 * not released until control returns to the bdev poller. Also, we 1547 * may require several small I/O to complete before a larger I/O 1548 * (that requires splitting) can be submitted. 1549 */ 1550 return; 1551 } 1552 1553 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1554 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1555 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1556 1557 switch (bdev_io->internal.retry_state) { 1558 case BDEV_IO_RETRY_STATE_SUBMIT: 1559 bdev_ch_resubmit_io(shared_resource, bdev_io); 1560 break; 1561 case BDEV_IO_RETRY_STATE_PULL: 1562 bdev_io_pull_data(bdev_io); 1563 break; 1564 case BDEV_IO_RETRY_STATE_PULL_MD: 1565 bdev_io_pull_md_buf(bdev_io); 1566 break; 1567 case BDEV_IO_RETRY_STATE_PUSH: 1568 bdev_io_push_bounce_data(bdev_io); 1569 break; 1570 case BDEV_IO_RETRY_STATE_PUSH_MD: 1571 bdev_io_push_bounce_md_buf(bdev_io); 1572 break; 1573 default: 1574 assert(0 && "invalid retry state"); 1575 break; 1576 } 1577 1578 if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) { 1579 /* This IO completed again with NOMEM status, so break the loop and 1580 * don't try anymore. Note that a bdev_io that fails with NOMEM 1581 * always gets requeued at the front of the list, to maintain 1582 * ordering. 
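			 * bdev_queue_nomem_io_head() re-inserts it at the head, so finding this
			 * bdev_io again at TAILQ_FIRST() means the resubmission just failed.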
			 */
			break;
		}
	}
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
}

static int
bdev_no_mem_poller(void *ctx)
{
	struct spdk_bdev_shared_resource *shared_resource = ctx;

	spdk_poller_unregister(&shared_resource->nomem_poller);

	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_shared_ch_retry_io(shared_resource);
	}
	/* the retry cb may re-register the poller so double check */
	if (!TAILQ_EMPTY(&shared_resource->nomem_io) &&
	    shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) {
		/* No IOs were submitted, try again */
		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
					SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
	}

	return SPDK_POLLER_BUSY;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);

		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
			/* Special case: there are queued nomem IOs but no outstanding IOs whose
			 * completions could trigger a retry of the queue. Normally any newly
			 * submitted IO would trigger the retry, but when none arrive (e.g. qd==1),
			 * this poller retries the queue instead.
			 */
			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
						SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
		}
		/* If the bdev module completed an I/O that has an accel sequence with NOMEM status, the
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted. */
		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
			assert(!bdev_io_use_accel_sequence(bdev_io));
			bdev_io->internal.f.has_accel_sequence = true;
			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
		}

		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()).
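	 * bdev_io_put_buf() returns the buffer to the per-thread iobuf cache via the
	 * management channel (see _bdev_io_put_buf()).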
1666 */ 1667 bdev_io_put_buf(bdev_io); 1668 1669 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1670 bdev_ch_retry_io(ch); 1671 } 1672 1673 /* Continue with IO completion flow */ 1674 bdev_io_complete(bdev_io); 1675 } 1676 1677 static void 1678 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1679 { 1680 struct spdk_bdev_io *bdev_io = ctx; 1681 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1682 1683 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1684 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1685 bdev_io->internal.f.has_bounce_buf = false; 1686 1687 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1688 bdev_ch_retry_io(ch); 1689 } 1690 1691 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1692 } 1693 1694 static inline void 1695 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1696 { 1697 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1698 int rc = 0; 1699 1700 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1701 assert(bdev_io->internal.f.has_bounce_buf); 1702 1703 /* do the same for metadata buffer */ 1704 if (spdk_unlikely(bdev_io->internal.bounce_buf.orig_md_iov.iov_base != NULL)) { 1705 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1706 1707 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1708 if (bdev_io_use_memory_domain(bdev_io)) { 1709 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1710 bdev_io_increment_outstanding(ch, ch->shared_resource); 1711 /* If memory domain is used then we need to call async push function */ 1712 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1713 bdev_io->internal.memory_domain_ctx, 1714 &bdev_io->internal.bounce_buf.orig_md_iov, 1715 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1716 &bdev_io->internal.bounce_buf.md_iov, 1, 1717 bdev_io_push_bounce_md_buf_done, 1718 bdev_io); 1719 if (rc == 0) { 1720 /* Continue IO completion in async callback */ 1721 return; 1722 } 1723 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1724 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1725 if (rc != -ENOMEM) { 1726 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1727 spdk_memory_domain_get_dma_device_id( 1728 bdev_io->internal.memory_domain)); 1729 } 1730 } else { 1731 memcpy(bdev_io->internal.bounce_buf.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1732 bdev_io->internal.bounce_buf.orig_md_iov.iov_len); 1733 } 1734 } 1735 } 1736 1737 if (spdk_unlikely(rc == -ENOMEM)) { 1738 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1739 } else { 1740 assert(bdev_io->internal.data_transfer_cpl); 1741 bdev_io->internal.f.has_bounce_buf = false; 1742 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1743 } 1744 } 1745 1746 static inline void 1747 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc) 1748 { 1749 assert(bdev_io->internal.data_transfer_cpl); 1750 if (rc) { 1751 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1752 return; 1753 } 1754 1755 /* set original buffer for this io */ 1756 bdev_io->u.bdev.iovcnt = bdev_io->internal.bounce_buf.orig_iovcnt; 1757 bdev_io->u.bdev.iovs = bdev_io->internal.bounce_buf.orig_iovs; 1758 1759 /* We don't set bdev_io->internal.f.has_bounce_buf to false here because 1760 * we still need to clear the md buf */ 1761 1762 bdev_io_push_bounce_md_buf(bdev_io); 1763 } 1764 1765 static void 1766 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1767 { 1768 struct spdk_bdev_io *bdev_io = ctx; 1769 struct 
spdk_bdev_channel *ch = bdev_io->internal.ch; 1770 1771 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1772 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1773 1774 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1775 bdev_ch_retry_io(ch); 1776 } 1777 1778 bdev_io_push_bounce_data_done(bdev_io, status); 1779 } 1780 1781 static inline void 1782 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1783 { 1784 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1785 int rc = 0; 1786 1787 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1788 assert(!bdev_io_use_accel_sequence(bdev_io)); 1789 assert(bdev_io->internal.f.has_bounce_buf); 1790 1791 /* if this is read path, copy data from bounce buffer to original buffer */ 1792 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1793 if (bdev_io_use_memory_domain(bdev_io)) { 1794 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1795 bdev_io_increment_outstanding(ch, ch->shared_resource); 1796 /* If memory domain is used then we need to call async push function */ 1797 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1798 bdev_io->internal.memory_domain_ctx, 1799 bdev_io->internal.bounce_buf.orig_iovs, 1800 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1801 &bdev_io->internal.bounce_buf.iov, 1, 1802 bdev_io_push_bounce_data_done_and_track, 1803 bdev_io); 1804 if (rc == 0) { 1805 /* Continue IO completion in async callback */ 1806 return; 1807 } 1808 1809 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1810 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1811 if (rc != -ENOMEM) { 1812 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1813 spdk_memory_domain_get_dma_device_id( 1814 bdev_io->internal.memory_domain)); 1815 } 1816 } else { 1817 spdk_copy_buf_to_iovs(bdev_io->internal.bounce_buf.orig_iovs, 1818 bdev_io->internal.bounce_buf.orig_iovcnt, 1819 bdev_io->internal.bounce_buf.iov.iov_base, 1820 bdev_io->internal.bounce_buf.iov.iov_len); 1821 } 1822 } 1823 1824 if (spdk_unlikely(rc == -ENOMEM)) { 1825 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1826 } else { 1827 bdev_io_push_bounce_data_done(bdev_io, rc); 1828 } 1829 } 1830 1831 static inline void 1832 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1833 { 1834 bdev_io->internal.data_transfer_cpl = cpl_cb; 1835 bdev_io_push_bounce_data(bdev_io); 1836 } 1837 1838 static void 1839 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1840 { 1841 struct spdk_bdev_io *bdev_io; 1842 1843 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1844 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len); 1845 } 1846 1847 static void 1848 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1849 { 1850 struct spdk_bdev_mgmt_channel *mgmt_ch; 1851 uint64_t max_len; 1852 void *buf; 1853 1854 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1855 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1856 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1857 1858 if (spdk_unlikely(max_len > mgmt_ch->iobuf.cache[0].large.bufsize)) { 1859 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1860 bdev_io_get_buf_complete(bdev_io, false); 1861 return; 1862 } 1863 1864 bdev_io->internal.buf.len = len; 1865 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1866 bdev_io_get_iobuf_cb); 1867 if (buf != NULL) { 1868 
_bdev_io_set_buf(bdev_io, buf, len); 1869 } 1870 } 1871 1872 void 1873 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1874 { 1875 struct spdk_bdev *bdev = bdev_io->bdev; 1876 uint64_t alignment; 1877 1878 assert(cb != NULL); 1879 bdev_io->internal.get_buf_cb = cb; 1880 1881 alignment = spdk_bdev_get_buf_align(bdev); 1882 1883 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1884 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1885 /* Buffer already present and aligned */ 1886 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1887 return; 1888 } 1889 1890 bdev_io_get_buf(bdev_io, len); 1891 } 1892 1893 static void 1894 _bdev_io_get_bounce_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1895 uint64_t len) 1896 { 1897 assert(cb != NULL); 1898 bdev_io->internal.get_buf_cb = cb; 1899 1900 bdev_io_get_buf(bdev_io, len); 1901 } 1902 1903 1904 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_get_aux_buf, 1905 "spdk_bdev_io_get_aux_buf is deprecated", "v25.01", 0); 1906 1907 void 1908 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1909 { 1910 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1911 1912 SPDK_LOG_DEPRECATED(spdk_bdev_io_get_aux_buf); 1913 1914 assert(cb != NULL); 1915 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1916 bdev_io->internal.get_aux_buf_cb = cb; 1917 bdev_io_get_buf(bdev_io, len); 1918 } 1919 1920 static int 1921 bdev_module_get_max_ctx_size(void) 1922 { 1923 struct spdk_bdev_module *bdev_module; 1924 int max_bdev_module_size = 0; 1925 1926 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1927 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1928 max_bdev_module_size = bdev_module->get_ctx_size(); 1929 } 1930 } 1931 1932 return max_bdev_module_size; 1933 } 1934 1935 static void 1936 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1937 { 1938 if (!bdev->internal.histogram_enabled) { 1939 return; 1940 } 1941 1942 spdk_json_write_object_begin(w); 1943 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 1944 1945 spdk_json_write_named_object_begin(w, "params"); 1946 spdk_json_write_named_string(w, "name", bdev->name); 1947 1948 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1949 1950 if (bdev->internal.histogram_io_type) { 1951 spdk_json_write_named_string(w, "opc", 1952 spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type)); 1953 } 1954 1955 spdk_json_write_object_end(w); 1956 1957 spdk_json_write_object_end(w); 1958 } 1959 1960 static void 1961 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1962 { 1963 int i; 1964 struct spdk_bdev_qos *qos = bdev->internal.qos; 1965 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1966 1967 if (!qos) { 1968 return; 1969 } 1970 1971 spdk_bdev_get_qos_rate_limits(bdev, limits); 1972 1973 spdk_json_write_object_begin(w); 1974 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1975 1976 spdk_json_write_named_object_begin(w, "params"); 1977 spdk_json_write_named_string(w, "name", bdev->name); 1978 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1979 if (limits[i] > 0) { 1980 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1981 } 1982 } 1983 spdk_json_write_object_end(w); 1984 1985 spdk_json_write_object_end(w); 1986 } 1987 1988 void 1989 spdk_bdev_subsystem_config_json(struct 
spdk_json_write_ctx *w) 1990 { 1991 struct spdk_bdev_module *bdev_module; 1992 struct spdk_bdev *bdev; 1993 1994 assert(w != NULL); 1995 1996 spdk_json_write_array_begin(w); 1997 1998 spdk_json_write_object_begin(w); 1999 spdk_json_write_named_string(w, "method", "bdev_set_options"); 2000 spdk_json_write_named_object_begin(w, "params"); 2001 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 2002 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 2003 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 2004 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 2005 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 2006 spdk_json_write_object_end(w); 2007 spdk_json_write_object_end(w); 2008 2009 bdev_examine_allowlist_config_json(w); 2010 2011 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2012 if (bdev_module->config_json) { 2013 bdev_module->config_json(w); 2014 } 2015 } 2016 2017 spdk_spin_lock(&g_bdev_mgr.spinlock); 2018 2019 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 2020 if (bdev->fn_table->write_config_json) { 2021 bdev->fn_table->write_config_json(bdev, w); 2022 } 2023 2024 bdev_qos_config_json(bdev, w); 2025 bdev_enable_histogram_config_json(bdev, w); 2026 } 2027 2028 spdk_spin_unlock(&g_bdev_mgr.spinlock); 2029 2030 /* This has to be last RPC in array to make sure all bdevs finished examine */ 2031 spdk_json_write_object_begin(w); 2032 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 2033 spdk_json_write_object_end(w); 2034 2035 spdk_json_write_array_end(w); 2036 } 2037 2038 static void 2039 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 2040 { 2041 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2042 struct spdk_bdev_io *bdev_io; 2043 2044 spdk_iobuf_channel_fini(&ch->iobuf); 2045 2046 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 2047 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2048 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2049 ch->per_thread_cache_count--; 2050 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2051 } 2052 2053 assert(ch->per_thread_cache_count == 0); 2054 } 2055 2056 static int 2057 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 2058 { 2059 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2060 struct spdk_bdev_io *bdev_io; 2061 uint32_t i; 2062 int rc; 2063 2064 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 2065 g_bdev_opts.iobuf_small_cache_size, 2066 g_bdev_opts.iobuf_large_cache_size); 2067 if (rc != 0) { 2068 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 2069 return -1; 2070 } 2071 2072 STAILQ_INIT(&ch->per_thread_cache); 2073 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 2074 2075 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. 
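	 * (Illustrative note: each management channel takes bdev_io_cache_size
	 * structures out of the global bdev_io pool at creation time, so a
	 * submission on this thread can normally be served from the per-thread
	 * cache without touching the shared pool.)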
	 */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		if (bdev_io == NULL) {
			SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
			assert(false);
			bdev_mgmt_channel_destroy(io_device, ctx_buf);
			return -1;
		}
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static bool
bdev_module_all_actions_completed(void)
{
	struct spdk_bdev_module *m;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return false;
		}
	}
	return true;
}

static void
bdev_module_action_complete(void)
{
	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	if (!bdev_module_all_actions_completed()) {
		return;
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
2159 */ 2160 bdev_init_complete(0); 2161 } 2162 2163 static void 2164 bdev_module_action_done(struct spdk_bdev_module *module) 2165 { 2166 spdk_spin_lock(&module->internal.spinlock); 2167 assert(module->internal.action_in_progress > 0); 2168 module->internal.action_in_progress--; 2169 spdk_spin_unlock(&module->internal.spinlock); 2170 bdev_module_action_complete(); 2171 } 2172 2173 void 2174 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2175 { 2176 assert(module->async_init); 2177 bdev_module_action_done(module); 2178 } 2179 2180 void 2181 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2182 { 2183 bdev_module_action_done(module); 2184 } 2185 2186 /** The last initialized bdev module */ 2187 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2188 2189 static void 2190 bdev_init_failed(void *cb_arg) 2191 { 2192 struct spdk_bdev_module *module = cb_arg; 2193 2194 spdk_spin_lock(&module->internal.spinlock); 2195 assert(module->internal.action_in_progress > 0); 2196 module->internal.action_in_progress--; 2197 spdk_spin_unlock(&module->internal.spinlock); 2198 bdev_init_complete(-1); 2199 } 2200 2201 static int 2202 bdev_modules_init(void) 2203 { 2204 struct spdk_bdev_module *module; 2205 int rc = 0; 2206 2207 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2208 g_resume_bdev_module = module; 2209 if (module->async_init) { 2210 spdk_spin_lock(&module->internal.spinlock); 2211 module->internal.action_in_progress = 1; 2212 spdk_spin_unlock(&module->internal.spinlock); 2213 } 2214 rc = module->module_init(); 2215 if (rc != 0) { 2216 /* Bump action_in_progress to prevent other modules from completion of modules_init 2217 * Send message to defer application shutdown until resources are cleaned up */ 2218 spdk_spin_lock(&module->internal.spinlock); 2219 module->internal.action_in_progress = 1; 2220 spdk_spin_unlock(&module->internal.spinlock); 2221 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2222 return rc; 2223 } 2224 } 2225 2226 g_resume_bdev_module = NULL; 2227 return 0; 2228 } 2229 2230 void 2231 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2232 { 2233 int rc = 0; 2234 char mempool_name[32]; 2235 2236 assert(cb_fn != NULL); 2237 2238 g_init_cb_fn = cb_fn; 2239 g_init_cb_arg = cb_arg; 2240 2241 spdk_notify_type_register("bdev_register"); 2242 spdk_notify_type_register("bdev_unregister"); 2243 2244 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2245 2246 rc = spdk_iobuf_register_module("bdev"); 2247 if (rc != 0) { 2248 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2249 bdev_init_complete(-1); 2250 return; 2251 } 2252 2253 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2254 g_bdev_opts.bdev_io_pool_size, 2255 sizeof(struct spdk_bdev_io) + 2256 bdev_module_get_max_ctx_size(), 2257 0, 2258 SPDK_ENV_NUMA_ID_ANY); 2259 2260 if (g_bdev_mgr.bdev_io_pool == NULL) { 2261 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2262 bdev_init_complete(-1); 2263 return; 2264 } 2265 2266 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2267 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2268 if (!g_bdev_mgr.zero_buffer) { 2269 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2270 bdev_init_complete(-1); 2271 return; 2272 } 2273 2274 #ifdef SPDK_CONFIG_VTUNE 2275 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2276 #endif 2277 2278 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2279 
bdev_mgmt_channel_destroy, 2280 sizeof(struct spdk_bdev_mgmt_channel), 2281 "bdev_mgr"); 2282 2283 rc = bdev_modules_init(); 2284 g_bdev_mgr.module_init_complete = true; 2285 if (rc != 0) { 2286 SPDK_ERRLOG("bdev modules init failed\n"); 2287 return; 2288 } 2289 2290 bdev_module_action_complete(); 2291 } 2292 2293 static void 2294 bdev_mgr_unregister_cb(void *io_device) 2295 { 2296 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2297 2298 if (g_bdev_mgr.bdev_io_pool) { 2299 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2300 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2301 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2302 g_bdev_opts.bdev_io_pool_size); 2303 } 2304 2305 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2306 } 2307 2308 spdk_free(g_bdev_mgr.zero_buffer); 2309 2310 bdev_examine_allowlist_free(); 2311 2312 cb_fn(g_fini_cb_arg); 2313 g_fini_cb_fn = NULL; 2314 g_fini_cb_arg = NULL; 2315 g_bdev_mgr.init_complete = false; 2316 g_bdev_mgr.module_init_complete = false; 2317 } 2318 2319 static void 2320 bdev_module_fini_iter(void *arg) 2321 { 2322 struct spdk_bdev_module *bdev_module; 2323 2324 /* FIXME: Handling initialization failures is broken now, 2325 * so we won't even try cleaning up after successfully 2326 * initialized modules. if module_init_complete is false, 2327 * just call spdk_bdev_mgr_unregister_cb 2328 */ 2329 if (!g_bdev_mgr.module_init_complete) { 2330 bdev_mgr_unregister_cb(NULL); 2331 return; 2332 } 2333 2334 /* Start iterating from the last touched module */ 2335 if (!g_resume_bdev_module) { 2336 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2337 } else { 2338 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2339 internal.tailq); 2340 } 2341 2342 while (bdev_module) { 2343 if (bdev_module->async_fini) { 2344 /* Save our place so we can resume later. We must 2345 * save the variable here, before calling module_fini() 2346 * below, because in some cases the module may immediately 2347 * call spdk_bdev_module_fini_done() and re-enter 2348 * this function to continue iterating. */ 2349 g_resume_bdev_module = bdev_module; 2350 } 2351 2352 if (bdev_module->module_fini) { 2353 bdev_module->module_fini(); 2354 } 2355 2356 if (bdev_module->async_fini) { 2357 return; 2358 } 2359 2360 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2361 internal.tailq); 2362 } 2363 2364 g_resume_bdev_module = NULL; 2365 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2366 } 2367 2368 void 2369 spdk_bdev_module_fini_done(void) 2370 { 2371 if (spdk_get_thread() != g_fini_thread) { 2372 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2373 } else { 2374 bdev_module_fini_iter(NULL); 2375 } 2376 } 2377 2378 static void 2379 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2380 { 2381 struct spdk_bdev *bdev = cb_arg; 2382 2383 if (bdeverrno && bdev) { 2384 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2385 bdev->name); 2386 2387 /* 2388 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2389 * bdev; try to continue by manually removing this bdev from the list and continue 2390 * with the next bdev in the list. 
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
		/*
		 * Bdev module finish needs to be deferred as we might be in the middle of some context
		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
		 * after returning.
		 */
		spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
		return;
	}

	/*
	 * Unregister the last unclaimed bdev in the list, to ensure that bdev subsystem
	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
	 * base bdevs.
	 *
	 * Also, walk the list in the reverse order.
	 */
	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
		spdk_spin_lock(&bdev->internal.spinlock);
		if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
			LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev);
			spdk_spin_unlock(&bdev->internal.spinlock);
			continue;
		}
		spdk_spin_unlock(&bdev->internal.spinlock);

		SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
		return;
	}

	/*
	 * If any bdev fails to unclaim its underlying bdev properly, we may face the
	 * case of a bdev list consisting of claimed bdevs only (if claims are managed
	 * correctly, this would mean there's a loop in the claims graph, which is
	 * clearly impossible). In that case, warn and unregister the last bdev on the list.
	 */
	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
		SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
		return;
	}
}

static void
bdev_module_fini_start_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
	} else {
		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini_start) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling fini_start()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_fini_start_done() and re-enter
			 * this function to continue iterating.
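			 * (For instance, a module whose fini_start() completes synchronously
			 * may call spdk_bdev_module_fini_start_done() before fini_start()
			 * returns; that call re-enters this iterator, which picks up at the
			 * module preceding the one saved in g_resume_bdev_module.)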
*/ 2461 g_resume_bdev_module = bdev_module; 2462 } 2463 2464 if (bdev_module->fini_start) { 2465 bdev_module->fini_start(); 2466 } 2467 2468 if (bdev_module->async_fini_start) { 2469 return; 2470 } 2471 2472 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2473 } 2474 2475 g_resume_bdev_module = NULL; 2476 2477 bdev_finish_unregister_bdevs_iter(NULL, 0); 2478 } 2479 2480 void 2481 spdk_bdev_module_fini_start_done(void) 2482 { 2483 if (spdk_get_thread() != g_fini_thread) { 2484 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2485 } else { 2486 bdev_module_fini_start_iter(NULL); 2487 } 2488 } 2489 2490 static void 2491 bdev_finish_wait_for_examine_done(void *cb_arg) 2492 { 2493 bdev_module_fini_start_iter(NULL); 2494 } 2495 2496 static void bdev_open_async_fini(void); 2497 2498 void 2499 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2500 { 2501 int rc; 2502 2503 assert(cb_fn != NULL); 2504 2505 g_fini_thread = spdk_get_thread(); 2506 2507 g_fini_cb_fn = cb_fn; 2508 g_fini_cb_arg = cb_arg; 2509 2510 bdev_open_async_fini(); 2511 2512 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2513 if (rc != 0) { 2514 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2515 bdev_finish_wait_for_examine_done(NULL); 2516 } 2517 } 2518 2519 struct spdk_bdev_io * 2520 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2521 { 2522 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2523 struct spdk_bdev_io *bdev_io; 2524 2525 if (ch->per_thread_cache_count > 0) { 2526 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2527 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2528 ch->per_thread_cache_count--; 2529 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2530 /* 2531 * Don't try to look for bdev_ios in the global pool if there are 2532 * waiters on bdev_ios - we don't want this caller to jump the line. 2533 */ 2534 bdev_io = NULL; 2535 } else { 2536 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2537 } 2538 2539 return bdev_io; 2540 } 2541 2542 void 2543 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2544 { 2545 struct spdk_bdev_mgmt_channel *ch; 2546 2547 assert(bdev_io != NULL); 2548 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2549 2550 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2551 2552 if (bdev_io->internal.f.has_buf) { 2553 bdev_io_put_buf(bdev_io); 2554 } 2555 2556 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2557 ch->per_thread_cache_count++; 2558 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2559 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2560 struct spdk_bdev_io_wait_entry *entry; 2561 2562 entry = TAILQ_FIRST(&ch->io_wait_queue); 2563 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2564 entry->cb_fn(entry->cb_arg); 2565 } 2566 } else { 2567 /* We should never have a full cache with entries on the io wait queue. 
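		 * (Waiters are only queued when no bdev_io could be allocated, and the
		 * loop above hands freed bdev_ios to waiters before the cache can fill
		 * up again, so a full cache implies an empty wait queue.)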
		 */
		assert(TAILQ_EMPTY(&ch->io_wait_queue));
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}
}

static bool
bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
{
	assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);

	switch (limit) {
	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
		return true;
	case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
	case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
	case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
		return false;
	case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
	default:
		return false;
	}
}

static bool
bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return true;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		if (bdev_io->u.bdev.zcopy.start) {
			return true;
		} else {
			return false;
		}
	default:
		return false;
	}
}

static bool
bdev_is_read_io(struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		/* Bit 1 (0x2) set for read operation */
		if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
			return true;
		} else {
			return false;
		}
	case SPDK_BDEV_IO_TYPE_READ:
		return true;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Populate to read from disk */
		if (bdev_io->u.bdev.zcopy.populate) {
			return true;
		} else {
			return false;
		}
	default:
		return false;
	}
}

static uint64_t
bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
{
	uint32_t blocklen = bdev_io_get_block_size(bdev_io);

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return bdev_io->u.nvme_passthru.nbytes;
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return bdev_io->u.bdev.num_blocks * blocklen;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (bdev_io->u.bdev.zcopy.start) {
			return bdev_io->u.bdev.num_blocks * blocklen;
		} else {
			return 0;
		}
	default:
		return 0;
	}
}

static inline bool
bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta)
{
	int64_t remaining_this_timeslice;

	if (!limit->max_per_timeslice) {
		/* The QoS is disabled */
		return false;
	}

	remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta,
				   __ATOMIC_RELAXED);
	if (remaining_this_timeslice + (int64_t)delta > 0) {
		/* There was still a quota for this delta -> the IO shouldn't be queued
		 *
		 * We allow a slight quota overrun here so an IO bigger than the per-timeslice
		 * quota can be allowed once in a while. Such an overrun is then taken into account
		 * in the QoS poller, where the next timeslice quota is calculated.
		 */
		return false;
	}

	/* There was no quota for this delta -> the IO should be queued
	 * The remaining_this_timeslice must be rewound so it reflects the real
	 * amount of IOs or bytes allowed.
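	 *
	 * (Worked example, assuming a bytes-per-second limit with a 1 MiB
	 * per-timeslice quota: if 0.2 MiB of quota remains and a 1.5 MiB I/O
	 * arrives, the atomic subtraction leaves remaining_this_timeslice at
	 * -1.3 MiB, but because the pre-subtraction value was positive the I/O
	 * is still issued and the deficit is repaid from the next timeslice.
	 * A second I/O in the same timeslice sees a non-positive value, is
	 * queued, and has its delta added back below.)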
2686 */ 2687 __atomic_add_fetch( 2688 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2689 return true; 2690 } 2691 2692 static inline void 2693 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2694 { 2695 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2696 } 2697 2698 static bool 2699 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2700 { 2701 return bdev_qos_rw_queue_io(limit, io, 1); 2702 } 2703 2704 static void 2705 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2706 { 2707 bdev_qos_rw_rewind_io(limit, io, 1); 2708 } 2709 2710 static bool 2711 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2712 { 2713 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2714 } 2715 2716 static void 2717 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2718 { 2719 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2720 } 2721 2722 static bool 2723 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2724 { 2725 if (bdev_is_read_io(io) == false) { 2726 return false; 2727 } 2728 2729 return bdev_qos_rw_bps_queue(limit, io); 2730 } 2731 2732 static void 2733 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2734 { 2735 if (bdev_is_read_io(io) != false) { 2736 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2737 } 2738 } 2739 2740 static bool 2741 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2742 { 2743 if (bdev_is_read_io(io) == true) { 2744 return false; 2745 } 2746 2747 return bdev_qos_rw_bps_queue(limit, io); 2748 } 2749 2750 static void 2751 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2752 { 2753 if (bdev_is_read_io(io) != true) { 2754 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2755 } 2756 } 2757 2758 static void 2759 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2760 { 2761 int i; 2762 2763 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2764 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2765 qos->rate_limits[i].queue_io = NULL; 2766 continue; 2767 } 2768 2769 switch (i) { 2770 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2771 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2772 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2773 break; 2774 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2775 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2776 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2777 break; 2778 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2779 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2780 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2781 break; 2782 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2783 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2784 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2785 break; 2786 default: 2787 break; 2788 } 2789 } 2790 } 2791 2792 static void 2793 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2794 struct spdk_bdev_io *bdev_io, 2795 enum spdk_bdev_io_status status) 2796 { 2797 bdev_io->internal.f.in_submit_request = true; 2798 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2799 spdk_bdev_io_complete(bdev_io, status); 2800 bdev_io->internal.f.in_submit_request = false; 
2801 } 2802 2803 static inline void 2804 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2805 { 2806 struct spdk_bdev *bdev = bdev_io->bdev; 2807 struct spdk_io_channel *ch = bdev_ch->channel; 2808 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2809 2810 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2811 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2812 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2813 2814 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2815 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2816 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2817 SPDK_BDEV_IO_STATUS_SUCCESS); 2818 return; 2819 } 2820 } 2821 2822 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2823 bdev_io->bdev->split_on_write_unit && 2824 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2825 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2826 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2827 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2828 return; 2829 } 2830 2831 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2832 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2833 bdev_io->internal.f.in_submit_request = true; 2834 bdev_submit_request(bdev, ch, bdev_io); 2835 bdev_io->internal.f.in_submit_request = false; 2836 } else { 2837 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2838 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2839 /* Special case when we have nomem IOs and no outstanding IOs which completions 2840 * could trigger retry of queued IOs */ 2841 bdev_shared_ch_retry_io(shared_resource); 2842 } 2843 } 2844 } 2845 2846 static bool 2847 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2848 { 2849 int i; 2850 2851 if (bdev_qos_io_to_limit(bdev_io) == true) { 2852 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2853 if (!qos->rate_limits[i].queue_io) { 2854 continue; 2855 } 2856 2857 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2858 bdev_io) == true) { 2859 for (i -= 1; i >= 0 ; i--) { 2860 if (!qos->rate_limits[i].queue_io) { 2861 continue; 2862 } 2863 2864 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2865 } 2866 return true; 2867 } 2868 } 2869 } 2870 2871 return false; 2872 } 2873 2874 static int 2875 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2876 { 2877 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2878 int submitted_ios = 0; 2879 2880 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2881 if (!bdev_qos_queue_io(qos, bdev_io)) { 2882 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 2883 bdev_io_do_submit(ch, bdev_io); 2884 2885 submitted_ios++; 2886 } 2887 } 2888 2889 return submitted_ios; 2890 } 2891 2892 static void 2893 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2894 { 2895 int rc; 2896 2897 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2898 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2899 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2900 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2901 &bdev_io->internal.waitq_entry); 2902 if (rc != 0) { 2903 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2904 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 2905 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2906 } 2907 } 2908 2909 static bool 2910 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2911 { 2912 uint32_t io_boundary; 2913 struct spdk_bdev *bdev = bdev_io->bdev; 2914 uint32_t max_segment_size = bdev->max_segment_size; 2915 uint32_t max_size = bdev->max_rw_size; 2916 int max_segs = bdev->max_num_segments; 2917 2918 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2919 io_boundary = bdev->write_unit_size; 2920 } else if (bdev->split_on_optimal_io_boundary) { 2921 io_boundary = bdev->optimal_io_boundary; 2922 } else { 2923 io_boundary = 0; 2924 } 2925 2926 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 2927 return false; 2928 } 2929 2930 if (io_boundary) { 2931 uint64_t start_stripe, end_stripe; 2932 2933 start_stripe = bdev_io->u.bdev.offset_blocks; 2934 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2935 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2936 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2937 start_stripe >>= spdk_u32log2(io_boundary); 2938 end_stripe >>= spdk_u32log2(io_boundary); 2939 } else { 2940 start_stripe /= io_boundary; 2941 end_stripe /= io_boundary; 2942 } 2943 2944 if (start_stripe != end_stripe) { 2945 return true; 2946 } 2947 } 2948 2949 if (max_segs) { 2950 if (bdev_io->u.bdev.iovcnt > max_segs) { 2951 return true; 2952 } 2953 } 2954 2955 if (max_segment_size) { 2956 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2957 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 2958 return true; 2959 } 2960 } 2961 } 2962 2963 if (max_size) { 2964 if (bdev_io->u.bdev.num_blocks > max_size) { 2965 return true; 2966 } 2967 } 2968 2969 return false; 2970 } 2971 2972 static bool 2973 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2974 { 2975 uint32_t num_unmap_segments; 2976 2977 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2978 return false; 2979 } 2980 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2981 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2982 return true; 2983 } 2984 2985 return false; 2986 } 2987 2988 static bool 2989 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2990 { 2991 if (!bdev_io->bdev->max_write_zeroes) { 2992 return false; 2993 } 2994 2995 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2996 return true; 2997 } 2998 2999 return false; 3000 } 3001 3002 static bool 3003 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 3004 { 3005 if (bdev_io->bdev->max_copy != 0 && 3006 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 3007 return true; 3008 } 3009 3010 return false; 3011 } 3012 3013 static bool 3014 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 3015 { 3016 switch (bdev_io->type) { 3017 case SPDK_BDEV_IO_TYPE_READ: 3018 case SPDK_BDEV_IO_TYPE_WRITE: 3019 return bdev_rw_should_split(bdev_io); 3020 case SPDK_BDEV_IO_TYPE_UNMAP: 3021 return bdev_unmap_should_split(bdev_io); 3022 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3023 return bdev_write_zeroes_should_split(bdev_io); 3024 case SPDK_BDEV_IO_TYPE_COPY: 3025 return bdev_copy_should_split(bdev_io); 3026 default: 3027 return false; 3028 } 3029 } 3030 3031 static uint32_t 3032 _to_next_boundary(uint64_t offset, uint32_t boundary) 3033 { 3034 return (boundary - (offset % boundary)); 3035 } 3036 3037 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 3038 3039 static void _bdev_rw_split(void *_bdev_io); 3040 3041 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 3042 3043 static void 3044 _bdev_unmap_split(void *_bdev_io) 3045 { 3046 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 3047 } 3048 3049 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 3050 3051 static void 3052 _bdev_write_zeroes_split(void *_bdev_io) 3053 { 3054 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 3055 } 3056 3057 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 3058 3059 static void 3060 _bdev_copy_split(void *_bdev_io) 3061 { 3062 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 3063 } 3064 3065 static int 3066 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 3067 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 3068 { 3069 int rc; 3070 uint64_t current_offset, current_remaining, current_src_offset; 3071 spdk_bdev_io_wait_cb io_wait_fn; 3072 3073 current_offset = *offset; 3074 current_remaining = *remaining; 3075 3076 assert(bdev_io->internal.f.split); 3077 3078 bdev_io->internal.split.outstanding++; 3079 3080 io_wait_fn = _bdev_rw_split; 3081 switch (bdev_io->type) { 3082 case SPDK_BDEV_IO_TYPE_READ: 3083 assert(bdev_io->u.bdev.accel_sequence == NULL); 3084 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 3085 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3086 iov, iovcnt, md_buf, current_offset, 3087 num_blocks, 3088 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3089 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 3090 NULL, 3091 bdev_io->u.bdev.dif_check_flags, 3092 bdev_io_split_done, bdev_io); 3093 break; 3094 case SPDK_BDEV_IO_TYPE_WRITE: 3095 assert(bdev_io->u.bdev.accel_sequence == NULL); 3096 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3097 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3098 iov, iovcnt, md_buf, current_offset, 3099 num_blocks, 3100 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3101 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 3102 NULL, 3103 bdev_io->u.bdev.dif_check_flags, 3104 bdev_io->u.bdev.nvme_cdw12.raw, 3105 bdev_io->u.bdev.nvme_cdw13.raw, 3106 bdev_io_split_done, bdev_io); 3107 break; 3108 case SPDK_BDEV_IO_TYPE_UNMAP: 3109 io_wait_fn = _bdev_unmap_split; 3110 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3111 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3112 current_offset, num_blocks, 3113 bdev_io_split_done, bdev_io); 3114 break; 3115 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3116 io_wait_fn = _bdev_write_zeroes_split; 3117 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3118 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3119 current_offset, num_blocks, 3120 bdev_io_split_done, bdev_io); 3121 break; 3122 case SPDK_BDEV_IO_TYPE_COPY: 3123 io_wait_fn = _bdev_copy_split; 3124 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3125 (current_offset - bdev_io->u.bdev.offset_blocks); 3126 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3127 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3128 current_offset, current_src_offset, num_blocks, 3129 bdev_io_split_done, bdev_io); 3130 break; 3131 default: 3132 assert(false); 3133 rc = -EINVAL; 3134 break; 3135 } 3136 3137 if (rc == 0) { 3138 current_offset += num_blocks; 3139 current_remaining -= num_blocks; 3140 bdev_io->internal.split.current_offset_blocks = current_offset; 3141 bdev_io->internal.split.remaining_num_blocks = current_remaining; 3142 *offset = current_offset; 3143 *remaining = current_remaining; 3144 } else { 3145 bdev_io->internal.split.outstanding--; 3146 if (rc == -ENOMEM) { 3147 if (bdev_io->internal.split.outstanding == 0) { 3148 /* No I/O is outstanding. Hence we should wait here. */ 3149 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3150 } 3151 } else { 3152 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3153 if (bdev_io->internal.split.outstanding == 0) { 3154 bdev_ch_remove_from_io_submitted(bdev_io); 3155 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3156 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3157 bdev_io->internal.ch->queue_depth); 3158 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3159 } 3160 } 3161 } 3162 3163 return rc; 3164 } 3165 3166 static void 3167 _bdev_rw_split(void *_bdev_io) 3168 { 3169 struct iovec *parent_iov, *iov; 3170 struct spdk_bdev_io *bdev_io = _bdev_io; 3171 struct spdk_bdev *bdev = bdev_io->bdev; 3172 uint64_t parent_offset, current_offset, remaining; 3173 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3174 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3175 uint32_t iovcnt, iov_len, child_iovsize; 3176 uint32_t blocklen; 3177 uint32_t io_boundary; 3178 uint32_t max_segment_size = bdev->max_segment_size; 3179 uint32_t max_child_iovcnt = bdev->max_num_segments; 3180 uint32_t max_size = bdev->max_rw_size; 3181 void *md_buf = NULL; 3182 int rc; 3183 3184 blocklen = bdev_io_get_block_size(bdev_io); 3185 3186 max_size = max_size ? max_size : UINT32_MAX; 3187 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3188 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3189 SPDK_BDEV_IO_NUM_CHILD_IOV; 3190 3191 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3192 io_boundary = bdev->write_unit_size; 3193 } else if (bdev->split_on_optimal_io_boundary) { 3194 io_boundary = bdev->optimal_io_boundary; 3195 } else { 3196 io_boundary = UINT32_MAX; 3197 } 3198 3199 assert(bdev_io->internal.f.split); 3200 3201 remaining = bdev_io->internal.split.remaining_num_blocks; 3202 current_offset = bdev_io->internal.split.current_offset_blocks; 3203 parent_offset = bdev_io->u.bdev.offset_blocks; 3204 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3205 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3206 3207 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3208 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3209 if (parent_iov_offset < parent_iov->iov_len) { 3210 break; 3211 } 3212 parent_iov_offset -= parent_iov->iov_len; 3213 } 3214 3215 child_iovcnt = 0; 3216 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3217 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3218 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3219 to_next_boundary = spdk_min(remaining, to_next_boundary); 3220 to_next_boundary = spdk_min(max_size, to_next_boundary); 3221 to_next_boundary_bytes = to_next_boundary * blocklen; 3222 3223 iov = &bdev_io->child_iov[child_iovcnt]; 3224 iovcnt = 0; 3225 3226 if (bdev_io->u.bdev.md_buf) { 3227 md_buf = (char *)bdev_io->u.bdev.md_buf + 3228 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3229 } 3230 3231 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3232 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3233 iovcnt < child_iovsize) { 3234 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3235 iov_len = parent_iov->iov_len - parent_iov_offset; 3236 3237 iov_len = spdk_min(iov_len, max_segment_size); 3238 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3239 to_next_boundary_bytes -= iov_len; 3240 3241 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3242 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3243 3244 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3245 parent_iov_offset += iov_len; 3246 } else { 3247 parent_iovpos++; 3248 parent_iov_offset = 0; 3249 } 3250 child_iovcnt++; 3251 iovcnt++; 3252 } 3253 3254 if (to_next_boundary_bytes > 0) { 3255 /* We had to stop this child I/O early because we ran out of 3256 * child_iov space or were limited by max_num_segments. 3257 * Ensure the iovs to be aligned with block size and 3258 * then adjust to_next_boundary before starting the 3259 * child I/O. 
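			 *
			 * (Worked example, assuming 512-byte blocks: if this child was meant
			 * to cover 8 blocks (4096 bytes) but the iovs only gathered 3000
			 * bytes, then 1096 bytes remain, 1096 % 512 = 72, and 512 - 72 = 440
			 * bytes are trimmed from the tail iovs so the child ends on a block
			 * boundary; to_next_boundary shrinks by 1536 / 512 = 3 blocks and the
			 * child submits 5 blocks (2560 bytes).)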
			 */
			assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
			       iovcnt == child_iovsize);
			to_last_block_bytes = to_next_boundary_bytes % blocklen;
			if (to_last_block_bytes != 0) {
				uint32_t child_iovpos = child_iovcnt - 1;
				/* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV
				 * so the loop will naturally end
				 */

				to_last_block_bytes = blocklen - to_last_block_bytes;
				to_next_boundary_bytes += to_last_block_bytes;
				while (to_last_block_bytes > 0 && iovcnt > 0) {
					iov_len = spdk_min(to_last_block_bytes,
							   bdev_io->child_iov[child_iovpos].iov_len);
					bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
					if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
						child_iovpos--;
						if (--iovcnt == 0) {
							/* If the child IO is less than a block size, just return.
							 * If the first child IO of any split round is less than
							 * a block size, exit with an error.
							 */
							if (bdev_io->internal.split.outstanding == 0) {
								SPDK_ERRLOG("The first child io was less than a block size\n");
								bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
								bdev_ch_remove_from_io_submitted(bdev_io);
								spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id,
										  0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx,
										  bdev_io->internal.ch->queue_depth);
								bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
							}

							return;
						}
					}

					to_last_block_bytes -= iov_len;

					if (parent_iov_offset == 0) {
						parent_iovpos--;
						parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
					}
					parent_iov_offset -= iov_len;
				}

				assert(to_last_block_bytes == 0);
			}
			to_next_boundary -= to_next_boundary_bytes / blocklen;
		}

		rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
					  &current_offset, &remaining);
		if (spdk_unlikely(rc)) {
			return;
		}
	}
}

static void
bdev_unmap_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
	uint32_t num_children_reqs = 0;
	int rc;

	assert(bdev_io->internal.f.split);

	offset = bdev_io->internal.split.current_offset_blocks;
	remaining = bdev_io->internal.split.remaining_num_blocks;
	max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;

	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
		unmap_blocks = spdk_min(remaining, max_unmap_blocks);

		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
					  &offset, &remaining);
		if (spdk_likely(rc == 0)) {
			num_children_reqs++;
		} else {
			return;
		}
	}
}

static void
bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset, write_zeroes_blocks, remaining;
	uint32_t num_children_reqs = 0;
	int rc;

	assert(bdev_io->internal.f.split);

	offset = bdev_io->internal.split.current_offset_blocks;
	remaining = bdev_io->internal.split.remaining_num_blocks;

	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
		write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);

		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
					  &offset, &remaining);
		if (spdk_likely(rc == 0)) {
			num_children_reqs++;
		} else {
3365 return; 3366 } 3367 } 3368 } 3369 3370 static void 3371 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3372 { 3373 uint64_t offset, copy_blocks, remaining; 3374 uint32_t num_children_reqs = 0; 3375 int rc; 3376 3377 assert(bdev_io->internal.f.split); 3378 3379 offset = bdev_io->internal.split.current_offset_blocks; 3380 remaining = bdev_io->internal.split.remaining_num_blocks; 3381 3382 assert(bdev_io->bdev->max_copy != 0); 3383 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3384 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3385 3386 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3387 &offset, &remaining); 3388 if (spdk_likely(rc == 0)) { 3389 num_children_reqs++; 3390 } else { 3391 return; 3392 } 3393 } 3394 } 3395 3396 static void 3397 parent_bdev_io_complete(void *ctx, int rc) 3398 { 3399 struct spdk_bdev_io *parent_io = ctx; 3400 3401 if (rc) { 3402 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3403 } 3404 3405 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3406 parent_io->internal.caller_ctx); 3407 } 3408 3409 static void 3410 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3411 { 3412 struct spdk_bdev_io *bdev_io = ctx; 3413 3414 /* u.bdev.accel_sequence should have already been cleared at this point */ 3415 assert(bdev_io->u.bdev.accel_sequence == NULL); 3416 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3417 bdev_io->internal.f.has_accel_sequence = false; 3418 3419 if (spdk_unlikely(status != 0)) { 3420 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3421 } 3422 3423 parent_bdev_io_complete(bdev_io, status); 3424 } 3425 3426 static void 3427 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3428 { 3429 struct spdk_bdev_io *parent_io = cb_arg; 3430 3431 spdk_bdev_free_io(bdev_io); 3432 3433 assert(parent_io->internal.f.split); 3434 3435 if (!success) { 3436 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3437 /* If any child I/O failed, stop further splitting process. */ 3438 parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks; 3439 parent_io->internal.split.remaining_num_blocks = 0; 3440 } 3441 parent_io->internal.split.outstanding--; 3442 if (parent_io->internal.split.outstanding != 0) { 3443 return; 3444 } 3445 3446 /* 3447 * Parent I/O finishes when all blocks are consumed. 3448 */ 3449 if (parent_io->internal.split.remaining_num_blocks == 0) { 3450 assert(parent_io->internal.cb != bdev_io_split_done); 3451 bdev_ch_remove_from_io_submitted(parent_io); 3452 spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id, 3453 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx, 3454 parent_io->internal.ch->queue_depth); 3455 3456 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3457 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3458 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3459 return; 3460 } else if (parent_io->internal.f.has_bounce_buf && 3461 !bdev_io_use_accel_sequence(bdev_io)) { 3462 /* bdev IO will be completed in the callback */ 3463 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3464 return; 3465 } 3466 } 3467 3468 parent_bdev_io_complete(parent_io, 0); 3469 return; 3470 } 3471 3472 /* 3473 * Continue with the splitting process. This function will complete the parent I/O if the 3474 * splitting is done. 
3475 */ 3476 switch (parent_io->type) { 3477 case SPDK_BDEV_IO_TYPE_READ: 3478 case SPDK_BDEV_IO_TYPE_WRITE: 3479 _bdev_rw_split(parent_io); 3480 break; 3481 case SPDK_BDEV_IO_TYPE_UNMAP: 3482 bdev_unmap_split(parent_io); 3483 break; 3484 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3485 bdev_write_zeroes_split(parent_io); 3486 break; 3487 case SPDK_BDEV_IO_TYPE_COPY: 3488 bdev_copy_split(parent_io); 3489 break; 3490 default: 3491 assert(false); 3492 break; 3493 } 3494 } 3495 3496 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3497 bool success); 3498 3499 static void 3500 bdev_io_split(struct spdk_bdev_io *bdev_io) 3501 { 3502 assert(bdev_io_should_split(bdev_io)); 3503 assert(bdev_io->internal.f.split); 3504 3505 bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3506 bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3507 bdev_io->internal.split.outstanding = 0; 3508 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3509 3510 switch (bdev_io->type) { 3511 case SPDK_BDEV_IO_TYPE_READ: 3512 case SPDK_BDEV_IO_TYPE_WRITE: 3513 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3514 _bdev_rw_split(bdev_io); 3515 } else { 3516 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3517 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3518 bdev_io->u.bdev.num_blocks * bdev_io_get_block_size(bdev_io)); 3519 } 3520 break; 3521 case SPDK_BDEV_IO_TYPE_UNMAP: 3522 bdev_unmap_split(bdev_io); 3523 break; 3524 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3525 bdev_write_zeroes_split(bdev_io); 3526 break; 3527 case SPDK_BDEV_IO_TYPE_COPY: 3528 bdev_copy_split(bdev_io); 3529 break; 3530 default: 3531 assert(false); 3532 break; 3533 } 3534 } 3535 3536 static void 3537 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3538 { 3539 if (!success) { 3540 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3541 return; 3542 } 3543 3544 _bdev_rw_split(bdev_io); 3545 } 3546 3547 static inline void 3548 _bdev_io_submit(struct spdk_bdev_io *bdev_io) 3549 { 3550 struct spdk_bdev *bdev = bdev_io->bdev; 3551 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3552 3553 if (spdk_likely(bdev_ch->flags == 0)) { 3554 bdev_io_do_submit(bdev_ch, bdev_io); 3555 return; 3556 } 3557 3558 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3559 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3560 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3561 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3562 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3563 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3564 } else { 3565 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3566 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3567 } 3568 } else { 3569 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3570 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3571 } 3572 } 3573 3574 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3575 3576 bool 3577 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3578 { 3579 if (range1->length == 0 || range2->length == 0) { 3580 return false; 3581 } 3582 3583 if (range1->offset + range1->length <= range2->offset) { 3584 return false; 3585 } 3586 3587 if (range2->offset + range2->length <= range1->offset) { 3588 return false; 3589 } 3590 
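	/* Neither range ends at or before the other begins, so they must overlap.
	 * For example, a range starting at block 100 with length 50 and a range
	 * starting at block 140 with length 20 share blocks 140-149.
	 */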
3591 return true; 3592 } 3593 3594 static bool 3595 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3596 { 3597 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3598 struct lba_range r; 3599 3600 switch (bdev_io->type) { 3601 case SPDK_BDEV_IO_TYPE_NVME_IO: 3602 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3603 /* Don't try to decode the NVMe command - just assume worst-case and that 3604 * it overlaps a locked range. 3605 */ 3606 return true; 3607 case SPDK_BDEV_IO_TYPE_READ: 3608 if (!range->quiesce) { 3609 return false; 3610 } 3611 /* fallthrough */ 3612 case SPDK_BDEV_IO_TYPE_WRITE: 3613 case SPDK_BDEV_IO_TYPE_UNMAP: 3614 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3615 case SPDK_BDEV_IO_TYPE_ZCOPY: 3616 case SPDK_BDEV_IO_TYPE_COPY: 3617 r.offset = bdev_io->u.bdev.offset_blocks; 3618 r.length = bdev_io->u.bdev.num_blocks; 3619 if (!bdev_lba_range_overlapped(range, &r)) { 3620 /* This I/O doesn't overlap the specified LBA range. */ 3621 return false; 3622 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3623 /* This I/O overlaps, but the I/O is on the same channel that locked this 3624 * range, and the caller_ctx is the same as the locked_ctx. This means 3625 * that this I/O is associated with the lock, and is allowed to execute. 3626 */ 3627 return false; 3628 } else { 3629 return true; 3630 } 3631 default: 3632 return false; 3633 } 3634 } 3635 3636 void 3637 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3638 { 3639 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3640 3641 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3642 3643 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3644 struct lba_range *range; 3645 3646 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3647 if (bdev_io_range_is_locked(bdev_io, range)) { 3648 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3649 return; 3650 } 3651 } 3652 } 3653 3654 bdev_ch_add_to_io_submitted(bdev_io); 3655 3656 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3657 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 3658 ch->trace_id, bdev_io->u.bdev.num_blocks, 3659 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3660 bdev_io->u.bdev.offset_blocks, ch->queue_depth); 3661 3662 if (bdev_io->internal.f.split) { 3663 bdev_io_split(bdev_io); 3664 return; 3665 } 3666 3667 _bdev_io_submit(bdev_io); 3668 } 3669 3670 static inline int 3671 bdev_io_init_dif_ctx(struct spdk_bdev_io *bdev_io) 3672 { 3673 struct spdk_bdev *bdev = bdev_io->bdev; 3674 struct spdk_dif_ctx_init_ext_opts dif_opts; 3675 3676 memset(&bdev_io->u.bdev.dif_err, 0, sizeof(struct spdk_dif_error)); 3677 3678 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 3679 dif_opts.dif_pi_format = bdev->dif_pi_format; 3680 3681 return spdk_dif_ctx_init(&bdev_io->u.bdev.dif_ctx, 3682 bdev->blocklen, 3683 bdev->md_len, 3684 bdev->md_interleave, 3685 bdev->dif_is_head_of_md, 3686 bdev->dif_type, 3687 bdev_io->u.bdev.dif_check_flags, 3688 bdev_io->u.bdev.offset_blocks & 0xFFFFFFFF, 3689 0xFFFF, 0, 0, 0, &dif_opts); 3690 } 3691 3692 static void 3693 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3694 bool success) 3695 { 3696 if (!success) { 3697 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 3698 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3699 bdev_io_complete_unsubmitted(bdev_io); 3700 return; 3701 } 3702 3703 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 3704 if 
(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3705 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3706 return; 3707 } 3708 /* For reads we'll execute the sequence after the data is read, so, for now, only 3709 * clear out accel_sequence pointer and submit the IO */ 3710 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3711 bdev_io->u.bdev.accel_sequence = NULL; 3712 } 3713 3714 bdev_io_submit(bdev_io); 3715 } 3716 3717 static inline void 3718 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3719 { 3720 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3721 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3722 * For write operation we need to pull buffers from memory domain before submitting IO. 3723 * Once read operation completes, we need to use memory_domain push functionality to 3724 * update data in original memory domain IO buffer 3725 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3726 assert(bdev_io->internal.f.has_memory_domain); 3727 bdev_io->u.bdev.memory_domain = NULL; 3728 bdev_io->u.bdev.memory_domain_ctx = NULL; 3729 _bdev_io_get_bounce_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3730 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3731 } 3732 3733 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3734 * support them, but we need to execute an accel sequence and the data buffer is from accel 3735 * memory domain (to avoid doing a push/pull from that domain). 3736 */ 3737 static inline bool 3738 bdev_io_needs_bounce_buffer(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3739 { 3740 if (bdev_io_use_memory_domain(bdev_io)) { 3741 if (!desc->memory_domains_supported || 3742 (bdev_io_needs_sequence_exec(desc, bdev_io) && 3743 bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3744 return true; 3745 } 3746 } 3747 3748 return false; 3749 } 3750 3751 static inline void 3752 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3753 { 3754 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3755 int rc; 3756 3757 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3758 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3759 bdev_io_complete_unsubmitted(bdev_io); 3760 return; 3761 } 3762 3763 if (bdev_io_needs_metadata(desc, bdev_io)) { 3764 rc = bdev_io_init_dif_ctx(bdev_io); 3765 if (spdk_unlikely(rc != 0)) { 3766 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3767 bdev_io_complete_unsubmitted(bdev_io); 3768 return; 3769 } 3770 } 3771 3772 if (bdev_io_needs_bounce_buffer(desc, bdev_io)) { 3773 _bdev_io_ext_use_bounce_buffer(bdev_io); 3774 return; 3775 } 3776 3777 if (bdev_io_needs_sequence_exec(desc, bdev_io)) { 3778 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3779 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3780 return; 3781 } 3782 /* For reads we'll execute the sequence after the data is read, so, for now, only 3783 * clear out accel_sequence pointer and submit the IO */ 3784 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3785 bdev_io->u.bdev.accel_sequence = NULL; 3786 } 3787 3788 bdev_io_submit(bdev_io); 3789 } 3790 3791 static void 3792 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3793 { 3794 struct spdk_bdev *bdev = bdev_io->bdev; 3795 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3796 struct spdk_io_channel *ch = bdev_ch->channel; 3797 3798 assert(bdev_io->internal.status == 
SPDK_BDEV_IO_STATUS_PENDING); 3799 3800 bdev_io->internal.f.in_submit_request = true; 3801 bdev_submit_request(bdev, ch, bdev_io); 3802 bdev_io->internal.f.in_submit_request = false; 3803 } 3804 3805 void 3806 bdev_io_init(struct spdk_bdev_io *bdev_io, 3807 struct spdk_bdev *bdev, void *cb_arg, 3808 spdk_bdev_io_completion_cb cb) 3809 { 3810 bdev_io->bdev = bdev; 3811 bdev_io->internal.f.raw = 0; 3812 bdev_io->internal.caller_ctx = cb_arg; 3813 bdev_io->internal.cb = cb; 3814 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3815 bdev_io->internal.f.in_submit_request = false; 3816 bdev_io->internal.error.nvme.cdw0 = 0; 3817 bdev_io->num_retries = 0; 3818 bdev_io->internal.get_buf_cb = NULL; 3819 bdev_io->internal.get_aux_buf_cb = NULL; 3820 bdev_io->internal.data_transfer_cpl = NULL; 3821 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 3822 } 3823 3824 static bool 3825 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3826 { 3827 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3828 } 3829 3830 bool 3831 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3832 { 3833 bool supported; 3834 3835 supported = bdev_io_type_supported(bdev, io_type); 3836 3837 if (!supported) { 3838 switch (io_type) { 3839 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3840 /* The bdev layer will emulate write zeroes as long as write is supported. */ 3841 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3842 break; 3843 default: 3844 break; 3845 } 3846 } 3847 3848 return supported; 3849 } 3850 3851 static const char *g_io_type_strings[] = { 3852 [SPDK_BDEV_IO_TYPE_READ] = "read", 3853 [SPDK_BDEV_IO_TYPE_WRITE] = "write", 3854 [SPDK_BDEV_IO_TYPE_UNMAP] = "unmap", 3855 [SPDK_BDEV_IO_TYPE_FLUSH] = "flush", 3856 [SPDK_BDEV_IO_TYPE_RESET] = "reset", 3857 [SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin", 3858 [SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io", 3859 [SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md", 3860 [SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes", 3861 [SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy", 3862 [SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info", 3863 [SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management", 3864 [SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append", 3865 [SPDK_BDEV_IO_TYPE_COMPARE] = "compare", 3866 [SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write", 3867 [SPDK_BDEV_IO_TYPE_ABORT] = "abort", 3868 [SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole", 3869 [SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data", 3870 [SPDK_BDEV_IO_TYPE_COPY] = "copy", 3871 [SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md", 3872 }; 3873 3874 const char * 3875 spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type) 3876 { 3877 if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) { 3878 return NULL; 3879 } 3880 3881 return g_io_type_strings[io_type]; 3882 } 3883 3884 int 3885 spdk_bdev_get_io_type(const char *io_type_string) 3886 { 3887 int i; 3888 3889 for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 3890 if (!strcmp(io_type_string, g_io_type_strings[i])) { 3891 return i; 3892 } 3893 } 3894 3895 return -1; 3896 } 3897 3898 uint64_t 3899 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3900 { 3901 return bdev_io->internal.submit_tsc; 3902 } 3903 3904 int 3905 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3906 { 3907 if (bdev->fn_table->dump_info_json) { 3908 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3909 } 3910 3911 return 0; 3912 } 3913 
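/*
 * Illustrative sketch, not part of bdev.c: how a caller might combine
 * spdk_bdev_io_type_supported() and spdk_bdev_get_io_type_name() to report whether a
 * bdev handles an optional I/O type. The function name and log text below are
 * assumptions made for this example only.
 *
 *	static void
 *	example_report_io_type(struct spdk_bdev *bdev, enum spdk_bdev_io_type type)
 *	{
 *		SPDK_NOTICELOG("bdev %s %s %s\n", spdk_bdev_get_name(bdev),
 *			       spdk_bdev_io_type_supported(bdev, type) ?
 *			       "supports" : "does not support",
 *			       spdk_bdev_get_io_type_name(type));
 *	}
 *
 * Note that WRITE_ZEROES is reported as supported whenever WRITE is, because the bdev
 * layer emulates it (see spdk_bdev_io_type_supported() above).
 */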
3914 static void 3915 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3916 { 3917 uint32_t max_per_timeslice = 0; 3918 int i; 3919 3920 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3921 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3922 qos->rate_limits[i].max_per_timeslice = 0; 3923 continue; 3924 } 3925 3926 max_per_timeslice = qos->rate_limits[i].limit * 3927 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3928 3929 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3930 qos->rate_limits[i].min_per_timeslice); 3931 3932 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3933 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE); 3934 } 3935 3936 bdev_qos_set_ops(qos); 3937 } 3938 3939 static void 3940 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3941 struct spdk_io_channel *io_ch, void *ctx) 3942 { 3943 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3944 int status; 3945 3946 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3947 3948 /* If all I/Os were sent, continue the iteration; otherwise stop it. */ 3949 /* TODO: round-robin across channels */ 3950 status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 0 : 1; 3951 3952 spdk_bdev_for_each_channel_continue(i, status); 3953 } 3954 3955 3956 static void 3957 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status) 3958 { 3959 3960 } 3961 3962 static int 3963 bdev_channel_poll_qos(void *arg) 3964 { 3965 struct spdk_bdev *bdev = arg; 3966 struct spdk_bdev_qos *qos = bdev->internal.qos; 3967 uint64_t now = spdk_get_ticks(); 3968 int i; 3969 int64_t remaining_last_timeslice; 3970 3971 if (spdk_unlikely(qos->thread == NULL)) { 3972 /* The old QoS was unbound for removal and the new QoS is not enabled yet. */ 3973 return SPDK_POLLER_IDLE; 3974 } 3975 3976 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3977 /* We received our callback earlier than expected - return 3978 * immediately and wait to do accounting until at least one 3979 * timeslice has actually expired. This should never happen 3980 * with a well-behaved timer implementation. 3981 */ 3982 return SPDK_POLLER_IDLE; 3983 } 3984 3985 /* Reset for the next round of rate limiting */ 3986 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3987 /* We may have allowed the IOs or bytes to slightly overrun in the last 3988 * timeslice. remaining_this_timeslice is signed, so if it's negative 3989 * here, we'll account for the overrun so that the next timeslice will 3990 * be appropriately reduced. 3991 */ 3992 remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice, 3993 0, __ATOMIC_RELAXED); 3994 if (remaining_last_timeslice < 0) { 3995 /* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos() 3996 * potentially use 2 atomic ops each, so they can intertwine. 3997 * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage.
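			 *
			 * Illustrative example (numbers assumed): if a 64 KiB write was admitted
			 * when only 4 KiB of byte budget remained, the slice ends with
			 * remaining_this_timeslice at roughly -60 KiB. The store below puts that
			 * negative value back, and the refill loop that follows adds
			 * max_per_timeslice on top of it, so the next timeslice starts with a
			 * correspondingly smaller budget instead of a full one.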
3998 */ 3999 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 4000 remaining_last_timeslice, __ATOMIC_RELAXED); 4001 } 4002 } 4003 4004 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 4005 qos->last_timeslice += qos->timeslice_size; 4006 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4007 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 4008 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 4009 } 4010 } 4011 4012 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 4013 bdev_channel_submit_qos_io_done); 4014 4015 return SPDK_POLLER_BUSY; 4016 } 4017 4018 static void 4019 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 4020 { 4021 struct spdk_bdev_shared_resource *shared_resource; 4022 struct lba_range *range; 4023 4024 bdev_free_io_stat(ch->stat); 4025 #ifdef SPDK_CONFIG_VTUNE 4026 bdev_free_io_stat(ch->prev_stat); 4027 #endif 4028 4029 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 4030 range = TAILQ_FIRST(&ch->locked_ranges); 4031 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 4032 free(range); 4033 } 4034 4035 spdk_put_io_channel(ch->channel); 4036 spdk_put_io_channel(ch->accel_channel); 4037 4038 shared_resource = ch->shared_resource; 4039 4040 assert(TAILQ_EMPTY(&ch->io_locked)); 4041 assert(TAILQ_EMPTY(&ch->io_submitted)); 4042 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 4043 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 4044 assert(ch->io_outstanding == 0); 4045 assert(shared_resource->ref > 0); 4046 shared_resource->ref--; 4047 if (shared_resource->ref == 0) { 4048 assert(shared_resource->io_outstanding == 0); 4049 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 4050 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 4051 spdk_poller_unregister(&shared_resource->nomem_poller); 4052 free(shared_resource); 4053 } 4054 } 4055 4056 static void 4057 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 4058 { 4059 struct spdk_bdev_qos *qos = bdev->internal.qos; 4060 int i; 4061 4062 assert(spdk_spin_held(&bdev->internal.spinlock)); 4063 4064 /* Rate limiting on this bdev enabled */ 4065 if (qos) { 4066 if (qos->ch == NULL) { 4067 struct spdk_io_channel *io_ch; 4068 4069 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 4070 bdev->name, spdk_get_thread()); 4071 4072 /* No qos channel has been selected, so set one up */ 4073 4074 /* Take another reference to ch */ 4075 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4076 assert(io_ch != NULL); 4077 qos->ch = ch; 4078 4079 qos->thread = spdk_io_channel_get_thread(io_ch); 4080 4081 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4082 if (bdev_qos_is_iops_rate_limit(i) == true) { 4083 qos->rate_limits[i].min_per_timeslice = 4084 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 4085 } else { 4086 qos->rate_limits[i].min_per_timeslice = 4087 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 4088 } 4089 4090 if (qos->rate_limits[i].limit == 0) { 4091 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 4092 } 4093 } 4094 bdev_qos_update_max_quota_per_timeslice(qos); 4095 qos->timeslice_size = 4096 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 4097 qos->last_timeslice = spdk_get_ticks(); 4098 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 4099 bdev, 4100 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 4101 } 4102 4103 ch->flags |= BDEV_CH_QOS_ENABLED; 4104 } 4105 } 4106 4107 struct poll_timeout_ctx { 4108 struct spdk_bdev_desc 
*desc; 4109 uint64_t timeout_in_sec; 4110 spdk_bdev_io_timeout_cb cb_fn; 4111 void *cb_arg; 4112 }; 4113 4114 static void 4115 bdev_desc_free(struct spdk_bdev_desc *desc) 4116 { 4117 spdk_spin_destroy(&desc->spinlock); 4118 free(desc->media_events_buffer); 4119 free(desc); 4120 } 4121 4122 static void 4123 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 4124 { 4125 struct poll_timeout_ctx *ctx = _ctx; 4126 struct spdk_bdev_desc *desc = ctx->desc; 4127 4128 free(ctx); 4129 4130 spdk_spin_lock(&desc->spinlock); 4131 desc->refs--; 4132 if (desc->closed == true && desc->refs == 0) { 4133 spdk_spin_unlock(&desc->spinlock); 4134 bdev_desc_free(desc); 4135 return; 4136 } 4137 spdk_spin_unlock(&desc->spinlock); 4138 } 4139 4140 static void 4141 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4142 struct spdk_io_channel *io_ch, void *_ctx) 4143 { 4144 struct poll_timeout_ctx *ctx = _ctx; 4145 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4146 struct spdk_bdev_desc *desc = ctx->desc; 4147 struct spdk_bdev_io *bdev_io; 4148 uint64_t now; 4149 4150 spdk_spin_lock(&desc->spinlock); 4151 if (desc->closed == true) { 4152 spdk_spin_unlock(&desc->spinlock); 4153 spdk_bdev_for_each_channel_continue(i, -1); 4154 return; 4155 } 4156 spdk_spin_unlock(&desc->spinlock); 4157 4158 now = spdk_get_ticks(); 4159 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 4160 /* Exclude any I/O that are generated via splitting. */ 4161 if (bdev_io->internal.cb == bdev_io_split_done) { 4162 continue; 4163 } 4164 4165 /* Once we find an I/O that has not timed out, we can immediately 4166 * exit the loop. 4167 */ 4168 if (now < (bdev_io->internal.submit_tsc + 4169 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 4170 goto end; 4171 } 4172 4173 if (bdev_io->internal.desc == desc) { 4174 ctx->cb_fn(ctx->cb_arg, bdev_io); 4175 } 4176 } 4177 4178 end: 4179 spdk_bdev_for_each_channel_continue(i, 0); 4180 } 4181 4182 static int 4183 bdev_poll_timeout_io(void *arg) 4184 { 4185 struct spdk_bdev_desc *desc = arg; 4186 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4187 struct poll_timeout_ctx *ctx; 4188 4189 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 4190 if (!ctx) { 4191 SPDK_ERRLOG("failed to allocate memory\n"); 4192 return SPDK_POLLER_BUSY; 4193 } 4194 ctx->desc = desc; 4195 ctx->cb_arg = desc->cb_arg; 4196 ctx->cb_fn = desc->cb_fn; 4197 ctx->timeout_in_sec = desc->timeout_in_sec; 4198 4199 /* Take a ref on the descriptor in case it gets closed while we are checking 4200 * all of the channels. 
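	 * If the descriptor is closed while the iteration is in flight, the last reference
	 * is dropped in bdev_channel_poll_timeout_io_done(), which then frees the
	 * descriptor via bdev_desc_free().
	 *
	 * An illustrative way this poller gets armed, via the public API defined further
	 * below (callback and context names assumed):
	 *
	 *	rc = spdk_bdev_set_timeout(desc, 30, example_timeout_cb, example_ctx);
	 *
	 * With that call, bdev_poll_timeout_io() runs periodically and invokes
	 * example_timeout_cb() for each I/O on this descriptor that has been outstanding
	 * for more than 30 seconds.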
4201 */ 4202 spdk_spin_lock(&desc->spinlock); 4203 desc->refs++; 4204 spdk_spin_unlock(&desc->spinlock); 4205 4206 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 4207 bdev_channel_poll_timeout_io_done); 4208 4209 return SPDK_POLLER_BUSY; 4210 } 4211 4212 int 4213 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 4214 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 4215 { 4216 assert(desc->thread == spdk_get_thread()); 4217 4218 spdk_poller_unregister(&desc->io_timeout_poller); 4219 4220 if (timeout_in_sec) { 4221 assert(cb_fn != NULL); 4222 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4223 desc, 4224 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4225 1000); 4226 if (desc->io_timeout_poller == NULL) { 4227 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4228 return -1; 4229 } 4230 } 4231 4232 desc->cb_fn = cb_fn; 4233 desc->cb_arg = cb_arg; 4234 desc->timeout_in_sec = timeout_in_sec; 4235 4236 return 0; 4237 } 4238 4239 static int 4240 bdev_channel_create(void *io_device, void *ctx_buf) 4241 { 4242 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4243 struct spdk_bdev_channel *ch = ctx_buf; 4244 struct spdk_io_channel *mgmt_io_ch; 4245 struct spdk_bdev_mgmt_channel *mgmt_ch; 4246 struct spdk_bdev_shared_resource *shared_resource; 4247 struct lba_range *range; 4248 4249 ch->bdev = bdev; 4250 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4251 if (!ch->channel) { 4252 return -1; 4253 } 4254 4255 ch->accel_channel = spdk_accel_get_io_channel(); 4256 if (!ch->accel_channel) { 4257 spdk_put_io_channel(ch->channel); 4258 return -1; 4259 } 4260 4261 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0, 4262 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4263 4264 assert(ch->histogram == NULL); 4265 if (bdev->internal.histogram_enabled) { 4266 ch->histogram = spdk_histogram_data_alloc(); 4267 if (ch->histogram == NULL) { 4268 SPDK_ERRLOG("Could not allocate histogram\n"); 4269 } 4270 } 4271 4272 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4273 if (!mgmt_io_ch) { 4274 spdk_put_io_channel(ch->channel); 4275 spdk_put_io_channel(ch->accel_channel); 4276 return -1; 4277 } 4278 4279 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4280 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4281 if (shared_resource->shared_ch == ch->channel) { 4282 spdk_put_io_channel(mgmt_io_ch); 4283 shared_resource->ref++; 4284 break; 4285 } 4286 } 4287 4288 if (shared_resource == NULL) { 4289 shared_resource = calloc(1, sizeof(*shared_resource)); 4290 if (shared_resource == NULL) { 4291 spdk_put_io_channel(ch->channel); 4292 spdk_put_io_channel(ch->accel_channel); 4293 spdk_put_io_channel(mgmt_io_ch); 4294 return -1; 4295 } 4296 4297 shared_resource->mgmt_ch = mgmt_ch; 4298 shared_resource->io_outstanding = 0; 4299 TAILQ_INIT(&shared_resource->nomem_io); 4300 shared_resource->nomem_threshold = 0; 4301 shared_resource->shared_ch = ch->channel; 4302 shared_resource->ref = 1; 4303 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4304 } 4305 4306 ch->io_outstanding = 0; 4307 TAILQ_INIT(&ch->locked_ranges); 4308 TAILQ_INIT(&ch->qos_queued_io); 4309 ch->flags = 0; 4310 ch->trace_id = bdev->internal.trace_id; 4311 ch->shared_resource = shared_resource; 4312 4313 TAILQ_INIT(&ch->io_submitted); 4314 TAILQ_INIT(&ch->io_locked); 4315 TAILQ_INIT(&ch->io_accel_exec); 4316 TAILQ_INIT(&ch->io_memory_domain); 4317 4318 ch->stat = bdev_alloc_io_stat(false); 
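	/* Per-channel counters accumulate here and are merged into bdev->internal.stat by
	 * bdev_channel_destroy(), so the statistics are not lost when the channel goes away.
	 */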
4319 if (ch->stat == NULL) { 4320 bdev_channel_destroy_resource(ch); 4321 return -1; 4322 } 4323 4324 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4325 4326 #ifdef SPDK_CONFIG_VTUNE 4327 { 4328 char *name; 4329 __itt_init_ittlib(NULL, 0); 4330 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4331 if (!name) { 4332 bdev_channel_destroy_resource(ch); 4333 return -1; 4334 } 4335 ch->handle = __itt_string_handle_create(name); 4336 free(name); 4337 ch->start_tsc = spdk_get_ticks(); 4338 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4339 ch->prev_stat = bdev_alloc_io_stat(false); 4340 if (ch->prev_stat == NULL) { 4341 bdev_channel_destroy_resource(ch); 4342 return -1; 4343 } 4344 } 4345 #endif 4346 4347 spdk_spin_lock(&bdev->internal.spinlock); 4348 bdev_enable_qos(bdev, ch); 4349 4350 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4351 struct lba_range *new_range; 4352 4353 new_range = calloc(1, sizeof(*new_range)); 4354 if (new_range == NULL) { 4355 spdk_spin_unlock(&bdev->internal.spinlock); 4356 bdev_channel_destroy_resource(ch); 4357 return -1; 4358 } 4359 new_range->length = range->length; 4360 new_range->offset = range->offset; 4361 new_range->locked_ctx = range->locked_ctx; 4362 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4363 } 4364 4365 spdk_spin_unlock(&bdev->internal.spinlock); 4366 4367 return 0; 4368 } 4369 4370 static int 4371 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4372 void *cb_ctx) 4373 { 4374 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4375 struct spdk_bdev_io *bdev_io; 4376 uint64_t buf_len; 4377 4378 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4379 if (bdev_io->internal.ch == bdev_ch) { 4380 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4381 spdk_iobuf_entry_abort(ch, entry, buf_len); 4382 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4383 } 4384 4385 return 0; 4386 } 4387 4388 /* 4389 * Abort I/O that are waiting on a data buffer. 4390 */ 4391 static void 4392 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4393 { 4394 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_all_buf_io_cb, ch); 4395 } 4396 4397 /* 4398 * Abort I/O that are queued waiting for submission. These types of I/O are 4399 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4400 */ 4401 static void 4402 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4403 { 4404 struct spdk_bdev_io *bdev_io, *tmp; 4405 4406 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4407 if (bdev_io->internal.ch == ch) { 4408 TAILQ_REMOVE(queue, bdev_io, internal.link); 4409 /* 4410 * spdk_bdev_io_complete() assumes that the completed I/O had 4411 * been submitted to the bdev module. Since in this case it 4412 * hadn't, bump io_outstanding to account for the decrement 4413 * that spdk_bdev_io_complete() will do. 
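			 * Reset I/Os are excluded below; their submit path (bdev_io_submit_reset())
			 * does not increment io_outstanding, so no compensating increment is
			 * needed for them here.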
4414 */ 4415 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4416 bdev_io_increment_outstanding(ch, ch->shared_resource); 4417 } 4418 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4419 } 4420 } 4421 } 4422 4423 static bool 4424 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4425 { 4426 struct spdk_bdev_io *bdev_io; 4427 4428 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4429 if (bdev_io == bio_to_abort) { 4430 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4431 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4432 return true; 4433 } 4434 } 4435 4436 return false; 4437 } 4438 4439 static int 4440 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4441 { 4442 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4443 uint64_t buf_len; 4444 4445 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4446 if (bdev_io == bio_to_abort) { 4447 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4448 spdk_iobuf_entry_abort(ch, entry, buf_len); 4449 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4450 return 1; 4451 } 4452 4453 return 0; 4454 } 4455 4456 static bool 4457 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4458 { 4459 int rc; 4460 4461 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_buf_io_cb, bio_to_abort); 4462 return rc == 1; 4463 } 4464 4465 static void 4466 bdev_qos_channel_destroy(void *cb_arg) 4467 { 4468 struct spdk_bdev_qos *qos = cb_arg; 4469 4470 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4471 spdk_poller_unregister(&qos->poller); 4472 4473 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4474 4475 free(qos); 4476 } 4477 4478 static int 4479 bdev_qos_destroy(struct spdk_bdev *bdev) 4480 { 4481 int i; 4482 4483 /* 4484 * Cleanly shutting down the QoS poller is tricky, because 4485 * during the asynchronous operation the user could open 4486 * a new descriptor and create a new channel, spawning 4487 * a new QoS poller. 4488 * 4489 * The strategy is to create a new QoS structure here and swap it 4490 * in. The shutdown path then continues to refer to the old one 4491 * until it completes and then releases it. 4492 */ 4493 struct spdk_bdev_qos *new_qos, *old_qos; 4494 4495 old_qos = bdev->internal.qos; 4496 4497 new_qos = calloc(1, sizeof(*new_qos)); 4498 if (!new_qos) { 4499 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4500 return -ENOMEM; 4501 } 4502 4503 /* Copy the old QoS data into the newly allocated structure */ 4504 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4505 4506 /* Zero out the key parts of the QoS structure */ 4507 new_qos->ch = NULL; 4508 new_qos->thread = NULL; 4509 new_qos->poller = NULL; 4510 /* 4511 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4512 * It will be used later for the new QoS structure. 4513 */ 4514 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4515 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4516 new_qos->rate_limits[i].min_per_timeslice = 0; 4517 new_qos->rate_limits[i].max_per_timeslice = 0; 4518 } 4519 4520 bdev->internal.qos = new_qos; 4521 4522 if (old_qos->thread == NULL) { 4523 free(old_qos); 4524 } else { 4525 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4526 } 4527 4528 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4529 * been destroyed yet. 
The destruction path will end up waiting for the final 4530 * channel to be put before it releases resources. */ 4531 4532 return 0; 4533 } 4534 4535 void 4536 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4537 { 4538 total->bytes_read += add->bytes_read; 4539 total->num_read_ops += add->num_read_ops; 4540 total->bytes_written += add->bytes_written; 4541 total->num_write_ops += add->num_write_ops; 4542 total->bytes_unmapped += add->bytes_unmapped; 4543 total->num_unmap_ops += add->num_unmap_ops; 4544 total->bytes_copied += add->bytes_copied; 4545 total->num_copy_ops += add->num_copy_ops; 4546 total->read_latency_ticks += add->read_latency_ticks; 4547 total->write_latency_ticks += add->write_latency_ticks; 4548 total->unmap_latency_ticks += add->unmap_latency_ticks; 4549 total->copy_latency_ticks += add->copy_latency_ticks; 4550 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4551 total->max_read_latency_ticks = add->max_read_latency_ticks; 4552 } 4553 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4554 total->min_read_latency_ticks = add->min_read_latency_ticks; 4555 } 4556 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4557 total->max_write_latency_ticks = add->max_write_latency_ticks; 4558 } 4559 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4560 total->min_write_latency_ticks = add->min_write_latency_ticks; 4561 } 4562 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4563 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4564 } 4565 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4566 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4567 } 4568 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4569 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4570 } 4571 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4572 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4573 } 4574 } 4575 4576 static void 4577 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4578 { 4579 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4580 4581 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4582 memcpy(to_stat->io_error, from_stat->io_error, 4583 sizeof(struct spdk_bdev_io_error_stat)); 4584 } 4585 } 4586 4587 void 4588 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4589 { 4590 if (mode == SPDK_BDEV_RESET_STAT_NONE) { 4591 return; 4592 } 4593 4594 stat->max_read_latency_ticks = 0; 4595 stat->min_read_latency_ticks = UINT64_MAX; 4596 stat->max_write_latency_ticks = 0; 4597 stat->min_write_latency_ticks = UINT64_MAX; 4598 stat->max_unmap_latency_ticks = 0; 4599 stat->min_unmap_latency_ticks = UINT64_MAX; 4600 stat->max_copy_latency_ticks = 0; 4601 stat->min_copy_latency_ticks = UINT64_MAX; 4602 4603 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4604 return; 4605 } 4606 4607 stat->bytes_read = 0; 4608 stat->num_read_ops = 0; 4609 stat->bytes_written = 0; 4610 stat->num_write_ops = 0; 4611 stat->bytes_unmapped = 0; 4612 stat->num_unmap_ops = 0; 4613 stat->bytes_copied = 0; 4614 stat->num_copy_ops = 0; 4615 stat->read_latency_ticks = 0; 4616 stat->write_latency_ticks = 0; 4617 stat->unmap_latency_ticks = 0; 4618 stat->copy_latency_ticks = 0; 4619 4620 if (stat->io_error != NULL) { 4621 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 
4622 } 4623 } 4624 4625 struct spdk_bdev_io_stat * 4626 bdev_alloc_io_stat(bool io_error_stat) 4627 { 4628 struct spdk_bdev_io_stat *stat; 4629 4630 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4631 if (stat == NULL) { 4632 return NULL; 4633 } 4634 4635 if (io_error_stat) { 4636 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4637 if (stat->io_error == NULL) { 4638 free(stat); 4639 return NULL; 4640 } 4641 } else { 4642 stat->io_error = NULL; 4643 } 4644 4645 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4646 4647 return stat; 4648 } 4649 4650 void 4651 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4652 { 4653 if (stat != NULL) { 4654 free(stat->io_error); 4655 free(stat); 4656 } 4657 } 4658 4659 void 4660 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4661 { 4662 int i; 4663 4664 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4665 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4666 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4667 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4668 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4669 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4670 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4671 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4672 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4673 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4674 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4675 stat->min_read_latency_ticks != UINT64_MAX ? 4676 stat->min_read_latency_ticks : 0); 4677 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4678 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4679 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4680 stat->min_write_latency_ticks != UINT64_MAX ? 4681 stat->min_write_latency_ticks : 0); 4682 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4683 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4684 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4685 stat->min_unmap_latency_ticks != UINT64_MAX ? 4686 stat->min_unmap_latency_ticks : 0); 4687 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4688 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4689 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4690 stat->min_copy_latency_ticks != UINT64_MAX ? 
4691 stat->min_copy_latency_ticks : 0); 4692 4693 if (stat->io_error != NULL) { 4694 spdk_json_write_named_object_begin(w, "io_error"); 4695 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4696 if (stat->io_error->error_status[i] != 0) { 4697 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4698 stat->io_error->error_status[i]); 4699 } 4700 } 4701 spdk_json_write_object_end(w); 4702 } 4703 } 4704 4705 static void 4706 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4707 { 4708 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4709 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4710 4711 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4712 bdev_abort_all_buf_io(mgmt_ch, ch); 4713 } 4714 4715 static void 4716 bdev_channel_destroy(void *io_device, void *ctx_buf) 4717 { 4718 struct spdk_bdev_channel *ch = ctx_buf; 4719 4720 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4721 spdk_get_thread()); 4722 4723 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0, 4724 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4725 4726 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4727 spdk_spin_lock(&ch->bdev->internal.spinlock); 4728 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4729 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4730 4731 bdev_channel_abort_queued_ios(ch); 4732 4733 if (ch->histogram) { 4734 spdk_histogram_data_free(ch->histogram); 4735 } 4736 4737 bdev_channel_destroy_resource(ch); 4738 } 4739 4740 /* 4741 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4742 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
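 * In other words, a duplicate name is detected atomically while g_bdev_mgr.spinlock is
 * held, and bdev_name_add() simply returns -EEXIST without a separate lookup.
 *
 * An illustrative effect at the public API level (alias string assumed), using
 * spdk_bdev_alias_add() defined further below:
 *
 *	if (spdk_bdev_alias_add(bdev, "example_alias") == -EEXIST) {
 *		SPDK_ERRLOG("name or alias already registered\n");
 *	}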
4743 */ 4744 static int 4745 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4746 { 4747 struct spdk_bdev_name *tmp; 4748 4749 bdev_name->name = strdup(name); 4750 if (bdev_name->name == NULL) { 4751 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4752 return -ENOMEM; 4753 } 4754 4755 bdev_name->bdev = bdev; 4756 4757 spdk_spin_lock(&g_bdev_mgr.spinlock); 4758 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4759 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4760 4761 if (tmp != NULL) { 4762 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4763 free(bdev_name->name); 4764 return -EEXIST; 4765 } 4766 4767 return 0; 4768 } 4769 4770 static void 4771 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4772 { 4773 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4774 free(bdev_name->name); 4775 } 4776 4777 static void 4778 bdev_name_del(struct spdk_bdev_name *bdev_name) 4779 { 4780 spdk_spin_lock(&g_bdev_mgr.spinlock); 4781 bdev_name_del_unsafe(bdev_name); 4782 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4783 } 4784 4785 int 4786 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4787 { 4788 struct spdk_bdev_alias *tmp; 4789 int ret; 4790 4791 if (alias == NULL) { 4792 SPDK_ERRLOG("Empty alias passed\n"); 4793 return -EINVAL; 4794 } 4795 4796 tmp = calloc(1, sizeof(*tmp)); 4797 if (tmp == NULL) { 4798 SPDK_ERRLOG("Unable to allocate alias\n"); 4799 return -ENOMEM; 4800 } 4801 4802 ret = bdev_name_add(&tmp->alias, bdev, alias); 4803 if (ret != 0) { 4804 free(tmp); 4805 return ret; 4806 } 4807 4808 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4809 4810 return 0; 4811 } 4812 4813 static int 4814 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4815 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4816 { 4817 struct spdk_bdev_alias *tmp; 4818 4819 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4820 if (strcmp(alias, tmp->alias.name) == 0) { 4821 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4822 alias_del_fn(&tmp->alias); 4823 free(tmp); 4824 return 0; 4825 } 4826 } 4827 4828 return -ENOENT; 4829 } 4830 4831 int 4832 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4833 { 4834 int rc; 4835 4836 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4837 if (rc == -ENOENT) { 4838 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4839 } 4840 4841 return rc; 4842 } 4843 4844 void 4845 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4846 { 4847 struct spdk_bdev_alias *p, *tmp; 4848 4849 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4850 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4851 bdev_name_del(&p->alias); 4852 free(p); 4853 } 4854 } 4855 4856 struct spdk_io_channel * 4857 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4858 { 4859 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4860 } 4861 4862 void * 4863 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4864 { 4865 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4866 void *ctx = NULL; 4867 4868 if (bdev->fn_table->get_module_ctx) { 4869 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4870 } 4871 4872 return ctx; 4873 } 4874 4875 const char * 4876 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4877 { 4878 return bdev->module->name; 4879 } 4880 4881 const char * 4882 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4883 { 4884 return bdev->name; 4885 } 4886 4887 const char * 4888 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4889 { 4890 return bdev->product_name; 4891 } 4892 4893 
const struct spdk_bdev_aliases_list * 4894 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4895 { 4896 return &bdev->aliases; 4897 } 4898 4899 uint32_t 4900 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4901 { 4902 return bdev->blocklen; 4903 } 4904 4905 uint32_t 4906 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4907 { 4908 return bdev->write_unit_size; 4909 } 4910 4911 uint64_t 4912 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4913 { 4914 return bdev->blockcnt; 4915 } 4916 4917 const char * 4918 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4919 { 4920 return qos_rpc_type[type]; 4921 } 4922 4923 void 4924 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4925 { 4926 int i; 4927 4928 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4929 4930 spdk_spin_lock(&bdev->internal.spinlock); 4931 if (bdev->internal.qos) { 4932 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4933 if (bdev->internal.qos->rate_limits[i].limit != 4934 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4935 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4936 if (bdev_qos_is_iops_rate_limit(i) == false) { 4937 /* Change from Byte to Megabyte which is user visible. */ 4938 limits[i] = limits[i] / 1024 / 1024; 4939 } 4940 } 4941 } 4942 } 4943 spdk_spin_unlock(&bdev->internal.spinlock); 4944 } 4945 4946 size_t 4947 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4948 { 4949 return 1 << bdev->required_alignment; 4950 } 4951 4952 uint32_t 4953 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4954 { 4955 return bdev->optimal_io_boundary; 4956 } 4957 4958 bool 4959 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4960 { 4961 return bdev->write_cache; 4962 } 4963 4964 const struct spdk_uuid * 4965 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4966 { 4967 return &bdev->uuid; 4968 } 4969 4970 uint16_t 4971 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4972 { 4973 return bdev->acwu; 4974 } 4975 4976 uint32_t 4977 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4978 { 4979 return bdev->md_len; 4980 } 4981 4982 bool 4983 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4984 { 4985 return (bdev->md_len != 0) && bdev->md_interleave; 4986 } 4987 4988 bool 4989 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4990 { 4991 return (bdev->md_len != 0) && !bdev->md_interleave; 4992 } 4993 4994 bool 4995 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4996 { 4997 return bdev->zoned; 4998 } 4999 5000 uint32_t 5001 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 5002 { 5003 if (spdk_bdev_is_md_interleaved(bdev)) { 5004 return bdev->blocklen - bdev->md_len; 5005 } else { 5006 return bdev->blocklen; 5007 } 5008 } 5009 5010 uint32_t 5011 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 5012 { 5013 return bdev->phys_blocklen; 5014 } 5015 5016 static uint32_t 5017 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 5018 { 5019 if (!spdk_bdev_is_md_interleaved(bdev)) { 5020 return bdev->blocklen + bdev->md_len; 5021 } else { 5022 return bdev->blocklen; 5023 } 5024 } 5025 5026 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 5027 typedef enum spdk_dif_type spdk_dif_type_t; 5028 typedef enum spdk_dif_pi_format spdk_dif_pi_format_t; 5029 5030 spdk_dif_type_t 5031 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 5032 { 5033 if (bdev->md_len != 0) { 5034 return bdev->dif_type; 5035 } else { 5036 return SPDK_DIF_DISABLE; 5037 } 5038 } 5039 5040 spdk_dif_pi_format_t 5041 spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev) 5042 { 5043 return bdev->dif_pi_format; 5044 } 5045 5046 bool 5047 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 5048 { 5049 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 5050 return bdev->dif_is_head_of_md; 5051 } else { 5052 return false; 5053 } 5054 } 5055 5056 bool 5057 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 5058 enum spdk_dif_check_type check_type) 5059 { 5060 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 5061 return false; 5062 } 5063 5064 switch (check_type) { 5065 case SPDK_DIF_CHECK_TYPE_REFTAG: 5066 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 5067 case SPDK_DIF_CHECK_TYPE_APPTAG: 5068 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 5069 case SPDK_DIF_CHECK_TYPE_GUARD: 5070 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 5071 default: 5072 return false; 5073 } 5074 } 5075 5076 static uint32_t 5077 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 5078 { 5079 uint64_t aligned_length, max_write_blocks; 5080 5081 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 5082 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 5083 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 5084 5085 return max_write_blocks; 5086 } 5087 5088 uint32_t 5089 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 5090 { 5091 return bdev->max_copy; 5092 } 5093 5094 uint64_t 5095 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 5096 { 5097 return bdev->internal.measured_queue_depth; 5098 } 5099 5100 uint64_t 5101 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 5102 { 5103 return bdev->internal.period; 5104 } 5105 5106 uint64_t 5107 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 5108 { 5109 return bdev->internal.weighted_io_time; 5110 } 5111 5112 uint64_t 5113 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 5114 { 5115 return bdev->internal.io_time; 5116 } 5117 5118 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev) 5119 { 5120 return bdev->ctratt; 5121 } 5122 5123 uint32_t 5124 spdk_bdev_get_nvme_nsid(struct spdk_bdev *bdev) 5125 { 5126 return bdev->nsid; 5127 } 5128 5129 uint32_t 5130 spdk_bdev_desc_get_block_size(struct spdk_bdev_desc *desc) 5131 { 5132 struct spdk_bdev *bdev = desc->bdev; 5133 5134 return desc->opts.hide_metadata ? bdev->blocklen - bdev->md_len : bdev->blocklen; 5135 } 5136 5137 uint32_t 5138 spdk_bdev_desc_get_md_size(struct spdk_bdev_desc *desc) 5139 { 5140 struct spdk_bdev *bdev = desc->bdev; 5141 5142 return desc->opts.hide_metadata ? 0 : bdev->md_len; 5143 } 5144 5145 bool 5146 spdk_bdev_desc_is_md_interleaved(struct spdk_bdev_desc *desc) 5147 { 5148 struct spdk_bdev *bdev = desc->bdev; 5149 5150 return desc->opts.hide_metadata ? false : spdk_bdev_is_md_interleaved(bdev); 5151 } 5152 5153 bool 5154 spdk_bdev_desc_is_md_separate(struct spdk_bdev_desc *desc) 5155 { 5156 struct spdk_bdev *bdev = desc->bdev; 5157 5158 return desc->opts.hide_metadata ? 
false : spdk_bdev_is_md_separate(bdev); 5159 } 5160 5161 spdk_dif_type_t 5162 spdk_bdev_desc_get_dif_type(struct spdk_bdev_desc *desc) 5163 { 5164 struct spdk_bdev *bdev = desc->bdev; 5165 5166 return desc->opts.hide_metadata ? SPDK_DIF_DISABLE : spdk_bdev_get_dif_type(bdev); 5167 } 5168 5169 spdk_dif_pi_format_t 5170 spdk_bdev_desc_get_dif_pi_format(struct spdk_bdev_desc *desc) 5171 { 5172 struct spdk_bdev *bdev = desc->bdev; 5173 5174 return desc->opts.hide_metadata ? SPDK_DIF_PI_FORMAT_16 : spdk_bdev_get_dif_pi_format(bdev); 5175 } 5176 5177 bool 5178 spdk_bdev_desc_is_dif_head_of_md(struct spdk_bdev_desc *desc) 5179 { 5180 struct spdk_bdev *bdev = desc->bdev; 5181 5182 return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_head_of_md(bdev); 5183 } 5184 5185 bool 5186 spdk_bdev_desc_is_dif_check_enabled(struct spdk_bdev_desc *desc, 5187 enum spdk_dif_check_type check_type) 5188 { 5189 struct spdk_bdev *bdev = desc->bdev; 5190 5191 return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_check_enabled(bdev, check_type); 5192 } 5193 5194 static void bdev_update_qd_sampling_period(void *ctx); 5195 5196 static void 5197 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 5198 { 5199 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 5200 5201 if (bdev->internal.measured_queue_depth) { 5202 bdev->internal.io_time += bdev->internal.period; 5203 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 5204 } 5205 5206 bdev->internal.qd_poll_in_progress = false; 5207 5208 bdev_update_qd_sampling_period(bdev); 5209 } 5210 5211 static void 5212 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5213 struct spdk_io_channel *io_ch, void *_ctx) 5214 { 5215 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 5216 5217 bdev->internal.temporary_queue_depth += ch->io_outstanding; 5218 spdk_bdev_for_each_channel_continue(i, 0); 5219 } 5220 5221 static int 5222 bdev_calculate_measured_queue_depth(void *ctx) 5223 { 5224 struct spdk_bdev *bdev = ctx; 5225 5226 bdev->internal.qd_poll_in_progress = true; 5227 bdev->internal.temporary_queue_depth = 0; 5228 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 5229 return SPDK_POLLER_BUSY; 5230 } 5231 5232 static void 5233 bdev_update_qd_sampling_period(void *ctx) 5234 { 5235 struct spdk_bdev *bdev = ctx; 5236 5237 if (bdev->internal.period == bdev->internal.new_period) { 5238 return; 5239 } 5240 5241 if (bdev->internal.qd_poll_in_progress) { 5242 return; 5243 } 5244 5245 bdev->internal.period = bdev->internal.new_period; 5246 5247 spdk_poller_unregister(&bdev->internal.qd_poller); 5248 if (bdev->internal.period != 0) { 5249 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5250 bdev, bdev->internal.period); 5251 } else { 5252 spdk_bdev_close(bdev->internal.qd_desc); 5253 bdev->internal.qd_desc = NULL; 5254 } 5255 } 5256 5257 static void 5258 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5259 { 5260 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 5261 } 5262 5263 void 5264 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 5265 { 5266 int rc; 5267 5268 if (bdev->internal.new_period == period) { 5269 return; 5270 } 5271 5272 bdev->internal.new_period = period; 5273 5274 if (bdev->internal.qd_desc != NULL) { 5275 assert(bdev->internal.period != 0); 5276 5277 
spdk_thread_send_msg(bdev->internal.qd_desc->thread, 5278 bdev_update_qd_sampling_period, bdev); 5279 return; 5280 } 5281 5282 assert(bdev->internal.period == 0); 5283 5284 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 5285 NULL, &bdev->internal.qd_desc); 5286 if (rc != 0) { 5287 return; 5288 } 5289 5290 bdev->internal.period = period; 5291 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5292 bdev, period); 5293 } 5294 5295 struct bdev_get_current_qd_ctx { 5296 uint64_t current_qd; 5297 spdk_bdev_get_current_qd_cb cb_fn; 5298 void *cb_arg; 5299 }; 5300 5301 static void 5302 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5303 { 5304 struct bdev_get_current_qd_ctx *ctx = _ctx; 5305 5306 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5307 5308 free(ctx); 5309 } 5310 5311 static void 5312 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5313 struct spdk_io_channel *io_ch, void *_ctx) 5314 { 5315 struct bdev_get_current_qd_ctx *ctx = _ctx; 5316 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5317 5318 ctx->current_qd += bdev_ch->io_outstanding; 5319 5320 spdk_bdev_for_each_channel_continue(i, 0); 5321 } 5322 5323 void 5324 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5325 void *cb_arg) 5326 { 5327 struct bdev_get_current_qd_ctx *ctx; 5328 5329 assert(cb_fn != NULL); 5330 5331 ctx = calloc(1, sizeof(*ctx)); 5332 if (ctx == NULL) { 5333 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5334 return; 5335 } 5336 5337 ctx->cb_fn = cb_fn; 5338 ctx->cb_arg = cb_arg; 5339 5340 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5341 } 5342 5343 static void 5344 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5345 { 5346 assert(desc->thread == spdk_get_thread()); 5347 5348 spdk_spin_lock(&desc->spinlock); 5349 desc->refs--; 5350 if (!desc->closed) { 5351 spdk_spin_unlock(&desc->spinlock); 5352 desc->callback.event_fn(type, 5353 desc->bdev, 5354 desc->callback.ctx); 5355 return; 5356 } else if (desc->refs == 0) { 5357 /* This descriptor was closed after this event_notify message was sent. 5358 * spdk_bdev_close() could not free the descriptor since this message was 5359 * in flight, so we free it now using bdev_desc_free(). 
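		 * The reference dropped at the top of this function was taken by
		 * event_notify() below, before the message was sent to the descriptor's
		 * thread.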
5360 */ 5361 spdk_spin_unlock(&desc->spinlock); 5362 bdev_desc_free(desc); 5363 return; 5364 } 5365 spdk_spin_unlock(&desc->spinlock); 5366 } 5367 5368 static void 5369 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5370 { 5371 spdk_spin_lock(&desc->spinlock); 5372 desc->refs++; 5373 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5374 spdk_spin_unlock(&desc->spinlock); 5375 } 5376 5377 static void 5378 _resize_notify(void *ctx) 5379 { 5380 struct spdk_bdev_desc *desc = ctx; 5381 5382 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5383 } 5384 5385 int 5386 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5387 { 5388 struct spdk_bdev_desc *desc; 5389 int ret; 5390 5391 if (size == bdev->blockcnt) { 5392 return 0; 5393 } 5394 5395 spdk_spin_lock(&bdev->internal.spinlock); 5396 5397 /* bdev has open descriptors */ 5398 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5399 bdev->blockcnt > size) { 5400 ret = -EBUSY; 5401 } else { 5402 bdev->blockcnt = size; 5403 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5404 event_notify(desc, _resize_notify); 5405 } 5406 ret = 0; 5407 } 5408 5409 spdk_spin_unlock(&bdev->internal.spinlock); 5410 5411 return ret; 5412 } 5413 5414 /* 5415 * Convert I/O offset and length from bytes to blocks. 5416 * 5417 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5418 */ 5419 static uint64_t 5420 bdev_bytes_to_blocks(struct spdk_bdev_desc *desc, uint64_t offset_bytes, 5421 uint64_t *offset_blocks, uint64_t num_bytes, uint64_t *num_blocks) 5422 { 5423 uint32_t block_size = bdev_desc_get_block_size(desc); 5424 uint8_t shift_cnt; 5425 5426 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
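	 * For example (values assumed), block_size = 4096 gives shift_cnt = 12, so
	 * offset_bytes = 8192 becomes offset_blocks = 2. The OR of the reconstructed
	 * remainders below is non-zero only when offset_bytes or num_bytes is not a
	 * multiple of the block size, which callers such as spdk_bdev_read() turn into
	 * -EINVAL.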
*/ 5427 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5428 shift_cnt = spdk_u32log2(block_size); 5429 *offset_blocks = offset_bytes >> shift_cnt; 5430 *num_blocks = num_bytes >> shift_cnt; 5431 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5432 (num_bytes - (*num_blocks << shift_cnt)); 5433 } else { 5434 *offset_blocks = offset_bytes / block_size; 5435 *num_blocks = num_bytes / block_size; 5436 return (offset_bytes % block_size) | (num_bytes % block_size); 5437 } 5438 } 5439 5440 static bool 5441 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5442 { 5443 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5444 * has been an overflow and hence the offset has been wrapped around */ 5445 if (offset_blocks + num_blocks < offset_blocks) { 5446 return false; 5447 } 5448 5449 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5450 if (offset_blocks + num_blocks > bdev->blockcnt) { 5451 return false; 5452 } 5453 5454 return true; 5455 } 5456 5457 static void 5458 bdev_seek_complete_cb(void *ctx) 5459 { 5460 struct spdk_bdev_io *bdev_io = ctx; 5461 5462 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5463 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5464 } 5465 5466 static int 5467 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5468 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5469 spdk_bdev_io_completion_cb cb, void *cb_arg) 5470 { 5471 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5472 struct spdk_bdev_io *bdev_io; 5473 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5474 5475 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5476 5477 /* Check if offset_blocks is valid looking at the validity of one block */ 5478 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5479 return -EINVAL; 5480 } 5481 5482 bdev_io = bdev_channel_get_io(channel); 5483 if (!bdev_io) { 5484 return -ENOMEM; 5485 } 5486 5487 bdev_io->internal.ch = channel; 5488 bdev_io->internal.desc = desc; 5489 bdev_io->type = io_type; 5490 bdev_io->u.bdev.offset_blocks = offset_blocks; 5491 bdev_io->u.bdev.memory_domain = NULL; 5492 bdev_io->u.bdev.memory_domain_ctx = NULL; 5493 bdev_io->u.bdev.accel_sequence = NULL; 5494 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5495 5496 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5497 /* In case bdev doesn't support seek to next data/hole offset, 5498 * it is assumed that only data and no holes are present */ 5499 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5500 bdev_io->u.bdev.seek.offset = offset_blocks; 5501 } else { 5502 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5503 } 5504 5505 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5506 return 0; 5507 } 5508 5509 bdev_io_submit(bdev_io); 5510 return 0; 5511 } 5512 5513 int 5514 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5515 uint64_t offset_blocks, 5516 spdk_bdev_io_completion_cb cb, void *cb_arg) 5517 { 5518 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5519 } 5520 5521 int 5522 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5523 uint64_t offset_blocks, 5524 spdk_bdev_io_completion_cb cb, void *cb_arg) 5525 { 5526 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5527 } 5528 5529 uint64_t 5530 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 5531 { 5532 return bdev_io->u.bdev.seek.offset; 5533 } 5534 5535 static int 5536 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5537 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5538 spdk_bdev_io_completion_cb cb, void *cb_arg) 5539 { 5540 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5541 struct spdk_bdev_io *bdev_io; 5542 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5543 5544 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5545 return -EINVAL; 5546 } 5547 5548 bdev_io = bdev_channel_get_io(channel); 5549 if (!bdev_io) { 5550 return -ENOMEM; 5551 } 5552 5553 bdev_io->internal.ch = channel; 5554 bdev_io->internal.desc = desc; 5555 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5556 bdev_io->u.bdev.iovs = &bdev_io->iov; 5557 bdev_io->u.bdev.iovs[0].iov_base = buf; 5558 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc); 5559 bdev_io->u.bdev.iovcnt = 1; 5560 bdev_io->u.bdev.md_buf = md_buf; 5561 bdev_io->u.bdev.num_blocks = num_blocks; 5562 bdev_io->u.bdev.offset_blocks = offset_blocks; 5563 bdev_io->u.bdev.memory_domain = NULL; 5564 bdev_io->u.bdev.memory_domain_ctx = NULL; 5565 bdev_io->u.bdev.accel_sequence = NULL; 5566 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5567 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5568 5569 bdev_io_submit(bdev_io); 5570 return 0; 5571 } 5572 5573 int 5574 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5575 void *buf, uint64_t offset, uint64_t nbytes, 5576 spdk_bdev_io_completion_cb cb, void *cb_arg) 5577 { 5578 uint64_t offset_blocks, num_blocks; 5579 5580 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5581 return -EINVAL; 5582 } 5583 5584 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5585 } 5586 5587 int 5588 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5589 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5590 spdk_bdev_io_completion_cb cb, void *cb_arg) 5591 { 5592 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5593 } 5594 5595 int 5596 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5597 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5598 spdk_bdev_io_completion_cb cb, void *cb_arg) 5599 { 5600 struct iovec iov = { 5601 .iov_base = buf, 5602 }; 5603 5604 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5605 return -EINVAL; 5606 } 5607 5608 if (md_buf && !_is_buf_allocated(&iov)) { 5609 return -EINVAL; 5610 } 5611 5612 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5613 cb, cb_arg); 5614 } 5615 5616 int 5617 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5618 struct iovec *iov, int iovcnt, 5619 uint64_t offset, uint64_t nbytes, 5620 spdk_bdev_io_completion_cb cb, void *cb_arg) 5621 { 5622 uint64_t offset_blocks, num_blocks; 5623 5624 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5625 return -EINVAL; 5626 } 5627 5628 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5629 } 5630 5631 static int 5632 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5633 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5634 uint64_t num_blocks, struct 
spdk_memory_domain *domain, void *domain_ctx, 5635 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5636 spdk_bdev_io_completion_cb cb, void *cb_arg) 5637 { 5638 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5639 struct spdk_bdev_io *bdev_io; 5640 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5641 5642 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5643 return -EINVAL; 5644 } 5645 5646 bdev_io = bdev_channel_get_io(channel); 5647 if (spdk_unlikely(!bdev_io)) { 5648 return -ENOMEM; 5649 } 5650 5651 bdev_io->internal.ch = channel; 5652 bdev_io->internal.desc = desc; 5653 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5654 bdev_io->u.bdev.iovs = iov; 5655 bdev_io->u.bdev.iovcnt = iovcnt; 5656 bdev_io->u.bdev.md_buf = md_buf; 5657 bdev_io->u.bdev.num_blocks = num_blocks; 5658 bdev_io->u.bdev.offset_blocks = offset_blocks; 5659 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5660 5661 if (seq != NULL) { 5662 bdev_io->internal.f.has_accel_sequence = true; 5663 bdev_io->internal.accel_sequence = seq; 5664 } 5665 5666 if (domain != NULL) { 5667 bdev_io->internal.f.has_memory_domain = true; 5668 bdev_io->internal.memory_domain = domain; 5669 bdev_io->internal.memory_domain_ctx = domain_ctx; 5670 } 5671 5672 bdev_io->u.bdev.memory_domain = domain; 5673 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5674 bdev_io->u.bdev.accel_sequence = seq; 5675 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5676 5677 _bdev_io_submit_ext(desc, bdev_io); 5678 5679 return 0; 5680 } 5681 5682 int 5683 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5684 struct iovec *iov, int iovcnt, 5685 uint64_t offset_blocks, uint64_t num_blocks, 5686 spdk_bdev_io_completion_cb cb, void *cb_arg) 5687 { 5688 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5689 5690 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5691 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5692 } 5693 5694 int 5695 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5696 struct iovec *iov, int iovcnt, void *md_buf, 5697 uint64_t offset_blocks, uint64_t num_blocks, 5698 spdk_bdev_io_completion_cb cb, void *cb_arg) 5699 { 5700 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5701 5702 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5703 return -EINVAL; 5704 } 5705 5706 if (md_buf && !_is_buf_allocated(iov)) { 5707 return -EINVAL; 5708 } 5709 5710 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5711 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5712 } 5713 5714 static inline bool 5715 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5716 { 5717 /* 5718 * We check that the opts size is at least as large as spdk_bdev_ext_io_opts 5719 * was when it was first introduced (ac6f2bdd8d), since access to those 5720 * members is not checked internally.
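 *
 * Illustrative caller-side sketch (the io_ch, iov and callback names are
 * assumptions, not part of this file): an application using the _ext API is
 * expected to zero the structure and set its size before submitting:
 *
 *   struct spdk_bdev_ext_io_opts opts = {};
 *
 *   opts.size = sizeof(opts);
 *   opts.memory_domain = NULL;  // plain host memory, no accel sequence
 *   rc = spdk_bdev_readv_blocks_ext(desc, io_ch, iov, iovcnt,
 *                                   offset_blocks, num_blocks,
 *                                   read_done_cb, NULL, &opts);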
5721 */ 5722 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5723 sizeof(opts->metadata) && 5724 opts->size <= sizeof(*opts) && 5725 /* When memory domain is used, the user must provide data buffers */ 5726 (!opts->memory_domain || (iov && iov[0].iov_base)); 5727 } 5728 5729 int 5730 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5731 struct iovec *iov, int iovcnt, 5732 uint64_t offset_blocks, uint64_t num_blocks, 5733 spdk_bdev_io_completion_cb cb, void *cb_arg, 5734 struct spdk_bdev_ext_io_opts *opts) 5735 { 5736 struct spdk_memory_domain *domain = NULL; 5737 struct spdk_accel_sequence *seq = NULL; 5738 void *domain_ctx = NULL, *md = NULL; 5739 uint32_t dif_check_flags = 0; 5740 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5741 5742 if (opts) { 5743 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5744 return -EINVAL; 5745 } 5746 5747 md = opts->metadata; 5748 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5749 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5750 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5751 if (md) { 5752 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5753 return -EINVAL; 5754 } 5755 5756 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5757 return -EINVAL; 5758 } 5759 5760 if (spdk_unlikely(seq != NULL)) { 5761 return -EINVAL; 5762 } 5763 } 5764 } 5765 5766 dif_check_flags = bdev->dif_check_flags & 5767 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5768 5769 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5770 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5771 } 5772 5773 static int 5774 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5775 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5776 spdk_bdev_io_completion_cb cb, void *cb_arg) 5777 { 5778 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5779 struct spdk_bdev_io *bdev_io; 5780 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5781 5782 if (!desc->write) { 5783 return -EBADF; 5784 } 5785 5786 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5787 return -EINVAL; 5788 } 5789 5790 bdev_io = bdev_channel_get_io(channel); 5791 if (!bdev_io) { 5792 return -ENOMEM; 5793 } 5794 5795 bdev_io->internal.ch = channel; 5796 bdev_io->internal.desc = desc; 5797 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5798 bdev_io->u.bdev.iovs = &bdev_io->iov; 5799 bdev_io->u.bdev.iovs[0].iov_base = buf; 5800 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc); 5801 bdev_io->u.bdev.iovcnt = 1; 5802 bdev_io->u.bdev.md_buf = md_buf; 5803 bdev_io->u.bdev.num_blocks = num_blocks; 5804 bdev_io->u.bdev.offset_blocks = offset_blocks; 5805 bdev_io->u.bdev.memory_domain = NULL; 5806 bdev_io->u.bdev.memory_domain_ctx = NULL; 5807 bdev_io->u.bdev.accel_sequence = NULL; 5808 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5809 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5810 5811 bdev_io_submit(bdev_io); 5812 return 0; 5813 } 5814 5815 int 5816 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5817 void *buf, uint64_t offset, uint64_t nbytes, 5818 spdk_bdev_io_completion_cb cb, void *cb_arg) 5819 { 5820 uint64_t offset_blocks, num_blocks; 5821 5822 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5823 return -EINVAL; 5824 } 5825 5826 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, 
num_blocks, cb, cb_arg); 5827 } 5828 5829 int 5830 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5831 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5832 spdk_bdev_io_completion_cb cb, void *cb_arg) 5833 { 5834 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5835 cb, cb_arg); 5836 } 5837 5838 int 5839 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5840 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5841 spdk_bdev_io_completion_cb cb, void *cb_arg) 5842 { 5843 struct iovec iov = { 5844 .iov_base = buf, 5845 }; 5846 5847 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5848 return -EINVAL; 5849 } 5850 5851 if (md_buf && !_is_buf_allocated(&iov)) { 5852 return -EINVAL; 5853 } 5854 5855 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5856 cb, cb_arg); 5857 } 5858 5859 static int 5860 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5861 struct iovec *iov, int iovcnt, void *md_buf, 5862 uint64_t offset_blocks, uint64_t num_blocks, 5863 struct spdk_memory_domain *domain, void *domain_ctx, 5864 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5865 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 5866 spdk_bdev_io_completion_cb cb, void *cb_arg) 5867 { 5868 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5869 struct spdk_bdev_io *bdev_io; 5870 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5871 5872 if (spdk_unlikely(!desc->write)) { 5873 return -EBADF; 5874 } 5875 5876 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5877 return -EINVAL; 5878 } 5879 5880 bdev_io = bdev_channel_get_io(channel); 5881 if (spdk_unlikely(!bdev_io)) { 5882 return -ENOMEM; 5883 } 5884 5885 bdev_io->internal.ch = channel; 5886 bdev_io->internal.desc = desc; 5887 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5888 bdev_io->u.bdev.iovs = iov; 5889 bdev_io->u.bdev.iovcnt = iovcnt; 5890 bdev_io->u.bdev.md_buf = md_buf; 5891 bdev_io->u.bdev.num_blocks = num_blocks; 5892 bdev_io->u.bdev.offset_blocks = offset_blocks; 5893 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5894 if (seq != NULL) { 5895 bdev_io->internal.f.has_accel_sequence = true; 5896 bdev_io->internal.accel_sequence = seq; 5897 } 5898 5899 if (domain != NULL) { 5900 bdev_io->internal.f.has_memory_domain = true; 5901 bdev_io->internal.memory_domain = domain; 5902 bdev_io->internal.memory_domain_ctx = domain_ctx; 5903 } 5904 5905 bdev_io->u.bdev.memory_domain = domain; 5906 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5907 bdev_io->u.bdev.accel_sequence = seq; 5908 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5909 bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw; 5910 bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw; 5911 5912 _bdev_io_submit_ext(desc, bdev_io); 5913 5914 return 0; 5915 } 5916 5917 int 5918 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5919 struct iovec *iov, int iovcnt, 5920 uint64_t offset, uint64_t len, 5921 spdk_bdev_io_completion_cb cb, void *cb_arg) 5922 { 5923 uint64_t offset_blocks, num_blocks; 5924 5925 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) { 5926 return -EINVAL; 5927 } 5928 5929 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5930 } 5931 5932 int 5933 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5934 
struct iovec *iov, int iovcnt, 5935 uint64_t offset_blocks, uint64_t num_blocks, 5936 spdk_bdev_io_completion_cb cb, void *cb_arg) 5937 { 5938 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5939 5940 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5941 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5942 cb, cb_arg); 5943 } 5944 5945 int 5946 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5947 struct iovec *iov, int iovcnt, void *md_buf, 5948 uint64_t offset_blocks, uint64_t num_blocks, 5949 spdk_bdev_io_completion_cb cb, void *cb_arg) 5950 { 5951 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5952 5953 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5954 return -EINVAL; 5955 } 5956 5957 if (md_buf && !_is_buf_allocated(iov)) { 5958 return -EINVAL; 5959 } 5960 5961 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5962 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5963 cb, cb_arg); 5964 } 5965 5966 int 5967 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5968 struct iovec *iov, int iovcnt, 5969 uint64_t offset_blocks, uint64_t num_blocks, 5970 spdk_bdev_io_completion_cb cb, void *cb_arg, 5971 struct spdk_bdev_ext_io_opts *opts) 5972 { 5973 struct spdk_memory_domain *domain = NULL; 5974 struct spdk_accel_sequence *seq = NULL; 5975 void *domain_ctx = NULL, *md = NULL; 5976 uint32_t dif_check_flags = 0; 5977 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5978 uint32_t nvme_cdw12_raw = 0; 5979 uint32_t nvme_cdw13_raw = 0; 5980 5981 if (opts) { 5982 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5983 return -EINVAL; 5984 } 5985 md = opts->metadata; 5986 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5987 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5988 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5989 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 5990 nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0); 5991 if (md) { 5992 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5993 return -EINVAL; 5994 } 5995 5996 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5997 return -EINVAL; 5998 } 5999 6000 if (spdk_unlikely(seq != NULL)) { 6001 return -EINVAL; 6002 } 6003 } 6004 } 6005 6006 dif_check_flags = bdev->dif_check_flags & 6007 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 6008 6009 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 6010 domain, domain_ctx, seq, dif_check_flags, 6011 nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg); 6012 } 6013 6014 static void 6015 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6016 { 6017 struct spdk_bdev_io *parent_io = cb_arg; 6018 struct spdk_bdev *bdev = parent_io->bdev; 6019 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 6020 int i, rc = 0; 6021 6022 if (!success) { 6023 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6024 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6025 spdk_bdev_free_io(bdev_io); 6026 return; 6027 } 6028 6029 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 6030 rc = memcmp(read_buf, 6031 parent_io->u.bdev.iovs[i].iov_base, 6032 parent_io->u.bdev.iovs[i].iov_len); 6033 if (rc) { 6034 break; 6035 } 6036 read_buf += parent_io->u.bdev.iovs[i].iov_len; 6037 } 6038 6039 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 6040 
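		/* Data buffers matched; for a bdev with separate metadata, also compare
		 * the metadata returned by the read against the metadata the caller
		 * supplied for the compare. */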
rc = memcmp(bdev_io->u.bdev.md_buf, 6041 parent_io->u.bdev.md_buf, 6042 spdk_bdev_get_md_size(bdev)); 6043 } 6044 6045 spdk_bdev_free_io(bdev_io); 6046 6047 if (rc == 0) { 6048 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6049 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 6050 } else { 6051 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 6052 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6053 } 6054 } 6055 6056 static void 6057 bdev_compare_do_read(void *_bdev_io) 6058 { 6059 struct spdk_bdev_io *bdev_io = _bdev_io; 6060 int rc; 6061 6062 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 6063 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 6064 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6065 bdev_compare_do_read_done, bdev_io); 6066 6067 if (rc == -ENOMEM) { 6068 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 6069 } else if (rc != 0) { 6070 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6071 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6072 } 6073 } 6074 6075 static int 6076 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6077 struct iovec *iov, int iovcnt, void *md_buf, 6078 uint64_t offset_blocks, uint64_t num_blocks, 6079 spdk_bdev_io_completion_cb cb, void *cb_arg) 6080 { 6081 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6082 struct spdk_bdev_io *bdev_io; 6083 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6084 6085 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6086 return -EINVAL; 6087 } 6088 6089 bdev_io = bdev_channel_get_io(channel); 6090 if (!bdev_io) { 6091 return -ENOMEM; 6092 } 6093 6094 bdev_io->internal.ch = channel; 6095 bdev_io->internal.desc = desc; 6096 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6097 bdev_io->u.bdev.iovs = iov; 6098 bdev_io->u.bdev.iovcnt = iovcnt; 6099 bdev_io->u.bdev.md_buf = md_buf; 6100 bdev_io->u.bdev.num_blocks = num_blocks; 6101 bdev_io->u.bdev.offset_blocks = offset_blocks; 6102 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6103 bdev_io->u.bdev.memory_domain = NULL; 6104 bdev_io->u.bdev.memory_domain_ctx = NULL; 6105 bdev_io->u.bdev.accel_sequence = NULL; 6106 6107 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6108 bdev_io_submit(bdev_io); 6109 return 0; 6110 } 6111 6112 bdev_compare_do_read(bdev_io); 6113 6114 return 0; 6115 } 6116 6117 int 6118 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6119 struct iovec *iov, int iovcnt, 6120 uint64_t offset_blocks, uint64_t num_blocks, 6121 spdk_bdev_io_completion_cb cb, void *cb_arg) 6122 { 6123 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 6124 num_blocks, cb, cb_arg); 6125 } 6126 6127 int 6128 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6129 struct iovec *iov, int iovcnt, void *md_buf, 6130 uint64_t offset_blocks, uint64_t num_blocks, 6131 spdk_bdev_io_completion_cb cb, void *cb_arg) 6132 { 6133 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6134 return -EINVAL; 6135 } 6136 6137 if (md_buf && !_is_buf_allocated(iov)) { 6138 return -EINVAL; 6139 } 6140 6141 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 6142 num_blocks, cb, cb_arg); 6143 } 6144 6145 static int 6146 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6147 void *buf, void *md_buf, uint64_t 
offset_blocks, uint64_t num_blocks, 6148 spdk_bdev_io_completion_cb cb, void *cb_arg) 6149 { 6150 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6151 struct spdk_bdev_io *bdev_io; 6152 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6153 6154 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6155 return -EINVAL; 6156 } 6157 6158 bdev_io = bdev_channel_get_io(channel); 6159 if (!bdev_io) { 6160 return -ENOMEM; 6161 } 6162 6163 bdev_io->internal.ch = channel; 6164 bdev_io->internal.desc = desc; 6165 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6166 bdev_io->u.bdev.iovs = &bdev_io->iov; 6167 bdev_io->u.bdev.iovs[0].iov_base = buf; 6168 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc); 6169 bdev_io->u.bdev.iovcnt = 1; 6170 bdev_io->u.bdev.md_buf = md_buf; 6171 bdev_io->u.bdev.num_blocks = num_blocks; 6172 bdev_io->u.bdev.offset_blocks = offset_blocks; 6173 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6174 bdev_io->u.bdev.memory_domain = NULL; 6175 bdev_io->u.bdev.memory_domain_ctx = NULL; 6176 bdev_io->u.bdev.accel_sequence = NULL; 6177 6178 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6179 bdev_io_submit(bdev_io); 6180 return 0; 6181 } 6182 6183 bdev_compare_do_read(bdev_io); 6184 6185 return 0; 6186 } 6187 6188 int 6189 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6190 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6191 spdk_bdev_io_completion_cb cb, void *cb_arg) 6192 { 6193 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6194 cb, cb_arg); 6195 } 6196 6197 int 6198 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6199 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6200 spdk_bdev_io_completion_cb cb, void *cb_arg) 6201 { 6202 struct iovec iov = { 6203 .iov_base = buf, 6204 }; 6205 6206 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6207 return -EINVAL; 6208 } 6209 6210 if (md_buf && !_is_buf_allocated(&iov)) { 6211 return -EINVAL; 6212 } 6213 6214 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6215 cb, cb_arg); 6216 } 6217 6218 static void 6219 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 6220 { 6221 struct spdk_bdev_io *bdev_io = ctx; 6222 6223 if (unlock_status) { 6224 SPDK_ERRLOG("LBA range unlock failed\n"); 6225 } 6226 6227 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 6228 false, bdev_io->internal.caller_ctx); 6229 } 6230 6231 static void 6232 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 6233 { 6234 bdev_io->internal.status = status; 6235 6236 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 6237 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6238 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 6239 } 6240 6241 static void 6242 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6243 { 6244 struct spdk_bdev_io *parent_io = cb_arg; 6245 6246 if (!success) { 6247 SPDK_ERRLOG("Compare and write operation failed\n"); 6248 } 6249 6250 spdk_bdev_free_io(bdev_io); 6251 6252 bdev_comparev_and_writev_blocks_unlock(parent_io, 6253 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 6254 } 6255 6256 static void 6257 bdev_compare_and_write_do_write(void *_bdev_io) 6258 { 6259 struct spdk_bdev_io *bdev_io = _bdev_io; 6260 int rc; 6261 6262 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 6263 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6264 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 6265 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6266 bdev_compare_and_write_do_write_done, bdev_io); 6267 6268 6269 if (rc == -ENOMEM) { 6270 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 6271 } else if (rc != 0) { 6272 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6273 } 6274 } 6275 6276 static void 6277 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6278 { 6279 struct spdk_bdev_io *parent_io = cb_arg; 6280 6281 spdk_bdev_free_io(bdev_io); 6282 6283 if (!success) { 6284 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 6285 return; 6286 } 6287 6288 bdev_compare_and_write_do_write(parent_io); 6289 } 6290 6291 static void 6292 bdev_compare_and_write_do_compare(void *_bdev_io) 6293 { 6294 struct spdk_bdev_io *bdev_io = _bdev_io; 6295 int rc; 6296 6297 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 6298 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 6299 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6300 bdev_compare_and_write_do_compare_done, bdev_io); 6301 6302 if (rc == -ENOMEM) { 6303 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 6304 } else if (rc != 0) { 6305 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 6306 } 6307 } 6308 6309 static void 6310 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 6311 { 6312 struct spdk_bdev_io *bdev_io = ctx; 6313 6314 if (status) { 6315 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 6316 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6317 return; 6318 } 6319 6320 bdev_compare_and_write_do_compare(bdev_io); 6321 } 6322 6323 int 6324 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6325 struct iovec *compare_iov, int compare_iovcnt, 6326 struct iovec *write_iov, int write_iovcnt, 6327 uint64_t offset_blocks, uint64_t num_blocks, 6328 spdk_bdev_io_completion_cb cb, void *cb_arg) 6329 { 6330 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6331 struct spdk_bdev_io *bdev_io; 6332 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6333 6334 if (!desc->write) { 6335 return -EBADF; 6336 } 6337 6338 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6339 return -EINVAL; 6340 } 6341 6342 if (num_blocks > bdev->acwu) { 6343 return -EINVAL; 6344 } 6345 6346 bdev_io = bdev_channel_get_io(channel); 6347 if (!bdev_io) { 6348 return -ENOMEM; 6349 } 6350 6351 bdev_io->internal.ch = channel; 6352 bdev_io->internal.desc = desc; 6353 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6354 bdev_io->u.bdev.iovs = compare_iov; 6355 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6356 bdev_io->u.bdev.fused_iovs = write_iov; 6357 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6358 bdev_io->u.bdev.md_buf = NULL; 6359 bdev_io->u.bdev.num_blocks = num_blocks; 6360 bdev_io->u.bdev.offset_blocks = offset_blocks; 6361 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6362 
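	/* This API takes no extended I/O options, so the memory-domain and
	 * accel-sequence fields are cleared below before choosing between native
	 * COMPARE_AND_WRITE submission and the emulated fallback, which locks the
	 * LBA range and issues a compare followed by a write. */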
bdev_io->u.bdev.memory_domain = NULL; 6363 bdev_io->u.bdev.memory_domain_ctx = NULL; 6364 bdev_io->u.bdev.accel_sequence = NULL; 6365 6366 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6367 bdev_io_submit(bdev_io); 6368 return 0; 6369 } 6370 6371 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6372 bdev_comparev_and_writev_blocks_locked, bdev_io); 6373 } 6374 6375 int 6376 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6377 struct iovec *iov, int iovcnt, 6378 uint64_t offset_blocks, uint64_t num_blocks, 6379 bool populate, 6380 spdk_bdev_io_completion_cb cb, void *cb_arg) 6381 { 6382 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6383 struct spdk_bdev_io *bdev_io; 6384 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6385 6386 if (!desc->write) { 6387 return -EBADF; 6388 } 6389 6390 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6391 return -EINVAL; 6392 } 6393 6394 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6395 return -ENOTSUP; 6396 } 6397 6398 bdev_io = bdev_channel_get_io(channel); 6399 if (!bdev_io) { 6400 return -ENOMEM; 6401 } 6402 6403 bdev_io->internal.ch = channel; 6404 bdev_io->internal.desc = desc; 6405 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6406 bdev_io->u.bdev.num_blocks = num_blocks; 6407 bdev_io->u.bdev.offset_blocks = offset_blocks; 6408 bdev_io->u.bdev.iovs = iov; 6409 bdev_io->u.bdev.iovcnt = iovcnt; 6410 bdev_io->u.bdev.md_buf = NULL; 6411 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6412 bdev_io->u.bdev.zcopy.commit = 0; 6413 bdev_io->u.bdev.zcopy.start = 1; 6414 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6415 bdev_io->u.bdev.memory_domain = NULL; 6416 bdev_io->u.bdev.memory_domain_ctx = NULL; 6417 bdev_io->u.bdev.accel_sequence = NULL; 6418 6419 bdev_io_submit(bdev_io); 6420 6421 return 0; 6422 } 6423 6424 int 6425 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6426 spdk_bdev_io_completion_cb cb, void *cb_arg) 6427 { 6428 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6429 return -EINVAL; 6430 } 6431 6432 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 6433 bdev_io->u.bdev.zcopy.start = 0; 6434 bdev_io->internal.caller_ctx = cb_arg; 6435 bdev_io->internal.cb = cb; 6436 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 6437 6438 bdev_io_submit(bdev_io); 6439 6440 return 0; 6441 } 6442 6443 int 6444 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6445 uint64_t offset, uint64_t len, 6446 spdk_bdev_io_completion_cb cb, void *cb_arg) 6447 { 6448 uint64_t offset_blocks, num_blocks; 6449 6450 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) { 6451 return -EINVAL; 6452 } 6453 6454 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6455 } 6456 6457 int 6458 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6459 uint64_t offset_blocks, uint64_t num_blocks, 6460 spdk_bdev_io_completion_cb cb, void *cb_arg) 6461 { 6462 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6463 struct spdk_bdev_io *bdev_io; 6464 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6465 6466 if (!desc->write) { 6467 return -EBADF; 6468 } 6469 6470 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6471 return -EINVAL; 6472 } 6473 6474 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 6475 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 6476 return -ENOTSUP; 6477 } 6478 6479 bdev_io = bdev_channel_get_io(channel); 6480 6481 if (!bdev_io) { 6482 return -ENOMEM; 6483 } 6484 6485 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 6486 bdev_io->internal.ch = channel; 6487 bdev_io->internal.desc = desc; 6488 bdev_io->u.bdev.offset_blocks = offset_blocks; 6489 bdev_io->u.bdev.num_blocks = num_blocks; 6490 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6491 bdev_io->u.bdev.memory_domain = NULL; 6492 bdev_io->u.bdev.memory_domain_ctx = NULL; 6493 bdev_io->u.bdev.accel_sequence = NULL; 6494 6495 /* If the write_zeroes size is large and should be split, use the generic split 6496 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not. 6497 * 6498 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 6499 * or emulate it using a regular write request otherwise.
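 *
 * Illustrative caller sketch (the io_ch and callback names are assumptions;
 * spdk_bdev_get_num_blocks() is the public helper for the bdev size):
 *
 *   rc = spdk_bdev_write_zeroes_blocks(desc, io_ch, 0,
 *                                      spdk_bdev_get_num_blocks(bdev),
 *                                      zeroes_done_cb, NULL);
 *   if (rc == -ENOMEM) {
 *       // No spdk_bdev_io is available right now; retry later, e.g. via
 *       // spdk_bdev_queue_io_wait().
 *   }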
6500 */ 6501 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6502 bdev_io->internal.f.split) { 6503 bdev_io_submit(bdev_io); 6504 return 0; 6505 } 6506 6507 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6508 6509 return bdev_write_zero_buffer(bdev_io); 6510 } 6511 6512 int 6513 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6514 uint64_t offset, uint64_t nbytes, 6515 spdk_bdev_io_completion_cb cb, void *cb_arg) 6516 { 6517 uint64_t offset_blocks, num_blocks; 6518 6519 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 6520 return -EINVAL; 6521 } 6522 6523 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6524 } 6525 6526 static void 6527 bdev_io_complete_cb(void *ctx) 6528 { 6529 struct spdk_bdev_io *bdev_io = ctx; 6530 6531 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6532 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6533 } 6534 6535 int 6536 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6537 uint64_t offset_blocks, uint64_t num_blocks, 6538 spdk_bdev_io_completion_cb cb, void *cb_arg) 6539 { 6540 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6541 struct spdk_bdev_io *bdev_io; 6542 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6543 6544 if (!desc->write) { 6545 return -EBADF; 6546 } 6547 6548 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6549 return -EINVAL; 6550 } 6551 6552 bdev_io = bdev_channel_get_io(channel); 6553 if (!bdev_io) { 6554 return -ENOMEM; 6555 } 6556 6557 bdev_io->internal.ch = channel; 6558 bdev_io->internal.desc = desc; 6559 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6560 6561 bdev_io->u.bdev.iovs = &bdev_io->iov; 6562 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6563 bdev_io->u.bdev.iovs[0].iov_len = 0; 6564 bdev_io->u.bdev.iovcnt = 1; 6565 6566 bdev_io->u.bdev.offset_blocks = offset_blocks; 6567 bdev_io->u.bdev.num_blocks = num_blocks; 6568 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6569 bdev_io->u.bdev.memory_domain = NULL; 6570 bdev_io->u.bdev.memory_domain_ctx = NULL; 6571 bdev_io->u.bdev.accel_sequence = NULL; 6572 6573 if (num_blocks == 0) { 6574 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6575 return 0; 6576 } 6577 6578 bdev_io_submit(bdev_io); 6579 return 0; 6580 } 6581 6582 int 6583 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6584 uint64_t offset, uint64_t length, 6585 spdk_bdev_io_completion_cb cb, void *cb_arg) 6586 { 6587 uint64_t offset_blocks, num_blocks; 6588 6589 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, length, &num_blocks) != 0) { 6590 return -EINVAL; 6591 } 6592 6593 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6594 } 6595 6596 int 6597 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6598 uint64_t offset_blocks, uint64_t num_blocks, 6599 spdk_bdev_io_completion_cb cb, void *cb_arg) 6600 { 6601 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6602 struct spdk_bdev_io *bdev_io; 6603 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6604 6605 if (!desc->write) { 6606 return -EBADF; 6607 } 6608 6609 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH))) { 6610 return -ENOTSUP; 6611 } 6612 6613 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6614 return -EINVAL; 6615 } 6616 6617 bdev_io = bdev_channel_get_io(channel); 6618 if (!bdev_io) { 6619 return 
-ENOMEM; 6620 } 6621 6622 bdev_io->internal.ch = channel; 6623 bdev_io->internal.desc = desc; 6624 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6625 bdev_io->u.bdev.iovs = NULL; 6626 bdev_io->u.bdev.iovcnt = 0; 6627 bdev_io->u.bdev.offset_blocks = offset_blocks; 6628 bdev_io->u.bdev.num_blocks = num_blocks; 6629 bdev_io->u.bdev.memory_domain = NULL; 6630 bdev_io->u.bdev.memory_domain_ctx = NULL; 6631 bdev_io->u.bdev.accel_sequence = NULL; 6632 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6633 6634 bdev_io_submit(bdev_io); 6635 return 0; 6636 } 6637 6638 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6639 6640 static void 6641 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6642 { 6643 struct spdk_bdev_io *bdev_io = _ctx; 6644 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 6645 6646 if (status == -EBUSY) { 6647 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6648 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6649 bdev_io, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6650 } else { 6651 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6652 /* If outstanding IOs are still present and reset_io_drain_timeout 6653 * seconds passed, start the reset. */ 6654 bdev_io_submit_reset(bdev_io); 6655 } else { 6656 /* We still have in progress memory domain pull/push or we're 6657 * executing accel sequence. Since we cannot abort either of those 6658 * operations, fail the reset request. */ 6659 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6660 } 6661 } 6662 } else { 6663 SPDK_DEBUGLOG(bdev, 6664 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6665 ch->bdev->name); 6666 /* Mark the completion status as a SUCCESS and complete the reset. */ 6667 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6668 } 6669 } 6670 6671 static void 6672 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6673 struct spdk_io_channel *io_ch, void *_ctx) 6674 { 6675 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6676 int status = 0; 6677 6678 if (cur_ch->io_outstanding > 0 || 6679 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6680 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6681 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 6682 * further iteration over the rest of the channels and pass non-zero status 6683 * to the callback function. 
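 * The completion callback (bdev_reset_check_outstanding_io_done above) then
 * either re-arms the poller while I/O remains and the drain timeout has not
 * expired, submits the reset to the module once the timeout expires, or
 * completes the reset successfully without involving the module when no I/O
 * is outstanding.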
*/ 6684 status = -EBUSY; 6685 } 6686 spdk_bdev_for_each_channel_continue(i, status); 6687 } 6688 6689 static int 6690 bdev_reset_poll_for_outstanding_io(void *ctx) 6691 { 6692 struct spdk_bdev_io *bdev_io = ctx; 6693 6694 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6695 spdk_bdev_for_each_channel(bdev_io->bdev, bdev_reset_check_outstanding_io, bdev_io, 6696 bdev_reset_check_outstanding_io_done); 6697 6698 return SPDK_POLLER_BUSY; 6699 } 6700 6701 static void 6702 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6703 { 6704 struct spdk_bdev_io *bdev_io = _ctx; 6705 6706 if (bdev->reset_io_drain_timeout == 0) { 6707 bdev_io_submit_reset(bdev_io); 6708 return; 6709 } 6710 6711 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6712 (bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6713 6714 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6715 * submit the reset to the underlying module only if outstanding I/O 6716 * remain after reset_io_drain_timeout seconds have passed. */ 6717 spdk_bdev_for_each_channel(bdev, bdev_reset_check_outstanding_io, bdev_io, 6718 bdev_reset_check_outstanding_io_done); 6719 } 6720 6721 static void 6722 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6723 struct spdk_io_channel *ch, void *_ctx) 6724 { 6725 struct spdk_bdev_channel *channel; 6726 struct spdk_bdev_mgmt_channel *mgmt_channel; 6727 struct spdk_bdev_shared_resource *shared_resource; 6728 bdev_io_tailq_t tmp_queued; 6729 6730 TAILQ_INIT(&tmp_queued); 6731 6732 channel = __io_ch_to_bdev_ch(ch); 6733 shared_resource = channel->shared_resource; 6734 mgmt_channel = shared_resource->mgmt_ch; 6735 6736 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6737 6738 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6739 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6740 } 6741 6742 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6743 bdev_abort_all_buf_io(mgmt_channel, channel); 6744 bdev_abort_all_queued_io(&tmp_queued, channel); 6745 6746 spdk_bdev_for_each_channel_continue(i, 0); 6747 } 6748 6749 static void 6750 bdev_start_reset(struct spdk_bdev_io *bdev_io) 6751 { 6752 struct spdk_bdev *bdev = bdev_io->bdev; 6753 bool freeze_channel = false; 6754 6755 bdev_ch_add_to_io_submitted(bdev_io); 6756 6757 /** 6758 * Take a channel reference for the target bdev for the life of this 6759 * reset. This guards against the channel getting destroyed before 6760 * the reset is completed. We will release the reference when this 6761 * reset is completed. 
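 *
 * Only one reset is processed per bdev at a time: if another reset is already
 * in progress, this request is appended to internal.queued_resets below and
 * submitted after the current reset completes.
 *
 * Illustrative caller sketch (the callback name is an assumption):
 *
 *   rc = spdk_bdev_reset(desc, io_ch, reset_done_cb, NULL);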
6762 */ 6763 bdev_io->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6764 6765 spdk_spin_lock(&bdev->internal.spinlock); 6766 if (bdev->internal.reset_in_progress == NULL) { 6767 bdev->internal.reset_in_progress = bdev_io; 6768 freeze_channel = true; 6769 } else { 6770 TAILQ_INSERT_TAIL(&bdev->internal.queued_resets, bdev_io, internal.link); 6771 } 6772 spdk_spin_unlock(&bdev->internal.spinlock); 6773 6774 if (freeze_channel) { 6775 spdk_bdev_for_each_channel(bdev, bdev_reset_freeze_channel, bdev_io, 6776 bdev_reset_freeze_channel_done); 6777 } 6778 } 6779 6780 int 6781 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6782 spdk_bdev_io_completion_cb cb, void *cb_arg) 6783 { 6784 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6785 struct spdk_bdev_io *bdev_io; 6786 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6787 6788 bdev_io = bdev_channel_get_io(channel); 6789 if (!bdev_io) { 6790 return -ENOMEM; 6791 } 6792 6793 bdev_io->internal.ch = channel; 6794 bdev_io->internal.desc = desc; 6795 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6796 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6797 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6798 6799 bdev_start_reset(bdev_io); 6800 return 0; 6801 } 6802 6803 void 6804 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6805 struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode reset_mode) 6806 { 6807 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6808 6809 bdev_get_io_stat(stat, channel->stat); 6810 spdk_bdev_reset_io_stat(channel->stat, reset_mode); 6811 } 6812 6813 static void 6814 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6815 { 6816 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6817 6818 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6819 bdev_iostat_ctx->cb_arg, 0); 6820 free(bdev_iostat_ctx); 6821 } 6822 6823 static void 6824 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6825 struct spdk_io_channel *ch, void *_ctx) 6826 { 6827 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6828 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6829 6830 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6831 spdk_bdev_reset_io_stat(channel->stat, bdev_iostat_ctx->reset_mode); 6832 spdk_bdev_for_each_channel_continue(i, 0); 6833 } 6834 6835 void 6836 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6837 enum spdk_bdev_reset_stat_mode reset_mode, spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6838 { 6839 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6840 6841 assert(bdev != NULL); 6842 assert(stat != NULL); 6843 assert(cb != NULL); 6844 6845 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6846 if (bdev_iostat_ctx == NULL) { 6847 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6848 cb(bdev, stat, cb_arg, -ENOMEM); 6849 return; 6850 } 6851 6852 bdev_iostat_ctx->stat = stat; 6853 bdev_iostat_ctx->cb = cb; 6854 bdev_iostat_ctx->cb_arg = cb_arg; 6855 bdev_iostat_ctx->reset_mode = reset_mode; 6856 6857 /* Start with the statistics from previously deleted channels. */ 6858 spdk_spin_lock(&bdev->internal.spinlock); 6859 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6860 spdk_bdev_reset_io_stat(bdev->internal.stat, reset_mode); 6861 spdk_spin_unlock(&bdev->internal.spinlock); 6862 6863 /* Then iterate and add the statistics from each existing channel. 
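 *
 * Illustrative caller sketch (the callback name and the way the stat buffer is
 * allocated are assumptions; the buffer must stay valid until the callback
 * runs, where it is handed back together with the bdev):
 *
 *   spdk_bdev_get_device_stat(bdev, stat, SPDK_BDEV_RESET_STAT_NONE,
 *                             get_stat_done_cb, NULL);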
*/ 6864 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6865 bdev_get_device_stat_done); 6866 } 6867 6868 struct bdev_iostat_reset_ctx { 6869 enum spdk_bdev_reset_stat_mode mode; 6870 bdev_reset_device_stat_cb cb; 6871 void *cb_arg; 6872 }; 6873 6874 static void 6875 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6876 { 6877 struct bdev_iostat_reset_ctx *ctx = _ctx; 6878 6879 ctx->cb(bdev, ctx->cb_arg, 0); 6880 6881 free(ctx); 6882 } 6883 6884 static void 6885 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6886 struct spdk_io_channel *ch, void *_ctx) 6887 { 6888 struct bdev_iostat_reset_ctx *ctx = _ctx; 6889 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6890 6891 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6892 6893 spdk_bdev_for_each_channel_continue(i, 0); 6894 } 6895 6896 void 6897 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6898 bdev_reset_device_stat_cb cb, void *cb_arg) 6899 { 6900 struct bdev_iostat_reset_ctx *ctx; 6901 6902 assert(bdev != NULL); 6903 assert(cb != NULL); 6904 6905 ctx = calloc(1, sizeof(*ctx)); 6906 if (ctx == NULL) { 6907 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6908 cb(bdev, cb_arg, -ENOMEM); 6909 return; 6910 } 6911 6912 ctx->mode = mode; 6913 ctx->cb = cb; 6914 ctx->cb_arg = cb_arg; 6915 6916 spdk_spin_lock(&bdev->internal.spinlock); 6917 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6918 spdk_spin_unlock(&bdev->internal.spinlock); 6919 6920 spdk_bdev_for_each_channel(bdev, 6921 bdev_reset_each_channel_stat, 6922 ctx, 6923 bdev_reset_device_stat_done); 6924 } 6925 6926 int 6927 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6928 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6929 spdk_bdev_io_completion_cb cb, void *cb_arg) 6930 { 6931 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6932 struct spdk_bdev_io *bdev_io; 6933 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6934 6935 if (!desc->write) { 6936 return -EBADF; 6937 } 6938 6939 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6940 return -ENOTSUP; 6941 } 6942 6943 bdev_io = bdev_channel_get_io(channel); 6944 if (!bdev_io) { 6945 return -ENOMEM; 6946 } 6947 6948 bdev_io->internal.ch = channel; 6949 bdev_io->internal.desc = desc; 6950 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6951 bdev_io->u.nvme_passthru.cmd = *cmd; 6952 bdev_io->u.nvme_passthru.buf = buf; 6953 bdev_io->u.nvme_passthru.nbytes = nbytes; 6954 bdev_io->u.nvme_passthru.md_buf = NULL; 6955 bdev_io->u.nvme_passthru.md_len = 0; 6956 6957 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6958 6959 bdev_io_submit(bdev_io); 6960 return 0; 6961 } 6962 6963 int 6964 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6965 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6966 spdk_bdev_io_completion_cb cb, void *cb_arg) 6967 { 6968 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6969 struct spdk_bdev_io *bdev_io; 6970 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6971 6972 if (!desc->write) { 6973 /* 6974 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6975 * to easily determine if the command is a read or write, but for now just 6976 * do not allow io_passthru with a read-only descriptor. 
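 *
 * Illustrative caller sketch (the opcode choice and callback name are
 * assumptions; the command itself is passed through to the bdev module
 * without inspection):
 *
 *   struct spdk_nvme_cmd cmd = {};
 *
 *   cmd.opc = SPDK_NVME_OPC_FLUSH;
 *   cmd.nsid = 1;
 *   rc = spdk_bdev_nvme_io_passthru(desc, io_ch, &cmd, NULL, 0,
 *                                   passthru_done_cb, NULL);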
6977 */ 6978 return -EBADF; 6979 } 6980 6981 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6982 return -ENOTSUP; 6983 } 6984 6985 bdev_io = bdev_channel_get_io(channel); 6986 if (!bdev_io) { 6987 return -ENOMEM; 6988 } 6989 6990 bdev_io->internal.ch = channel; 6991 bdev_io->internal.desc = desc; 6992 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6993 bdev_io->u.nvme_passthru.cmd = *cmd; 6994 bdev_io->u.nvme_passthru.buf = buf; 6995 bdev_io->u.nvme_passthru.nbytes = nbytes; 6996 bdev_io->u.nvme_passthru.md_buf = NULL; 6997 bdev_io->u.nvme_passthru.md_len = 0; 6998 6999 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7000 7001 bdev_io_submit(bdev_io); 7002 return 0; 7003 } 7004 7005 int 7006 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7007 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 7008 spdk_bdev_io_completion_cb cb, void *cb_arg) 7009 { 7010 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7011 struct spdk_bdev_io *bdev_io; 7012 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7013 7014 if (!desc->write) { 7015 /* 7016 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 7017 * to easily determine if the command is a read or write, but for now just 7018 * do not allow io_passthru with a read-only descriptor. 7019 */ 7020 return -EBADF; 7021 } 7022 7023 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 7024 return -ENOTSUP; 7025 } 7026 7027 bdev_io = bdev_channel_get_io(channel); 7028 if (!bdev_io) { 7029 return -ENOMEM; 7030 } 7031 7032 bdev_io->internal.ch = channel; 7033 bdev_io->internal.desc = desc; 7034 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 7035 bdev_io->u.nvme_passthru.cmd = *cmd; 7036 bdev_io->u.nvme_passthru.buf = buf; 7037 bdev_io->u.nvme_passthru.nbytes = nbytes; 7038 bdev_io->u.nvme_passthru.md_buf = md_buf; 7039 bdev_io->u.nvme_passthru.md_len = md_len; 7040 7041 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7042 7043 bdev_io_submit(bdev_io); 7044 return 0; 7045 } 7046 7047 int 7048 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 7049 struct spdk_io_channel *ch, 7050 const struct spdk_nvme_cmd *cmd, 7051 struct iovec *iov, int iovcnt, size_t nbytes, 7052 void *md_buf, size_t md_len, 7053 spdk_bdev_io_completion_cb cb, void *cb_arg) 7054 { 7055 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7056 struct spdk_bdev_io *bdev_io; 7057 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7058 7059 if (!desc->write) { 7060 /* 7061 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 7062 * to easily determine if the command is a read or write, but for now just 7063 * do not allow io_passthru with a read-only descriptor. 
7064 */ 7065 return -EBADF; 7066 } 7067 7068 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 7069 return -ENOTSUP; 7070 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 7071 return -ENOTSUP; 7072 } 7073 7074 bdev_io = bdev_channel_get_io(channel); 7075 if (!bdev_io) { 7076 return -ENOMEM; 7077 } 7078 7079 bdev_io->internal.ch = channel; 7080 bdev_io->internal.desc = desc; 7081 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 7082 bdev_io->u.nvme_passthru.cmd = *cmd; 7083 bdev_io->u.nvme_passthru.iovs = iov; 7084 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 7085 bdev_io->u.nvme_passthru.nbytes = nbytes; 7086 bdev_io->u.nvme_passthru.md_buf = md_buf; 7087 bdev_io->u.nvme_passthru.md_len = md_len; 7088 7089 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7090 7091 bdev_io_submit(bdev_io); 7092 return 0; 7093 } 7094 7095 static void bdev_abort_retry(void *ctx); 7096 static void bdev_abort(struct spdk_bdev_io *parent_io); 7097 7098 static void 7099 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 7100 { 7101 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 7102 struct spdk_bdev_io *parent_io = cb_arg; 7103 struct spdk_bdev_io *bio_to_abort, *tmp_io; 7104 7105 bio_to_abort = bdev_io->u.abort.bio_to_abort; 7106 7107 spdk_bdev_free_io(bdev_io); 7108 7109 if (!success) { 7110 /* Check if the target I/O completed in the meantime. */ 7111 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 7112 if (tmp_io == bio_to_abort) { 7113 break; 7114 } 7115 } 7116 7117 /* If the target I/O still exists, set the parent to failed. */ 7118 if (tmp_io != NULL) { 7119 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7120 } 7121 } 7122 7123 assert(parent_io->internal.f.split); 7124 7125 parent_io->internal.split.outstanding--; 7126 if (parent_io->internal.split.outstanding == 0) { 7127 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7128 bdev_abort_retry(parent_io); 7129 } else { 7130 bdev_io_complete(parent_io); 7131 } 7132 } 7133 } 7134 7135 static int 7136 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 7137 struct spdk_bdev_io *bio_to_abort, 7138 spdk_bdev_io_completion_cb cb, void *cb_arg) 7139 { 7140 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7141 struct spdk_bdev_io *bdev_io; 7142 7143 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 7144 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 7145 /* TODO: Abort reset or abort request. */ 7146 return -ENOTSUP; 7147 } 7148 7149 bdev_io = bdev_channel_get_io(channel); 7150 if (bdev_io == NULL) { 7151 return -ENOMEM; 7152 } 7153 7154 bdev_io->internal.ch = channel; 7155 bdev_io->internal.desc = desc; 7156 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7157 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7158 7159 if (bio_to_abort->internal.f.split) { 7160 assert(bdev_io_should_split(bio_to_abort)); 7161 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 7162 7163 /* Parent abort request is not submitted directly, but to manage its 7164 * execution add it to the submitted list here. 7165 */ 7166 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7167 bdev_ch_add_to_io_submitted(bdev_io); 7168 7169 bdev_abort(bdev_io); 7170 7171 return 0; 7172 } 7173 7174 bdev_io->u.abort.bio_to_abort = bio_to_abort; 7175 7176 /* Submit the abort request to the underlying bdev module. 
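 * The module completes the abort asynchronously; on failure, bdev_abort_io_done
 * above checks whether the target I/O is still on the channel's io_submitted
 * list before marking the parent abort as failed.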
*/ 7177 bdev_io_submit(bdev_io); 7178 7179 return 0; 7180 } 7181 7182 static bool 7183 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 7184 { 7185 struct spdk_bdev_io *iter; 7186 7187 TAILQ_FOREACH(iter, tailq, internal.link) { 7188 if (iter == bdev_io) { 7189 return true; 7190 } 7191 } 7192 7193 return false; 7194 } 7195 7196 static uint32_t 7197 _bdev_abort(struct spdk_bdev_io *parent_io) 7198 { 7199 struct spdk_bdev_desc *desc = parent_io->internal.desc; 7200 struct spdk_bdev_channel *channel = parent_io->internal.ch; 7201 void *bio_cb_arg; 7202 struct spdk_bdev_io *bio_to_abort; 7203 uint32_t matched_ios; 7204 int rc; 7205 7206 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 7207 7208 /* matched_ios is returned and will be kept by the caller. 7209 * 7210 * This function will be used for two cases, 1) the same cb_arg is used for 7211 * multiple I/Os, 2) a single large I/O is split into smaller ones. 7212 * Incrementing split_outstanding directly here may confuse readers especially 7213 * for the 1st case. 7214 * 7215 * Completion of I/O abort is processed after stack unwinding. Hence this trick 7216 * works as expected. 7217 */ 7218 matched_ios = 0; 7219 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7220 7221 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 7222 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 7223 continue; 7224 } 7225 7226 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 7227 /* Any I/O which was submitted after this abort command should be excluded. */ 7228 continue; 7229 } 7230 7231 /* We can't abort a request that's being pushed/pulled or executed by accel */ 7232 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 7233 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 7234 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7235 break; 7236 } 7237 7238 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 7239 if (rc != 0) { 7240 if (rc == -ENOMEM) { 7241 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 7242 } else { 7243 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7244 } 7245 break; 7246 } 7247 matched_ios++; 7248 } 7249 7250 return matched_ios; 7251 } 7252 7253 static void 7254 bdev_abort_retry(void *ctx) 7255 { 7256 struct spdk_bdev_io *parent_io = ctx; 7257 uint32_t matched_ios; 7258 7259 matched_ios = _bdev_abort(parent_io); 7260 7261 if (matched_ios == 0) { 7262 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7263 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7264 } else { 7265 /* For retry, the case that no target I/O was found is success 7266 * because it means target I/Os completed in the meantime. 7267 */ 7268 bdev_io_complete(parent_io); 7269 } 7270 return; 7271 } 7272 7273 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7274 parent_io->internal.f.split = true; 7275 parent_io->internal.split.outstanding = matched_ios; 7276 } 7277 7278 static void 7279 bdev_abort(struct spdk_bdev_io *parent_io) 7280 { 7281 uint32_t matched_ios; 7282 7283 matched_ios = _bdev_abort(parent_io); 7284 7285 if (matched_ios == 0) { 7286 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7287 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7288 } else { 7289 /* The case that no target I/O was found is a failure.
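 *
 * Illustrative caller sketch: abort every I/O that was submitted with a given
 * cb_arg (io_cookie and the callback name are assumptions):
 *
 *   rc = spdk_bdev_abort(desc, io_ch, io_cookie, abort_done_cb, NULL);
 *   if (rc == -ENOTSUP) {
 *       // The underlying bdev module does not support SPDK_BDEV_IO_TYPE_ABORT.
 *   }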
*/ 7290 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7291 bdev_io_complete(parent_io); 7292 } 7293 return; 7294 } 7295 7296 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7297 parent_io->internal.f.split = true; 7298 parent_io->internal.split.outstanding = matched_ios; 7299 } 7300 7301 int 7302 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7303 void *bio_cb_arg, 7304 spdk_bdev_io_completion_cb cb, void *cb_arg) 7305 { 7306 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7307 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7308 struct spdk_bdev_io *bdev_io; 7309 7310 if (bio_cb_arg == NULL) { 7311 return -EINVAL; 7312 } 7313 7314 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7315 return -ENOTSUP; 7316 } 7317 7318 bdev_io = bdev_channel_get_io(channel); 7319 if (bdev_io == NULL) { 7320 return -ENOMEM; 7321 } 7322 7323 bdev_io->internal.ch = channel; 7324 bdev_io->internal.desc = desc; 7325 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7326 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7327 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7328 7329 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7330 7331 /* Parent abort request is not submitted directly, but to manage its execution, 7332 * add it to the submitted list here. 7333 */ 7334 bdev_ch_add_to_io_submitted(bdev_io); 7335 7336 bdev_abort(bdev_io); 7337 7338 return 0; 7339 } 7340 7341 int 7342 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7343 struct spdk_bdev_io_wait_entry *entry) 7344 { 7345 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7346 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7347 7348 if (bdev != entry->bdev) { 7349 SPDK_ERRLOG("bdevs do not match\n"); 7350 return -EINVAL; 7351 } 7352 7353 if (mgmt_ch->per_thread_cache_count > 0) { 7354 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7355 return -EINVAL; 7356 } 7357 7358 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7359 return 0; 7360 } 7361 7362 static inline void 7363 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7364 { 7365 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7366 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7367 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7368 uint32_t blocklen = bdev_io->bdev->blocklen; 7369 7370 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7371 switch (bdev_io->type) { 7372 case SPDK_BDEV_IO_TYPE_READ: 7373 io_stat->bytes_read += num_blocks * blocklen; 7374 io_stat->num_read_ops++; 7375 io_stat->read_latency_ticks += tsc_diff; 7376 if (io_stat->max_read_latency_ticks < tsc_diff) { 7377 io_stat->max_read_latency_ticks = tsc_diff; 7378 } 7379 if (io_stat->min_read_latency_ticks > tsc_diff) { 7380 io_stat->min_read_latency_ticks = tsc_diff; 7381 } 7382 break; 7383 case SPDK_BDEV_IO_TYPE_WRITE: 7384 io_stat->bytes_written += num_blocks * blocklen; 7385 io_stat->num_write_ops++; 7386 io_stat->write_latency_ticks += tsc_diff; 7387 if (io_stat->max_write_latency_ticks < tsc_diff) { 7388 io_stat->max_write_latency_ticks = tsc_diff; 7389 } 7390 if (io_stat->min_write_latency_ticks > tsc_diff) { 7391 io_stat->min_write_latency_ticks = tsc_diff; 7392 } 7393 break; 7394 case SPDK_BDEV_IO_TYPE_UNMAP: 7395 io_stat->bytes_unmapped += num_blocks * blocklen; 7396 io_stat->num_unmap_ops++; 7397 io_stat->unmap_latency_ticks += tsc_diff; 7398 if 
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7399 io_stat->max_unmap_latency_ticks = tsc_diff; 7400 } 7401 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7402 io_stat->min_unmap_latency_ticks = tsc_diff; 7403 } 7404 break; 7405 case SPDK_BDEV_IO_TYPE_ZCOPY: 7406 /* Track the data in the start phase only */ 7407 if (bdev_io->u.bdev.zcopy.start) { 7408 if (bdev_io->u.bdev.zcopy.populate) { 7409 io_stat->bytes_read += num_blocks * blocklen; 7410 io_stat->num_read_ops++; 7411 io_stat->read_latency_ticks += tsc_diff; 7412 if (io_stat->max_read_latency_ticks < tsc_diff) { 7413 io_stat->max_read_latency_ticks = tsc_diff; 7414 } 7415 if (io_stat->min_read_latency_ticks > tsc_diff) { 7416 io_stat->min_read_latency_ticks = tsc_diff; 7417 } 7418 } else { 7419 io_stat->bytes_written += num_blocks * blocklen; 7420 io_stat->num_write_ops++; 7421 io_stat->write_latency_ticks += tsc_diff; 7422 if (io_stat->max_write_latency_ticks < tsc_diff) { 7423 io_stat->max_write_latency_ticks = tsc_diff; 7424 } 7425 if (io_stat->min_write_latency_ticks > tsc_diff) { 7426 io_stat->min_write_latency_ticks = tsc_diff; 7427 } 7428 } 7429 } 7430 break; 7431 case SPDK_BDEV_IO_TYPE_COPY: 7432 io_stat->bytes_copied += num_blocks * blocklen; 7433 io_stat->num_copy_ops++; 7434 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7435 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7436 io_stat->max_copy_latency_ticks = tsc_diff; 7437 } 7438 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7439 io_stat->min_copy_latency_ticks = tsc_diff; 7440 } 7441 break; 7442 default: 7443 break; 7444 } 7445 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7446 io_stat = bdev_io->bdev->internal.stat; 7447 assert(io_stat->io_error != NULL); 7448 7449 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7450 io_stat->io_error->error_status[-io_status - 1]++; 7451 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7452 } 7453 7454 #ifdef SPDK_CONFIG_VTUNE 7455 uint64_t now_tsc = spdk_get_ticks(); 7456 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7457 uint64_t data[5]; 7458 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7459 7460 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7461 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7462 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7463 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7464 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
7465 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7466 7467 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7468 __itt_metadata_u64, 5, data); 7469 7470 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7471 bdev_io->internal.ch->start_tsc = now_tsc; 7472 } 7473 #endif 7474 } 7475 7476 static inline void 7477 _bdev_io_complete(void *ctx) 7478 { 7479 struct spdk_bdev_io *bdev_io = ctx; 7480 7481 if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) { 7482 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7483 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7484 } 7485 7486 assert(bdev_io->internal.cb != NULL); 7487 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7488 7489 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7490 bdev_io->internal.caller_ctx); 7491 } 7492 7493 static inline void 7494 bdev_io_complete(void *ctx) 7495 { 7496 struct spdk_bdev_io *bdev_io = ctx; 7497 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7498 uint64_t tsc, tsc_diff; 7499 7500 if (spdk_unlikely(bdev_io->internal.f.in_submit_request)) { 7501 /* 7502 * Defer completion to avoid potential infinite recursion if the 7503 * user's completion callback issues a new I/O. 7504 */ 7505 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7506 bdev_io_complete, bdev_io); 7507 return; 7508 } 7509 7510 tsc = spdk_get_ticks(); 7511 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7512 7513 bdev_ch_remove_from_io_submitted(bdev_io); 7514 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io, 7515 bdev_io->internal.caller_ctx, bdev_ch->queue_depth); 7516 7517 if (bdev_ch->histogram) { 7518 if (bdev_io->bdev->internal.histogram_io_type == 0 || 7519 bdev_io->bdev->internal.histogram_io_type == bdev_io->type) { 7520 /* 7521 * Tally all I/O types if the histogram_io_type is set to 0. 7522 */ 7523 spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff); 7524 } 7525 } 7526 7527 bdev_io_update_io_stat(bdev_io, tsc_diff); 7528 _bdev_io_complete(bdev_io); 7529 } 7530 7531 /* The difference between this function and bdev_io_complete() is that this should be called to 7532 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7533 * io_submitted list and don't have submit_tsc updated. 7534 */ 7535 static inline void 7536 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7537 { 7538 /* Since the IO hasn't been submitted it's bound to be failed */ 7539 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7540 7541 /* At this point we don't know if the IO is completed from submission context or not, but, 7542 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7543 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7544 _bdev_io_complete, bdev_io); 7545 } 7546 7547 static void bdev_destroy_cb(void *io_device); 7548 7549 static inline void 7550 _bdev_reset_complete(void *ctx) 7551 { 7552 struct spdk_bdev_io *bdev_io = ctx; 7553 7554 /* Put the channel reference we got in submission. 
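 * Releasing it here is what lets a pending spdk_io_device_unregister() of this
 * bdev finish, since io_device teardown waits for every channel reference to
 * be dropped (see also the REMOVING check in bdev_reset_complete()).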
*/ 7555 assert(bdev_io->u.reset.ch_ref != NULL); 7556 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7557 bdev_io->u.reset.ch_ref = NULL; 7558 7559 bdev_io_complete(bdev_io); 7560 } 7561 7562 static void 7563 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7564 { 7565 struct spdk_bdev_io *bdev_io = _ctx; 7566 bdev_io_tailq_t queued_resets; 7567 struct spdk_bdev_io *queued_reset; 7568 7569 assert(bdev_io == bdev->internal.reset_in_progress); 7570 7571 TAILQ_INIT(&queued_resets); 7572 7573 spdk_spin_lock(&bdev->internal.spinlock); 7574 TAILQ_SWAP(&bdev->internal.queued_resets, &queued_resets, 7575 spdk_bdev_io, internal.link); 7576 bdev->internal.reset_in_progress = NULL; 7577 spdk_spin_unlock(&bdev->internal.spinlock); 7578 7579 while (!TAILQ_EMPTY(&queued_resets)) { 7580 queued_reset = TAILQ_FIRST(&queued_resets); 7581 TAILQ_REMOVE(&queued_resets, queued_reset, internal.link); 7582 queued_reset->internal.status = bdev_io->internal.status; 7583 spdk_thread_send_msg(spdk_bdev_io_get_thread(queued_reset), 7584 _bdev_reset_complete, queued_reset); 7585 } 7586 7587 _bdev_reset_complete(bdev_io); 7588 7589 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7590 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7591 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7592 } 7593 } 7594 7595 static void 7596 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7597 struct spdk_io_channel *_ch, void *_ctx) 7598 { 7599 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7600 7601 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7602 7603 spdk_bdev_for_each_channel_continue(i, 0); 7604 } 7605 7606 static void 7607 bdev_io_complete_sequence_cb(void *ctx, int status) 7608 { 7609 struct spdk_bdev_io *bdev_io = ctx; 7610 7611 /* u.bdev.accel_sequence should have already been cleared at this point */ 7612 assert(bdev_io->u.bdev.accel_sequence == NULL); 7613 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7614 bdev_io->internal.f.has_accel_sequence = false; 7615 7616 if (spdk_unlikely(status != 0)) { 7617 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7618 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7619 } 7620 7621 bdev_io_complete(bdev_io); 7622 } 7623 7624 void 7625 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7626 { 7627 struct spdk_bdev *bdev = bdev_io->bdev; 7628 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7629 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7630 7631 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7632 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7633 spdk_bdev_get_module_name(bdev), 7634 bdev_io_status_get_string(bdev_io->internal.status)); 7635 assert(false); 7636 } 7637 bdev_io->internal.status = status; 7638 7639 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7640 assert(bdev_io == bdev->internal.reset_in_progress); 7641 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7642 bdev_reset_complete); 7643 return; 7644 } else { 7645 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7646 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7647 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7648 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7649 return; 7650 } else if (spdk_unlikely(bdev_io->internal.f.has_bounce_buf && 7651 
!bdev_io_use_accel_sequence(bdev_io))) { 7652 _bdev_io_push_bounce_data_buffer(bdev_io, 7653 _bdev_io_complete_push_bounce_done); 7654 /* bdev IO will be completed in the callback */ 7655 return; 7656 } 7657 } 7658 7659 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7660 return; 7661 } 7662 } 7663 7664 bdev_io_complete(bdev_io); 7665 } 7666 7667 void 7668 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7669 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7670 { 7671 enum spdk_bdev_io_status status; 7672 7673 if (sc == SPDK_SCSI_STATUS_GOOD) { 7674 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7675 } else { 7676 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7677 bdev_io->internal.error.scsi.sc = sc; 7678 bdev_io->internal.error.scsi.sk = sk; 7679 bdev_io->internal.error.scsi.asc = asc; 7680 bdev_io->internal.error.scsi.ascq = ascq; 7681 } 7682 7683 spdk_bdev_io_complete(bdev_io, status); 7684 } 7685 7686 void 7687 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7688 int *sc, int *sk, int *asc, int *ascq) 7689 { 7690 assert(sc != NULL); 7691 assert(sk != NULL); 7692 assert(asc != NULL); 7693 assert(ascq != NULL); 7694 7695 switch (bdev_io->internal.status) { 7696 case SPDK_BDEV_IO_STATUS_SUCCESS: 7697 *sc = SPDK_SCSI_STATUS_GOOD; 7698 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7699 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7700 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7701 break; 7702 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7703 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7704 break; 7705 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7706 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7707 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7708 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7709 *ascq = bdev_io->internal.error.scsi.ascq; 7710 break; 7711 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7712 *sc = bdev_io->internal.error.scsi.sc; 7713 *sk = bdev_io->internal.error.scsi.sk; 7714 *asc = bdev_io->internal.error.scsi.asc; 7715 *ascq = bdev_io->internal.error.scsi.ascq; 7716 break; 7717 default: 7718 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7719 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7720 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7721 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7722 break; 7723 } 7724 } 7725 7726 void 7727 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7728 { 7729 enum spdk_bdev_io_status status; 7730 7731 if (aio_result == 0) { 7732 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7733 } else { 7734 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7735 } 7736 7737 bdev_io->internal.error.aio_result = aio_result; 7738 7739 spdk_bdev_io_complete(bdev_io, status); 7740 } 7741 7742 void 7743 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7744 { 7745 assert(aio_result != NULL); 7746 7747 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7748 *aio_result = bdev_io->internal.error.aio_result; 7749 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7750 *aio_result = 0; 7751 } else { 7752 *aio_result = -EIO; 7753 } 7754 } 7755 7756 void 7757 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7758 { 7759 enum spdk_bdev_io_status status; 7760 7761 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7762 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7763 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7764 status = SPDK_BDEV_IO_STATUS_ABORTED; 7765 
} else { 7766 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7767 } 7768 7769 bdev_io->internal.error.nvme.cdw0 = cdw0; 7770 bdev_io->internal.error.nvme.sct = sct; 7771 bdev_io->internal.error.nvme.sc = sc; 7772 7773 spdk_bdev_io_complete(bdev_io, status); 7774 } 7775 7776 void 7777 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7778 { 7779 assert(sct != NULL); 7780 assert(sc != NULL); 7781 assert(cdw0 != NULL); 7782 7783 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7784 *sct = SPDK_NVME_SCT_GENERIC; 7785 *sc = SPDK_NVME_SC_SUCCESS; 7786 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7787 *cdw0 = 0; 7788 } else { 7789 *cdw0 = 1U; 7790 } 7791 return; 7792 } 7793 7794 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7795 *sct = SPDK_NVME_SCT_GENERIC; 7796 *sc = SPDK_NVME_SC_SUCCESS; 7797 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7798 *sct = bdev_io->internal.error.nvme.sct; 7799 *sc = bdev_io->internal.error.nvme.sc; 7800 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7801 *sct = SPDK_NVME_SCT_GENERIC; 7802 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7803 } else { 7804 *sct = SPDK_NVME_SCT_GENERIC; 7805 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7806 } 7807 7808 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7809 } 7810 7811 void 7812 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7813 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7814 { 7815 assert(first_sct != NULL); 7816 assert(first_sc != NULL); 7817 assert(second_sct != NULL); 7818 assert(second_sc != NULL); 7819 assert(cdw0 != NULL); 7820 7821 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7822 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7823 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7824 *first_sct = bdev_io->internal.error.nvme.sct; 7825 *first_sc = bdev_io->internal.error.nvme.sc; 7826 *second_sct = SPDK_NVME_SCT_GENERIC; 7827 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7828 } else { 7829 *first_sct = SPDK_NVME_SCT_GENERIC; 7830 *first_sc = SPDK_NVME_SC_SUCCESS; 7831 *second_sct = bdev_io->internal.error.nvme.sct; 7832 *second_sc = bdev_io->internal.error.nvme.sc; 7833 } 7834 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7835 *first_sct = SPDK_NVME_SCT_GENERIC; 7836 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7837 *second_sct = SPDK_NVME_SCT_GENERIC; 7838 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7839 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7840 *first_sct = SPDK_NVME_SCT_GENERIC; 7841 *first_sc = SPDK_NVME_SC_SUCCESS; 7842 *second_sct = SPDK_NVME_SCT_GENERIC; 7843 *second_sc = SPDK_NVME_SC_SUCCESS; 7844 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7845 *first_sct = SPDK_NVME_SCT_GENERIC; 7846 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7847 *second_sct = SPDK_NVME_SCT_GENERIC; 7848 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7849 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7850 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7851 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7852 *second_sct = SPDK_NVME_SCT_GENERIC; 7853 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7854 } else { 7855 *first_sct = SPDK_NVME_SCT_GENERIC; 7856 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7857 *second_sct = SPDK_NVME_SCT_GENERIC; 7858 *second_sc = 
SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7859 } 7860 7861 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7862 } 7863 7864 void 7865 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 7866 const struct spdk_bdev_io *base_io) 7867 { 7868 switch (base_io->internal.status) { 7869 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7870 spdk_bdev_io_complete_nvme_status(bdev_io, 7871 base_io->internal.error.nvme.cdw0, 7872 base_io->internal.error.nvme.sct, 7873 base_io->internal.error.nvme.sc); 7874 break; 7875 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7876 spdk_bdev_io_complete_scsi_status(bdev_io, 7877 base_io->internal.error.scsi.sc, 7878 base_io->internal.error.scsi.sk, 7879 base_io->internal.error.scsi.asc, 7880 base_io->internal.error.scsi.ascq); 7881 break; 7882 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 7883 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 7884 break; 7885 default: 7886 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 7887 break; 7888 } 7889 } 7890 7891 struct spdk_thread * 7892 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7893 { 7894 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7895 } 7896 7897 struct spdk_io_channel * 7898 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7899 { 7900 return bdev_io->internal.ch->channel; 7901 } 7902 7903 static int 7904 bdev_register(struct spdk_bdev *bdev) 7905 { 7906 char *bdev_name; 7907 char uuid[SPDK_UUID_STRING_LEN]; 7908 struct spdk_iobuf_opts iobuf_opts; 7909 int ret; 7910 7911 assert(bdev->module != NULL); 7912 7913 if (!bdev->name) { 7914 SPDK_ERRLOG("Bdev name is NULL\n"); 7915 return -EINVAL; 7916 } 7917 7918 if (!strlen(bdev->name)) { 7919 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7920 return -EINVAL; 7921 } 7922 7923 /* Users often register their own I/O devices using the bdev name. In 7924 * order to avoid conflicts, prepend bdev_. */ 7925 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7926 if (!bdev_name) { 7927 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7928 return -ENOMEM; 7929 } 7930 7931 bdev->internal.stat = bdev_alloc_io_stat(true); 7932 if (!bdev->internal.stat) { 7933 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7934 free(bdev_name); 7935 return -ENOMEM; 7936 } 7937 7938 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7939 bdev->internal.measured_queue_depth = UINT64_MAX; 7940 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7941 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7942 bdev->internal.qd_poller = NULL; 7943 bdev->internal.qos = NULL; 7944 7945 TAILQ_INIT(&bdev->internal.open_descs); 7946 TAILQ_INIT(&bdev->internal.locked_ranges); 7947 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7948 TAILQ_INIT(&bdev->internal.queued_resets); 7949 TAILQ_INIT(&bdev->aliases); 7950 7951 /* UUID may be specified by the user or defined by bdev itself. 7952 * Otherwise it will be generated here, so this field will never be empty. 
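 * For example (hypothetical backend code; field values are illustrative and
 * other required fields are omitted), a module may either fill in bdev->uuid
 * itself or simply leave it zeroed:
 *
 *    struct spdk_bdev *bdev = &my_disk->bdev;
 *
 *    bdev->name = strdup("my_disk0");
 *    bdev->blocklen = 512;
 *    bdev->blockcnt = 8 * 1024 * 1024;
 *    bdev->fn_table = &my_disk_fn_table;
 *    bdev->module = &my_bdev_module;
 *    spdk_uuid_generate(&bdev->uuid);    // optional; spdk_bdev_register() generates one otherwise
 *    rc = spdk_bdev_register(bdev);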
*/ 7953 if (spdk_uuid_is_null(&bdev->uuid)) { 7954 spdk_uuid_generate(&bdev->uuid); 7955 } 7956 7957 /* Add the UUID alias only if it's different than the name */ 7958 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7959 if (strcmp(bdev->name, uuid) != 0) { 7960 ret = spdk_bdev_alias_add(bdev, uuid); 7961 if (ret != 0) { 7962 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7963 bdev_free_io_stat(bdev->internal.stat); 7964 free(bdev_name); 7965 return ret; 7966 } 7967 } 7968 7969 spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts)); 7970 if (spdk_bdev_get_buf_align(bdev) > 1) { 7971 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 7972 iobuf_opts.large_bufsize / bdev->blocklen); 7973 } 7974 7975 /* If the user didn't specify a write unit size, set it to one. */ 7976 if (bdev->write_unit_size == 0) { 7977 bdev->write_unit_size = 1; 7978 } 7979 7980 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7981 if (bdev->acwu == 0) { 7982 bdev->acwu = bdev->write_unit_size; 7983 } 7984 7985 if (bdev->phys_blocklen == 0) { 7986 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7987 } 7988 7989 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7990 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7991 } 7992 7993 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7994 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7995 } 7996 7997 bdev->internal.reset_in_progress = NULL; 7998 bdev->internal.qd_poll_in_progress = false; 7999 bdev->internal.period = 0; 8000 bdev->internal.new_period = 0; 8001 bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name); 8002 8003 /* 8004 * Initialize spinlock before registering IO device because spinlock is used in 8005 * bdev_channel_create 8006 */ 8007 spdk_spin_init(&bdev->internal.spinlock); 8008 8009 spdk_io_device_register(__bdev_to_io_dev(bdev), 8010 bdev_channel_create, bdev_channel_destroy, 8011 sizeof(struct spdk_bdev_channel), 8012 bdev_name); 8013 8014 /* 8015 * Register bdev name only after the bdev object is ready. 8016 * After bdev_name_add returns, it is possible for other threads to start using the bdev, 8017 * create IO channels... 
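 * In other words, publishing the name is deliberately the last setup step, so
 * the bdev is fully initialized before it becomes visible to other threads.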
8018 */ 8019 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 8020 if (ret != 0) { 8021 spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL); 8022 bdev_free_io_stat(bdev->internal.stat); 8023 spdk_spin_destroy(&bdev->internal.spinlock); 8024 free(bdev_name); 8025 return ret; 8026 } 8027 8028 free(bdev_name); 8029 8030 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 8031 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 8032 8033 return 0; 8034 } 8035 8036 static void 8037 bdev_destroy_cb(void *io_device) 8038 { 8039 int rc; 8040 struct spdk_bdev *bdev; 8041 spdk_bdev_unregister_cb cb_fn; 8042 void *cb_arg; 8043 8044 bdev = __bdev_from_io_dev(io_device); 8045 8046 if (bdev->internal.unregister_td != spdk_get_thread()) { 8047 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 8048 return; 8049 } 8050 8051 cb_fn = bdev->internal.unregister_cb; 8052 cb_arg = bdev->internal.unregister_ctx; 8053 8054 spdk_spin_destroy(&bdev->internal.spinlock); 8055 free(bdev->internal.qos); 8056 bdev_free_io_stat(bdev->internal.stat); 8057 spdk_trace_unregister_owner(bdev->internal.trace_id); 8058 8059 rc = bdev->fn_table->destruct(bdev->ctxt); 8060 if (rc < 0) { 8061 SPDK_ERRLOG("destruct failed\n"); 8062 } 8063 if (rc <= 0 && cb_fn != NULL) { 8064 cb_fn(cb_arg, rc); 8065 } 8066 } 8067 8068 void 8069 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 8070 { 8071 if (bdev->internal.unregister_cb != NULL) { 8072 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 8073 } 8074 } 8075 8076 static void 8077 _remove_notify(void *arg) 8078 { 8079 struct spdk_bdev_desc *desc = arg; 8080 8081 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 8082 } 8083 8084 /* returns: 0 - bdev removed and ready to be destructed. 8085 * -EBUSY - bdev can't be destructed yet. */ 8086 static int 8087 bdev_unregister_unsafe(struct spdk_bdev *bdev) 8088 { 8089 struct spdk_bdev_desc *desc, *tmp; 8090 struct spdk_bdev_alias *alias; 8091 int rc = 0; 8092 char uuid[SPDK_UUID_STRING_LEN]; 8093 8094 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 8095 assert(spdk_spin_held(&bdev->internal.spinlock)); 8096 8097 /* Notify each descriptor about hotremoval */ 8098 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 8099 rc = -EBUSY; 8100 /* 8101 * Defer invocation of the event_cb to a separate message that will 8102 * run later on its thread. This ensures this context unwinds and 8103 * we don't recursively unregister this bdev again if the event_cb 8104 * immediately closes its descriptor. 8105 */ 8106 event_notify(desc, _remove_notify); 8107 } 8108 8109 /* If there are no descriptors, proceed removing the bdev */ 8110 if (rc == 0) { 8111 bdev_examine_allowlist_remove(bdev->name); 8112 TAILQ_FOREACH(alias, &bdev->aliases, tailq) { 8113 bdev_examine_allowlist_remove(alias->alias.name); 8114 } 8115 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 8116 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 8117 8118 /* Delete the name and the UUID alias */ 8119 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 8120 bdev_name_del_unsafe(&bdev->internal.bdev_name); 8121 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 8122 8123 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 8124 8125 if (bdev->internal.reset_in_progress != NULL) { 8126 /* If reset is in progress, let the completion callback for reset 8127 * unregister the bdev. 
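 * bdev_reset_complete() re-checks the REMOVING status once the reset finishes
 * and performs the deferred spdk_io_device_unregister() there.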
8128 */ 8129 rc = -EBUSY; 8130 } 8131 } 8132 8133 return rc; 8134 } 8135 8136 static void 8137 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8138 struct spdk_io_channel *io_ch, void *_ctx) 8139 { 8140 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8141 8142 bdev_channel_abort_queued_ios(bdev_ch); 8143 spdk_bdev_for_each_channel_continue(i, 0); 8144 } 8145 8146 static void 8147 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 8148 { 8149 int rc; 8150 8151 spdk_spin_lock(&g_bdev_mgr.spinlock); 8152 spdk_spin_lock(&bdev->internal.spinlock); 8153 /* 8154 * Set the status to REMOVING after completing to abort channels. Otherwise, 8155 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 8156 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 8157 * may fail. 8158 */ 8159 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 8160 rc = bdev_unregister_unsafe(bdev); 8161 spdk_spin_unlock(&bdev->internal.spinlock); 8162 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8163 8164 if (rc == 0) { 8165 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8166 } 8167 } 8168 8169 void 8170 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8171 { 8172 struct spdk_thread *thread; 8173 8174 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 8175 8176 thread = spdk_get_thread(); 8177 if (!thread) { 8178 /* The user called this from a non-SPDK thread. */ 8179 if (cb_fn != NULL) { 8180 cb_fn(cb_arg, -ENOTSUP); 8181 } 8182 return; 8183 } 8184 8185 spdk_spin_lock(&g_bdev_mgr.spinlock); 8186 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8187 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8188 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8189 if (cb_fn) { 8190 cb_fn(cb_arg, -EBUSY); 8191 } 8192 return; 8193 } 8194 8195 spdk_spin_lock(&bdev->internal.spinlock); 8196 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 8197 bdev->internal.unregister_cb = cb_fn; 8198 bdev->internal.unregister_ctx = cb_arg; 8199 bdev->internal.unregister_td = thread; 8200 spdk_spin_unlock(&bdev->internal.spinlock); 8201 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8202 8203 spdk_bdev_set_qd_sampling_period(bdev, 0); 8204 8205 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 8206 bdev_unregister); 8207 } 8208 8209 int 8210 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 8211 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8212 { 8213 struct spdk_bdev_desc *desc; 8214 struct spdk_bdev *bdev; 8215 int rc; 8216 8217 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 8218 if (rc != 0) { 8219 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 8220 return rc; 8221 } 8222 8223 bdev = spdk_bdev_desc_get_bdev(desc); 8224 8225 if (bdev->module != module) { 8226 spdk_bdev_close(desc); 8227 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 8228 bdev_name); 8229 return -ENODEV; 8230 } 8231 8232 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 8233 8234 spdk_bdev_close(desc); 8235 8236 return 0; 8237 } 8238 8239 static int 8240 bdev_start_qos(struct spdk_bdev *bdev) 8241 { 8242 struct set_qos_limit_ctx *ctx; 8243 8244 /* Enable QoS */ 8245 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 8246 ctx = calloc(1, sizeof(*ctx)); 8247 if (ctx == NULL) { 8248 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 8249 return -ENOMEM; 8250 } 8251 
ctx->bdev = bdev; 8252 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 8253 } 8254 8255 return 0; 8256 } 8257 8258 static void 8259 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 8260 struct spdk_bdev *bdev) 8261 { 8262 enum spdk_bdev_claim_type type; 8263 const char *typename, *modname; 8264 extern struct spdk_log_flag SPDK_LOG_bdev; 8265 8266 assert(spdk_spin_held(&bdev->internal.spinlock)); 8267 8268 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 8269 return; 8270 } 8271 8272 type = bdev->internal.claim_type; 8273 typename = spdk_bdev_claim_get_name(type); 8274 8275 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 8276 modname = bdev->internal.claim.v1.module->name; 8277 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8278 bdev->name, detail, typename, modname); 8279 return; 8280 } 8281 8282 if (claim_type_is_v2(type)) { 8283 struct spdk_bdev_module_claim *claim; 8284 8285 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 8286 modname = claim->module->name; 8287 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8288 bdev->name, detail, typename, modname); 8289 } 8290 return; 8291 } 8292 8293 assert(false); 8294 } 8295 8296 static int 8297 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 8298 { 8299 struct spdk_thread *thread; 8300 int rc = 0; 8301 8302 thread = spdk_get_thread(); 8303 if (!thread) { 8304 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 8305 return -ENOTSUP; 8306 } 8307 8308 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8309 spdk_get_thread()); 8310 8311 desc->bdev = bdev; 8312 desc->thread = thread; 8313 desc->write = write; 8314 8315 spdk_spin_lock(&bdev->internal.spinlock); 8316 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8317 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8318 spdk_spin_unlock(&bdev->internal.spinlock); 8319 return -ENODEV; 8320 } 8321 8322 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8323 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8324 spdk_spin_unlock(&bdev->internal.spinlock); 8325 return -EPERM; 8326 } 8327 8328 rc = bdev_start_qos(bdev); 8329 if (rc != 0) { 8330 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 8331 spdk_spin_unlock(&bdev->internal.spinlock); 8332 return rc; 8333 } 8334 8335 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8336 8337 spdk_spin_unlock(&bdev->internal.spinlock); 8338 8339 return 0; 8340 } 8341 8342 static void 8343 bdev_open_opts_get_defaults(struct spdk_bdev_open_opts *opts, size_t opts_size) 8344 { 8345 if (!opts) { 8346 SPDK_ERRLOG("opts should not be NULL.\n"); 8347 return; 8348 } 8349 8350 if (!opts_size) { 8351 SPDK_ERRLOG("opts_size should not be zero.\n"); 8352 return; 8353 } 8354 8355 memset(opts, 0, opts_size); 8356 opts->size = opts_size; 8357 8358 #define FIELD_OK(field) \ 8359 offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size 8360 8361 #define SET_FIELD(field, value) \ 8362 if (FIELD_OK(field)) { \ 8363 opts->field = value; \ 8364 } \ 8365 8366 SET_FIELD(hide_metadata, false); 8367 8368 #undef FIELD_OK 8369 #undef SET_FIELD 8370 } 8371 8372 static void 8373 bdev_open_opts_copy(struct spdk_bdev_open_opts *opts, 8374 const struct spdk_bdev_open_opts *opts_src, size_t opts_size) 8375 { 8376 assert(opts); 8377 assert(opts_src); 8378 8379 #define SET_FIELD(field) \ 8380 if 
(offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size) { \ 8381 opts->field = opts_src->field; \ 8382 } \ 8383 8384 SET_FIELD(hide_metadata); 8385 8386 opts->size = opts_src->size; 8387 8388 /* We should not remove this statement, but need to update the assert statement 8389 * if we add a new field, and also add a corresponding SET_FIELD statement. 8390 */ 8391 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_opts) == 16, "Incorrect size"); 8392 8393 #undef SET_FIELD 8394 } 8395 8396 void 8397 spdk_bdev_open_opts_init(struct spdk_bdev_open_opts *opts, size_t opts_size) 8398 { 8399 struct spdk_bdev_open_opts opts_local; 8400 8401 bdev_open_opts_get_defaults(&opts_local, sizeof(opts_local)); 8402 bdev_open_opts_copy(opts, &opts_local, opts_size); 8403 } 8404 8405 static int 8406 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8407 struct spdk_bdev_open_opts *user_opts, struct spdk_bdev_desc **_desc) 8408 { 8409 struct spdk_bdev_desc *desc; 8410 struct spdk_bdev_open_opts opts; 8411 unsigned int i; 8412 8413 bdev_open_opts_get_defaults(&opts, sizeof(opts)); 8414 if (user_opts != NULL) { 8415 bdev_open_opts_copy(&opts, user_opts, user_opts->size); 8416 } 8417 8418 desc = calloc(1, sizeof(*desc)); 8419 if (desc == NULL) { 8420 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8421 return -ENOMEM; 8422 } 8423 8424 desc->opts = opts; 8425 8426 TAILQ_INIT(&desc->pending_media_events); 8427 TAILQ_INIT(&desc->free_media_events); 8428 8429 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8430 desc->callback.event_fn = event_cb; 8431 desc->callback.ctx = event_ctx; 8432 spdk_spin_init(&desc->spinlock); 8433 8434 if (desc->opts.hide_metadata) { 8435 if (spdk_bdev_is_md_separate(bdev)) { 8436 SPDK_ERRLOG("hide_metadata option is not supported with separate metadata.\n"); 8437 bdev_desc_free(desc); 8438 return -EINVAL; 8439 } 8440 } 8441 8442 if (bdev->media_events) { 8443 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8444 sizeof(*desc->media_events_buffer)); 8445 if (desc->media_events_buffer == NULL) { 8446 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8447 bdev_desc_free(desc); 8448 return -ENOMEM; 8449 } 8450 8451 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8452 TAILQ_INSERT_TAIL(&desc->free_media_events, 8453 &desc->media_events_buffer[i], tailq); 8454 } 8455 } 8456 8457 if (bdev->fn_table->accel_sequence_supported != NULL) { 8458 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8459 desc->accel_sequence_supported[i] = 8460 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8461 (enum spdk_bdev_io_type)i); 8462 } 8463 } 8464 8465 *_desc = desc; 8466 8467 return 0; 8468 } 8469 8470 static int 8471 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8472 void *event_ctx, struct spdk_bdev_open_opts *opts, 8473 struct spdk_bdev_desc **_desc) 8474 { 8475 struct spdk_bdev_desc *desc; 8476 struct spdk_bdev *bdev; 8477 int rc; 8478 8479 bdev = bdev_get_by_name(bdev_name); 8480 8481 if (bdev == NULL) { 8482 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8483 return -ENODEV; 8484 } 8485 8486 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, opts, &desc); 8487 if (rc != 0) { 8488 return rc; 8489 } 8490 8491 rc = bdev_open(bdev, write, desc); 8492 if (rc != 0) { 8493 bdev_desc_free(desc); 8494 desc = NULL; 8495 } 8496 8497 *_desc = desc; 8498 8499 return rc; 8500 } 8501 8502 int 8503 spdk_bdev_open_ext_v2(const char 
*bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8504 void *event_ctx, struct spdk_bdev_open_opts *opts, 8505 struct spdk_bdev_desc **_desc) 8506 { 8507 int rc; 8508 8509 if (event_cb == NULL) { 8510 SPDK_ERRLOG("Missing event callback function\n"); 8511 return -EINVAL; 8512 } 8513 8514 spdk_spin_lock(&g_bdev_mgr.spinlock); 8515 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, opts, _desc); 8516 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8517 8518 return rc; 8519 } 8520 8521 int 8522 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8523 void *event_ctx, struct spdk_bdev_desc **_desc) 8524 { 8525 return spdk_bdev_open_ext_v2(bdev_name, write, event_cb, event_ctx, NULL, _desc); 8526 } 8527 8528 struct spdk_bdev_open_async_ctx { 8529 char *bdev_name; 8530 spdk_bdev_event_cb_t event_cb; 8531 void *event_ctx; 8532 bool write; 8533 int rc; 8534 spdk_bdev_open_async_cb_t cb_fn; 8535 void *cb_arg; 8536 struct spdk_bdev_desc *desc; 8537 struct spdk_bdev_open_async_opts opts; 8538 uint64_t start_ticks; 8539 struct spdk_thread *orig_thread; 8540 struct spdk_poller *poller; 8541 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8542 }; 8543 8544 static void 8545 bdev_open_async_done(void *arg) 8546 { 8547 struct spdk_bdev_open_async_ctx *ctx = arg; 8548 8549 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8550 8551 free(ctx->bdev_name); 8552 free(ctx); 8553 } 8554 8555 static void 8556 bdev_open_async_cancel(void *arg) 8557 { 8558 struct spdk_bdev_open_async_ctx *ctx = arg; 8559 8560 assert(ctx->rc == -ESHUTDOWN); 8561 8562 spdk_poller_unregister(&ctx->poller); 8563 8564 bdev_open_async_done(ctx); 8565 } 8566 8567 /* This is called when the bdev library finishes at shutdown. */ 8568 static void 8569 bdev_open_async_fini(void) 8570 { 8571 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8572 8573 spdk_spin_lock(&g_bdev_mgr.spinlock); 8574 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8575 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8576 /* 8577 * We have to move to ctx->orig_thread to unregister ctx->poller. 8578 * However, there is a chance that ctx->poller is executed before 8579 * message is executed, which could result in bdev_open_async_done() 8580 * being called twice. To avoid such race condition, set ctx->rc to 8581 * -ESHUTDOWN. 8582 */ 8583 ctx->rc = -ESHUTDOWN; 8584 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8585 } 8586 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8587 } 8588 8589 static int bdev_open_async(void *arg); 8590 8591 static void 8592 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8593 { 8594 uint64_t timeout_ticks; 8595 8596 if (ctx->rc == -ESHUTDOWN) { 8597 /* This context is being canceled. Do nothing. */ 8598 return; 8599 } 8600 8601 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8602 NULL, &ctx->desc); 8603 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8604 goto exit; 8605 } 8606 8607 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8608 if (spdk_get_ticks() >= timeout_ticks) { 8609 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8610 ctx->rc = -ETIMEDOUT; 8611 goto exit; 8612 } 8613 8614 return; 8615 8616 exit: 8617 spdk_poller_unregister(&ctx->poller); 8618 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8619 8620 /* Completion callback is processed after stack unwinding. 
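 * This guarantees that open_cb is never invoked before spdk_bdev_open_async()
 * itself has returned, and that it always runs on the caller's original
 * thread. A minimal usage sketch (bdev name, timeout and callback names are
 * illustrative):
 *
 *    struct spdk_bdev_open_async_opts opts = {
 *        .size = sizeof(opts),
 *        .timeout_ms = 1000,
 *    };
 *
 *    rc = spdk_bdev_open_async("Malloc0", false, my_event_cb, NULL,
 *                              &opts, my_open_done, NULL);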
*/ 8621 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8622 } 8623 8624 static int 8625 bdev_open_async(void *arg) 8626 { 8627 struct spdk_bdev_open_async_ctx *ctx = arg; 8628 8629 spdk_spin_lock(&g_bdev_mgr.spinlock); 8630 8631 _bdev_open_async(ctx); 8632 8633 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8634 8635 return SPDK_POLLER_BUSY; 8636 } 8637 8638 static void 8639 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8640 struct spdk_bdev_open_async_opts *opts_src, 8641 size_t size) 8642 { 8643 assert(opts); 8644 assert(opts_src); 8645 8646 opts->size = size; 8647 8648 #define SET_FIELD(field) \ 8649 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8650 opts->field = opts_src->field; \ 8651 } \ 8652 8653 SET_FIELD(timeout_ms); 8654 8655 /* Do not remove this statement, you should always update this statement when you adding a new field, 8656 * and do not forget to add the SET_FIELD statement for your added field. */ 8657 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8658 8659 #undef SET_FIELD 8660 } 8661 8662 static void 8663 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8664 { 8665 assert(opts); 8666 8667 opts->size = size; 8668 8669 #define SET_FIELD(field, value) \ 8670 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8671 opts->field = value; \ 8672 } \ 8673 8674 SET_FIELD(timeout_ms, 0); 8675 8676 #undef SET_FIELD 8677 } 8678 8679 int 8680 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8681 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8682 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8683 { 8684 struct spdk_bdev_open_async_ctx *ctx; 8685 8686 if (event_cb == NULL) { 8687 SPDK_ERRLOG("Missing event callback function\n"); 8688 return -EINVAL; 8689 } 8690 8691 if (open_cb == NULL) { 8692 SPDK_ERRLOG("Missing open callback function\n"); 8693 return -EINVAL; 8694 } 8695 8696 if (opts != NULL && opts->size == 0) { 8697 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8698 return -EINVAL; 8699 } 8700 8701 ctx = calloc(1, sizeof(*ctx)); 8702 if (ctx == NULL) { 8703 SPDK_ERRLOG("Failed to allocate open context\n"); 8704 return -ENOMEM; 8705 } 8706 8707 ctx->bdev_name = strdup(bdev_name); 8708 if (ctx->bdev_name == NULL) { 8709 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8710 free(ctx); 8711 return -ENOMEM; 8712 } 8713 8714 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8715 if (ctx->poller == NULL) { 8716 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8717 free(ctx->bdev_name); 8718 free(ctx); 8719 return -ENOMEM; 8720 } 8721 8722 ctx->cb_fn = open_cb; 8723 ctx->cb_arg = open_cb_arg; 8724 ctx->write = write; 8725 ctx->event_cb = event_cb; 8726 ctx->event_ctx = event_ctx; 8727 ctx->orig_thread = spdk_get_thread(); 8728 ctx->start_ticks = spdk_get_ticks(); 8729 8730 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8731 if (opts != NULL) { 8732 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8733 } 8734 8735 spdk_spin_lock(&g_bdev_mgr.spinlock); 8736 8737 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8738 _bdev_open_async(ctx); 8739 8740 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8741 8742 return 0; 8743 } 8744 8745 static void 8746 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8747 { 8748 int rc; 8749 8750 
spdk_spin_lock(&bdev->internal.spinlock); 8751 spdk_spin_lock(&desc->spinlock); 8752 8753 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8754 8755 desc->closed = true; 8756 8757 if (desc->claim != NULL) { 8758 bdev_desc_release_claims(desc); 8759 } 8760 8761 if (0 == desc->refs) { 8762 spdk_spin_unlock(&desc->spinlock); 8763 bdev_desc_free(desc); 8764 } else { 8765 spdk_spin_unlock(&desc->spinlock); 8766 } 8767 8768 /* If no more descriptors, kill QoS channel */ 8769 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8770 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8771 bdev->name, spdk_get_thread()); 8772 8773 if (bdev_qos_destroy(bdev)) { 8774 /* There isn't anything we can do to recover here. Just let the 8775 * old QoS poller keep running. The QoS handling won't change 8776 * cores when the user allocates a new channel, but it won't break. */ 8777 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8778 } 8779 } 8780 8781 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8782 rc = bdev_unregister_unsafe(bdev); 8783 spdk_spin_unlock(&bdev->internal.spinlock); 8784 8785 if (rc == 0) { 8786 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8787 } 8788 } else { 8789 spdk_spin_unlock(&bdev->internal.spinlock); 8790 } 8791 } 8792 8793 void 8794 spdk_bdev_close(struct spdk_bdev_desc *desc) 8795 { 8796 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8797 8798 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8799 spdk_get_thread()); 8800 8801 assert(desc->thread == spdk_get_thread()); 8802 8803 spdk_poller_unregister(&desc->io_timeout_poller); 8804 8805 spdk_spin_lock(&g_bdev_mgr.spinlock); 8806 8807 bdev_close(bdev, desc); 8808 8809 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8810 } 8811 8812 int32_t 8813 spdk_bdev_get_numa_id(struct spdk_bdev *bdev) 8814 { 8815 if (bdev->numa.id_valid) { 8816 return bdev->numa.id; 8817 } else { 8818 return SPDK_ENV_NUMA_ID_ANY; 8819 } 8820 } 8821 8822 static void 8823 bdev_register_finished(void *arg) 8824 { 8825 struct spdk_bdev_desc *desc = arg; 8826 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8827 8828 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8829 8830 spdk_spin_lock(&g_bdev_mgr.spinlock); 8831 8832 bdev_close(bdev, desc); 8833 8834 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8835 } 8836 8837 int 8838 spdk_bdev_register(struct spdk_bdev *bdev) 8839 { 8840 struct spdk_bdev_desc *desc; 8841 struct spdk_thread *thread = spdk_get_thread(); 8842 int rc; 8843 8844 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8845 SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread, 8846 thread ? 
spdk_thread_get_name(thread) : "null"); 8847 return -EINVAL; 8848 } 8849 8850 rc = bdev_register(bdev); 8851 if (rc != 0) { 8852 return rc; 8853 } 8854 8855 /* A descriptor is opened to prevent bdev deletion during examination */ 8856 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 8857 if (rc != 0) { 8858 spdk_bdev_unregister(bdev, NULL, NULL); 8859 return rc; 8860 } 8861 8862 rc = bdev_open(bdev, false, desc); 8863 if (rc != 0) { 8864 bdev_desc_free(desc); 8865 spdk_bdev_unregister(bdev, NULL, NULL); 8866 return rc; 8867 } 8868 8869 /* Examine configuration before initializing I/O */ 8870 bdev_examine(bdev); 8871 8872 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8873 if (rc != 0) { 8874 bdev_close(bdev, desc); 8875 spdk_bdev_unregister(bdev, NULL, NULL); 8876 } 8877 8878 return rc; 8879 } 8880 8881 int 8882 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8883 struct spdk_bdev_module *module) 8884 { 8885 spdk_spin_lock(&bdev->internal.spinlock); 8886 8887 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8888 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8889 spdk_spin_unlock(&bdev->internal.spinlock); 8890 return -EPERM; 8891 } 8892 8893 if (desc && !desc->write) { 8894 desc->write = true; 8895 } 8896 8897 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8898 bdev->internal.claim.v1.module = module; 8899 8900 spdk_spin_unlock(&bdev->internal.spinlock); 8901 return 0; 8902 } 8903 8904 void 8905 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8906 { 8907 spdk_spin_lock(&bdev->internal.spinlock); 8908 8909 assert(bdev->internal.claim.v1.module != NULL); 8910 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8911 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8912 bdev->internal.claim.v1.module = NULL; 8913 8914 spdk_spin_unlock(&bdev->internal.spinlock); 8915 } 8916 8917 /* 8918 * Start claims v2 8919 */ 8920 8921 const char * 8922 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8923 { 8924 switch (type) { 8925 case SPDK_BDEV_CLAIM_NONE: 8926 return "not_claimed"; 8927 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8928 return "exclusive_write"; 8929 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8930 return "read_many_write_one"; 8931 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8932 return "read_many_write_none"; 8933 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8934 return "read_many_write_many"; 8935 default: 8936 break; 8937 } 8938 return "invalid_claim"; 8939 } 8940 8941 static bool 8942 claim_type_is_v2(enum spdk_bdev_claim_type type) 8943 { 8944 switch (type) { 8945 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8946 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8947 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8948 return true; 8949 default: 8950 break; 8951 } 8952 return false; 8953 } 8954 8955 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
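 * For example (hypothetical module code), a descriptor that was opened
 * read-only becomes usable for writes once such a claim is granted:
 *
 *    struct spdk_bdev_claim_opts opts;
 *
 *    spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *    rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE,
 *                                          &opts, &my_bdev_module);
 *    // on success the descriptor is treated as writable from here on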
*/ 8956 static bool 8957 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8958 { 8959 switch (type) { 8960 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8961 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8962 return true; 8963 default: 8964 break; 8965 } 8966 return false; 8967 } 8968 8969 void 8970 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8971 { 8972 if (opts == NULL) { 8973 SPDK_ERRLOG("opts should not be NULL\n"); 8974 assert(opts != NULL); 8975 return; 8976 } 8977 if (size == 0) { 8978 SPDK_ERRLOG("size should not be zero\n"); 8979 assert(size != 0); 8980 return; 8981 } 8982 8983 memset(opts, 0, size); 8984 opts->opts_size = size; 8985 8986 #define FIELD_OK(field) \ 8987 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8988 8989 #define SET_FIELD(field, value) \ 8990 if (FIELD_OK(field)) { \ 8991 opts->field = value; \ 8992 } \ 8993 8994 SET_FIELD(shared_claim_key, 0); 8995 8996 #undef FIELD_OK 8997 #undef SET_FIELD 8998 } 8999 9000 static int 9001 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 9002 { 9003 if (src->opts_size == 0) { 9004 SPDK_ERRLOG("size should not be zero\n"); 9005 return -1; 9006 } 9007 9008 memset(dst, 0, sizeof(*dst)); 9009 dst->opts_size = src->opts_size; 9010 9011 #define FIELD_OK(field) \ 9012 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 9013 9014 #define SET_FIELD(field) \ 9015 if (FIELD_OK(field)) { \ 9016 dst->field = src->field; \ 9017 } \ 9018 9019 if (FIELD_OK(name)) { 9020 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 9021 } 9022 9023 SET_FIELD(shared_claim_key); 9024 9025 /* You should not remove this statement, but need to update the assert statement 9026 * if you add a new field, and also add a corresponding SET_FIELD statement */ 9027 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 9028 9029 #undef FIELD_OK 9030 #undef SET_FIELD 9031 return 0; 9032 } 9033 9034 /* Returns 0 if a read-write-once claim can be taken. */ 9035 static int 9036 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9037 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9038 { 9039 struct spdk_bdev *bdev = desc->bdev; 9040 struct spdk_bdev_desc *open_desc; 9041 9042 assert(spdk_spin_held(&bdev->internal.spinlock)); 9043 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 9044 9045 if (opts->shared_claim_key != 0) { 9046 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 9047 bdev->name); 9048 return -EINVAL; 9049 } 9050 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 9051 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9052 return -EPERM; 9053 } 9054 if (desc->claim != NULL) { 9055 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 9056 bdev->name, desc->claim->module->name); 9057 return -EPERM; 9058 } 9059 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 9060 if (desc != open_desc && open_desc->write) { 9061 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 9062 "another descriptor is open for writing\n", 9063 bdev->name); 9064 return -EPERM; 9065 } 9066 } 9067 9068 return 0; 9069 } 9070 9071 /* Returns 0 if a read-only-many claim can be taken. 
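 * The descriptor must have been opened read-only, shared_claim_key must not be
 * set, and, if this is the first claim on the bdev, no other descriptor may
 * currently be open for writing.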
*/
9072 static int
9073 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9074 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
9075 {
9076 struct spdk_bdev *bdev = desc->bdev;
9077 struct spdk_bdev_desc *open_desc;
9078
9079 assert(spdk_spin_held(&bdev->internal.spinlock));
9080 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
9081 assert(desc->claim == NULL);
9082
9083 if (desc->write) {
9084 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
9085 bdev->name);
9086 return -EINVAL;
9087 }
9088 if (opts->shared_claim_key != 0) {
9089 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
9090 return -EINVAL;
9091 }
9092 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
9093 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
9094 if (open_desc->write) {
9095 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
9096 "another descriptor is open for writing\n",
9097 bdev->name);
9098 return -EPERM;
9099 }
9100 }
9101 }
9102
9103 return 0;
9104 }
9105
9106 /* Returns 0 if a read-write-many claim can be taken. */
9107 static int
9108 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9109 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
9110 {
9111 struct spdk_bdev *bdev = desc->bdev;
9112 struct spdk_bdev_desc *open_desc;
9113
9114 assert(spdk_spin_held(&bdev->internal.spinlock));
9115 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
9116 assert(desc->claim == NULL);
9117
9118 if (opts->shared_claim_key == 0) {
9119 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
9120 bdev->name);
9121 return -EINVAL;
9122 }
9123 switch (bdev->internal.claim_type) {
9124 case SPDK_BDEV_CLAIM_NONE:
9125 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
9126 if (open_desc == desc) {
9127 continue;
9128 }
9129 if (open_desc->write) {
9130 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
9131 "another descriptor is open for writing without a "
9132 "claim\n", bdev->name);
9133 return -EPERM;
9134 }
9135 }
9136 break;
9137 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9138 if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
9139 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
9140 return -EPERM;
9141 }
9142 break;
9143 default:
9144 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
9145 return -EBUSY;
9146 }
9147
9148 return 0;
9149 }
9150
9151 /* Updates desc and its bdev with a v2 claim.
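 * Allocates a struct spdk_bdev_module_claim, links it into
 * bdev->internal.claim.v2.claims, and, for claim types that allow writes,
 * promotes the descriptor to writable.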
*/ 9152 static int 9153 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9154 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9155 { 9156 struct spdk_bdev *bdev = desc->bdev; 9157 struct spdk_bdev_module_claim *claim; 9158 9159 assert(spdk_spin_held(&bdev->internal.spinlock)); 9160 assert(claim_type_is_v2(type)); 9161 assert(desc->claim == NULL); 9162 9163 claim = calloc(1, sizeof(*desc->claim)); 9164 if (claim == NULL) { 9165 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 9166 return -ENOMEM; 9167 } 9168 claim->module = module; 9169 claim->desc = desc; 9170 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 9171 memcpy(claim->name, opts->name, sizeof(claim->name)); 9172 desc->claim = claim; 9173 9174 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 9175 bdev->internal.claim_type = type; 9176 TAILQ_INIT(&bdev->internal.claim.v2.claims); 9177 bdev->internal.claim.v2.key = opts->shared_claim_key; 9178 } 9179 assert(type == bdev->internal.claim_type); 9180 9181 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 9182 9183 if (!desc->write && claim_type_promotes_to_write(type)) { 9184 desc->write = true; 9185 } 9186 9187 return 0; 9188 } 9189 9190 int 9191 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9192 struct spdk_bdev_claim_opts *_opts, 9193 struct spdk_bdev_module *module) 9194 { 9195 struct spdk_bdev *bdev; 9196 struct spdk_bdev_claim_opts opts; 9197 int rc = 0; 9198 9199 if (desc == NULL) { 9200 SPDK_ERRLOG("descriptor must not be NULL\n"); 9201 return -EINVAL; 9202 } 9203 9204 bdev = desc->bdev; 9205 9206 if (_opts == NULL) { 9207 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 9208 } else if (claim_opts_copy(_opts, &opts) != 0) { 9209 return -EINVAL; 9210 } 9211 9212 spdk_spin_lock(&bdev->internal.spinlock); 9213 9214 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 9215 bdev->internal.claim_type != type) { 9216 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9217 spdk_spin_unlock(&bdev->internal.spinlock); 9218 return -EPERM; 9219 } 9220 9221 if (claim_type_is_v2(type) && desc->claim != NULL) { 9222 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 9223 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 9224 spdk_spin_unlock(&bdev->internal.spinlock); 9225 return -EPERM; 9226 } 9227 9228 switch (type) { 9229 case SPDK_BDEV_CLAIM_EXCL_WRITE: 9230 spdk_spin_unlock(&bdev->internal.spinlock); 9231 return spdk_bdev_module_claim_bdev(bdev, desc, module); 9232 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9233 rc = claim_verify_rwo(desc, type, &opts, module); 9234 break; 9235 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9236 rc = claim_verify_rom(desc, type, &opts, module); 9237 break; 9238 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9239 rc = claim_verify_rwm(desc, type, &opts, module); 9240 break; 9241 default: 9242 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 9243 rc = -ENOTSUP; 9244 } 9245 9246 if (rc == 0) { 9247 rc = claim_bdev(desc, type, &opts, module); 9248 } 9249 9250 spdk_spin_unlock(&bdev->internal.spinlock); 9251 return rc; 9252 } 9253 9254 static void 9255 claim_reset(struct spdk_bdev *bdev) 9256 { 9257 assert(spdk_spin_held(&bdev->internal.spinlock)); 9258 assert(claim_type_is_v2(bdev->internal.claim_type)); 9259 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 9260 9261 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 9262 
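/* With the v2 claim list already empty, resetting claim_type below leaves the
 * bdev unclaimed and eligible for a new claim of any type.
 */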
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 9263 } 9264 9265 static void 9266 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 9267 { 9268 struct spdk_bdev *bdev = desc->bdev; 9269 9270 assert(spdk_spin_held(&bdev->internal.spinlock)); 9271 assert(claim_type_is_v2(bdev->internal.claim_type)); 9272 9273 if (bdev->internal.examine_in_progress == 0) { 9274 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 9275 free(desc->claim); 9276 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 9277 claim_reset(bdev); 9278 } 9279 } else { 9280 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 9281 desc->claim->module = NULL; 9282 desc->claim->desc = NULL; 9283 } 9284 desc->claim = NULL; 9285 } 9286 9287 /* 9288 * End claims v2 9289 */ 9290 9291 struct spdk_bdev * 9292 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 9293 { 9294 assert(desc != NULL); 9295 return desc->bdev; 9296 } 9297 9298 int 9299 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 9300 { 9301 struct spdk_bdev *bdev, *tmp; 9302 struct spdk_bdev_desc *desc; 9303 int rc = 0; 9304 9305 assert(fn != NULL); 9306 9307 spdk_spin_lock(&g_bdev_mgr.spinlock); 9308 bdev = spdk_bdev_first(); 9309 while (bdev != NULL) { 9310 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9311 if (rc != 0) { 9312 break; 9313 } 9314 rc = bdev_open(bdev, false, desc); 9315 if (rc != 0) { 9316 bdev_desc_free(desc); 9317 if (rc == -ENODEV) { 9318 /* Ignore the error and move to the next bdev. */ 9319 rc = 0; 9320 bdev = spdk_bdev_next(bdev); 9321 continue; 9322 } 9323 break; 9324 } 9325 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9326 9327 rc = fn(ctx, bdev); 9328 9329 spdk_spin_lock(&g_bdev_mgr.spinlock); 9330 tmp = spdk_bdev_next(bdev); 9331 bdev_close(bdev, desc); 9332 if (rc != 0) { 9333 break; 9334 } 9335 bdev = tmp; 9336 } 9337 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9338 9339 return rc; 9340 } 9341 9342 int 9343 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 9344 { 9345 struct spdk_bdev *bdev, *tmp; 9346 struct spdk_bdev_desc *desc; 9347 int rc = 0; 9348 9349 assert(fn != NULL); 9350 9351 spdk_spin_lock(&g_bdev_mgr.spinlock); 9352 bdev = spdk_bdev_first_leaf(); 9353 while (bdev != NULL) { 9354 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9355 if (rc != 0) { 9356 break; 9357 } 9358 rc = bdev_open(bdev, false, desc); 9359 if (rc != 0) { 9360 bdev_desc_free(desc); 9361 if (rc == -ENODEV) { 9362 /* Ignore the error and move to the next bdev. 
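 * -ENODEV typically indicates that the bdev began unregistering after the
 * iterator picked it up, so it is skipped rather than failing the whole
 * iteration.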
*/ 9363 rc = 0; 9364 bdev = spdk_bdev_next_leaf(bdev); 9365 continue; 9366 } 9367 break; 9368 } 9369 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9370 9371 rc = fn(ctx, bdev); 9372 9373 spdk_spin_lock(&g_bdev_mgr.spinlock); 9374 tmp = spdk_bdev_next_leaf(bdev); 9375 bdev_close(bdev, desc); 9376 if (rc != 0) { 9377 break; 9378 } 9379 bdev = tmp; 9380 } 9381 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9382 9383 return rc; 9384 } 9385 9386 void 9387 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 9388 { 9389 struct iovec *iovs; 9390 int iovcnt; 9391 9392 if (bdev_io == NULL) { 9393 return; 9394 } 9395 9396 switch (bdev_io->type) { 9397 case SPDK_BDEV_IO_TYPE_READ: 9398 case SPDK_BDEV_IO_TYPE_WRITE: 9399 case SPDK_BDEV_IO_TYPE_ZCOPY: 9400 iovs = bdev_io->u.bdev.iovs; 9401 iovcnt = bdev_io->u.bdev.iovcnt; 9402 break; 9403 default: 9404 iovs = NULL; 9405 iovcnt = 0; 9406 break; 9407 } 9408 9409 if (iovp) { 9410 *iovp = iovs; 9411 } 9412 if (iovcntp) { 9413 *iovcntp = iovcnt; 9414 } 9415 } 9416 9417 void * 9418 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 9419 { 9420 if (bdev_io == NULL) { 9421 return NULL; 9422 } 9423 9424 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 9425 return NULL; 9426 } 9427 9428 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 9429 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 9430 return bdev_io->u.bdev.md_buf; 9431 } 9432 9433 return NULL; 9434 } 9435 9436 void * 9437 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9438 { 9439 if (bdev_io == NULL) { 9440 assert(false); 9441 return NULL; 9442 } 9443 9444 return bdev_io->internal.caller_ctx; 9445 } 9446 9447 void 9448 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9449 { 9450 9451 if (spdk_bdev_module_list_find(bdev_module->name)) { 9452 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9453 assert(false); 9454 } 9455 9456 spdk_spin_init(&bdev_module->internal.spinlock); 9457 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9458 9459 /* 9460 * Modules with examine callbacks must be initialized first, so they are 9461 * ready to handle examine callbacks from later modules that will 9462 * register physical bdevs. 
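 * Modules normally reach this function through the SPDK_BDEV_MODULE_REGISTER()
 * constructor macro rather than by calling it directly; a rough sketch
 * (my_module_init and my_examine_config are placeholders):
 *
 *     static struct spdk_bdev_module my_if = {
 *             .name = "my_module",
 *             .module_init = my_module_init,
 *             .examine_config = my_examine_config,
 *     };
 *     SPDK_BDEV_MODULE_REGISTER(my_module, &my_if)
 *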
9463 */ 9464 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9465 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9466 } else { 9467 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9468 } 9469 } 9470 9471 struct spdk_bdev_module * 9472 spdk_bdev_module_list_find(const char *name) 9473 { 9474 struct spdk_bdev_module *bdev_module; 9475 9476 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9477 if (strcmp(name, bdev_module->name) == 0) { 9478 break; 9479 } 9480 } 9481 9482 return bdev_module; 9483 } 9484 9485 static int 9486 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9487 { 9488 uint64_t num_blocks; 9489 void *md_buf = NULL; 9490 9491 num_blocks = bdev_io->u.bdev.num_blocks; 9492 9493 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9494 md_buf = (char *)g_bdev_mgr.zero_buffer + 9495 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9496 } 9497 9498 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9499 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9500 g_bdev_mgr.zero_buffer, md_buf, 9501 bdev_io->u.bdev.offset_blocks, num_blocks, 9502 bdev_write_zero_buffer_done, bdev_io); 9503 } 9504 9505 static void 9506 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9507 { 9508 struct spdk_bdev_io *parent_io = cb_arg; 9509 9510 spdk_bdev_free_io(bdev_io); 9511 9512 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9513 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9514 } 9515 9516 static void 9517 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9518 { 9519 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9520 ctx->bdev->internal.qos_mod_in_progress = false; 9521 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9522 9523 if (ctx->cb_fn) { 9524 ctx->cb_fn(ctx->cb_arg, status); 9525 } 9526 free(ctx); 9527 } 9528 9529 static void 9530 bdev_disable_qos_done(void *cb_arg) 9531 { 9532 struct set_qos_limit_ctx *ctx = cb_arg; 9533 struct spdk_bdev *bdev = ctx->bdev; 9534 struct spdk_bdev_qos *qos; 9535 9536 spdk_spin_lock(&bdev->internal.spinlock); 9537 qos = bdev->internal.qos; 9538 bdev->internal.qos = NULL; 9539 spdk_spin_unlock(&bdev->internal.spinlock); 9540 9541 if (qos->thread != NULL) { 9542 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9543 spdk_poller_unregister(&qos->poller); 9544 } 9545 9546 free(qos); 9547 9548 bdev_set_qos_limit_done(ctx, 0); 9549 } 9550 9551 static void 9552 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9553 { 9554 struct set_qos_limit_ctx *ctx = _ctx; 9555 struct spdk_thread *thread; 9556 9557 spdk_spin_lock(&bdev->internal.spinlock); 9558 thread = bdev->internal.qos->thread; 9559 spdk_spin_unlock(&bdev->internal.spinlock); 9560 9561 if (thread != NULL) { 9562 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9563 } else { 9564 bdev_disable_qos_done(ctx); 9565 } 9566 } 9567 9568 static void 9569 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9570 struct spdk_io_channel *ch, void *_ctx) 9571 { 9572 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9573 struct spdk_bdev_io *bdev_io; 9574 9575 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9576 9577 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9578 /* Re-submit the queued I/O. 
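 * Because BDEV_CH_QOS_ENABLED was cleared above, the re-submitted I/O is no
 * longer routed through the QoS queue.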
*/ 9579 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9580 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9581 _bdev_io_submit(bdev_io); 9582 } 9583 9584 spdk_bdev_for_each_channel_continue(i, 0); 9585 } 9586 9587 static void 9588 bdev_update_qos_rate_limit_msg(void *cb_arg) 9589 { 9590 struct set_qos_limit_ctx *ctx = cb_arg; 9591 struct spdk_bdev *bdev = ctx->bdev; 9592 9593 spdk_spin_lock(&bdev->internal.spinlock); 9594 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9595 spdk_spin_unlock(&bdev->internal.spinlock); 9596 9597 bdev_set_qos_limit_done(ctx, 0); 9598 } 9599 9600 static void 9601 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9602 struct spdk_io_channel *ch, void *_ctx) 9603 { 9604 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9605 9606 spdk_spin_lock(&bdev->internal.spinlock); 9607 bdev_enable_qos(bdev, bdev_ch); 9608 spdk_spin_unlock(&bdev->internal.spinlock); 9609 spdk_bdev_for_each_channel_continue(i, 0); 9610 } 9611 9612 static void 9613 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9614 { 9615 struct set_qos_limit_ctx *ctx = _ctx; 9616 9617 bdev_set_qos_limit_done(ctx, status); 9618 } 9619 9620 static void 9621 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9622 { 9623 int i; 9624 9625 assert(bdev->internal.qos != NULL); 9626 9627 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9628 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9629 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9630 9631 if (limits[i] == 0) { 9632 bdev->internal.qos->rate_limits[i].limit = 9633 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9634 } 9635 } 9636 } 9637 } 9638 9639 void 9640 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9641 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9642 { 9643 struct set_qos_limit_ctx *ctx; 9644 uint32_t limit_set_complement; 9645 uint64_t min_limit_per_sec; 9646 int i; 9647 bool disable_rate_limit = true; 9648 9649 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9650 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9651 continue; 9652 } 9653 9654 if (limits[i] > 0) { 9655 disable_rate_limit = false; 9656 } 9657 9658 if (bdev_qos_is_iops_rate_limit(i) == true) { 9659 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9660 } else { 9661 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9662 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9663 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9664 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9665 } 9666 /* Change from megabyte to byte rate limit */ 9667 limits[i] = limits[i] * 1024 * 1024; 9668 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9669 } 9670 9671 limit_set_complement = limits[i] % min_limit_per_sec; 9672 if (limit_set_complement) { 9673 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9674 limits[i], min_limit_per_sec); 9675 limits[i] += min_limit_per_sec - limit_set_complement; 9676 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9677 } 9678 } 9679 9680 ctx = calloc(1, sizeof(*ctx)); 9681 if (ctx == NULL) { 9682 cb_fn(cb_arg, -ENOMEM); 9683 return; 9684 } 9685 9686 ctx->cb_fn = cb_fn; 9687 ctx->cb_arg = cb_arg; 9688 ctx->bdev = bdev; 9689 9690 spdk_spin_lock(&bdev->internal.spinlock); 9691 if (bdev->internal.qos_mod_in_progress) { 9692 spdk_spin_unlock(&bdev->internal.spinlock); 9693 free(ctx); 9694 cb_fn(cb_arg, 
-EAGAIN); 9695 return; 9696 } 9697 bdev->internal.qos_mod_in_progress = true; 9698 9699 if (disable_rate_limit == true && bdev->internal.qos) { 9700 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9701 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9702 (bdev->internal.qos->rate_limits[i].limit > 0 && 9703 bdev->internal.qos->rate_limits[i].limit != 9704 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9705 disable_rate_limit = false; 9706 break; 9707 } 9708 } 9709 } 9710 9711 if (disable_rate_limit == false) { 9712 if (bdev->internal.qos == NULL) { 9713 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9714 if (!bdev->internal.qos) { 9715 spdk_spin_unlock(&bdev->internal.spinlock); 9716 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9717 bdev_set_qos_limit_done(ctx, -ENOMEM); 9718 return; 9719 } 9720 } 9721 9722 if (bdev->internal.qos->thread == NULL) { 9723 /* Enabling */ 9724 bdev_set_qos_rate_limits(bdev, limits); 9725 9726 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9727 bdev_enable_qos_done); 9728 } else { 9729 /* Updating */ 9730 bdev_set_qos_rate_limits(bdev, limits); 9731 9732 spdk_thread_send_msg(bdev->internal.qos->thread, 9733 bdev_update_qos_rate_limit_msg, ctx); 9734 } 9735 } else { 9736 if (bdev->internal.qos != NULL) { 9737 bdev_set_qos_rate_limits(bdev, limits); 9738 9739 /* Disabling */ 9740 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9741 bdev_disable_qos_msg_done); 9742 } else { 9743 spdk_spin_unlock(&bdev->internal.spinlock); 9744 bdev_set_qos_limit_done(ctx, 0); 9745 return; 9746 } 9747 } 9748 9749 spdk_spin_unlock(&bdev->internal.spinlock); 9750 } 9751 9752 struct spdk_bdev_histogram_ctx { 9753 spdk_bdev_histogram_status_cb cb_fn; 9754 void *cb_arg; 9755 struct spdk_bdev *bdev; 9756 int status; 9757 }; 9758 9759 static void 9760 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9761 { 9762 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9763 9764 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9765 ctx->bdev->internal.histogram_in_progress = false; 9766 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9767 ctx->cb_fn(ctx->cb_arg, ctx->status); 9768 free(ctx); 9769 } 9770 9771 static void 9772 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9773 struct spdk_io_channel *_ch, void *_ctx) 9774 { 9775 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9776 9777 if (ch->histogram != NULL) { 9778 spdk_histogram_data_free(ch->histogram); 9779 ch->histogram = NULL; 9780 } 9781 spdk_bdev_for_each_channel_continue(i, 0); 9782 } 9783 9784 static void 9785 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9786 { 9787 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9788 9789 if (status != 0) { 9790 ctx->status = status; 9791 ctx->bdev->internal.histogram_enabled = false; 9792 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9793 bdev_histogram_disable_channel_cb); 9794 } else { 9795 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9796 ctx->bdev->internal.histogram_in_progress = false; 9797 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9798 ctx->cb_fn(ctx->cb_arg, ctx->status); 9799 free(ctx); 9800 } 9801 } 9802 9803 static void 9804 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9805 struct spdk_io_channel *_ch, void *_ctx) 9806 { 9807 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9808 int status = 0; 9809 9810 if (ch->histogram == NULL) { 9811 
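/* Histogram data is kept per channel; spdk_bdev_histogram_get() merges the
 * per-channel histograms on demand.
 */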
ch->histogram = spdk_histogram_data_alloc(); 9812 if (ch->histogram == NULL) { 9813 status = -ENOMEM; 9814 } 9815 } 9816 9817 spdk_bdev_for_each_channel_continue(i, status); 9818 } 9819 9820 void 9821 spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9822 void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts) 9823 { 9824 struct spdk_bdev_histogram_ctx *ctx; 9825 9826 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9827 if (ctx == NULL) { 9828 cb_fn(cb_arg, -ENOMEM); 9829 return; 9830 } 9831 9832 ctx->bdev = bdev; 9833 ctx->status = 0; 9834 ctx->cb_fn = cb_fn; 9835 ctx->cb_arg = cb_arg; 9836 9837 spdk_spin_lock(&bdev->internal.spinlock); 9838 if (bdev->internal.histogram_in_progress) { 9839 spdk_spin_unlock(&bdev->internal.spinlock); 9840 free(ctx); 9841 cb_fn(cb_arg, -EAGAIN); 9842 return; 9843 } 9844 9845 bdev->internal.histogram_in_progress = true; 9846 spdk_spin_unlock(&bdev->internal.spinlock); 9847 9848 bdev->internal.histogram_enabled = enable; 9849 bdev->internal.histogram_io_type = opts->io_type; 9850 9851 if (enable) { 9852 /* Allocate histogram for each channel */ 9853 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9854 bdev_histogram_enable_channel_cb); 9855 } else { 9856 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9857 bdev_histogram_disable_channel_cb); 9858 } 9859 } 9860 9861 void 9862 spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size) 9863 { 9864 if (opts == NULL) { 9865 SPDK_ERRLOG("opts should not be NULL\n"); 9866 assert(opts != NULL); 9867 return; 9868 } 9869 if (size == 0) { 9870 SPDK_ERRLOG("size should not be zero\n"); 9871 assert(size != 0); 9872 return; 9873 } 9874 9875 memset(opts, 0, size); 9876 opts->size = size; 9877 9878 #define FIELD_OK(field) \ 9879 offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size 9880 9881 #define SET_FIELD(field, value) \ 9882 if (FIELD_OK(field)) { \ 9883 opts->field = value; \ 9884 } \ 9885 9886 SET_FIELD(io_type, 0); 9887 9888 /* You should not remove this statement, but need to update the assert statement 9889 * if you add a new field, and also add a corresponding SET_FIELD statement */ 9890 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size"); 9891 9892 #undef FIELD_OK 9893 #undef SET_FIELD 9894 } 9895 9896 void 9897 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9898 void *cb_arg, bool enable) 9899 { 9900 struct spdk_bdev_enable_histogram_opts opts; 9901 9902 spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts)); 9903 spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts); 9904 } 9905 9906 struct spdk_bdev_histogram_data_ctx { 9907 spdk_bdev_histogram_data_cb cb_fn; 9908 void *cb_arg; 9909 struct spdk_bdev *bdev; 9910 /** merged histogram data from all channels */ 9911 struct spdk_histogram_data *histogram; 9912 }; 9913 9914 static void 9915 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9916 { 9917 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9918 9919 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9920 free(ctx); 9921 } 9922 9923 static void 9924 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9925 struct spdk_io_channel *_ch, void *_ctx) 9926 { 9927 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9928 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9929 int 
status = 0; 9930 9931 if (ch->histogram == NULL) { 9932 status = -EFAULT; 9933 } else { 9934 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9935 } 9936 9937 spdk_bdev_for_each_channel_continue(i, status); 9938 } 9939 9940 void 9941 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9942 spdk_bdev_histogram_data_cb cb_fn, 9943 void *cb_arg) 9944 { 9945 struct spdk_bdev_histogram_data_ctx *ctx; 9946 9947 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9948 if (ctx == NULL) { 9949 cb_fn(cb_arg, -ENOMEM, NULL); 9950 return; 9951 } 9952 9953 ctx->bdev = bdev; 9954 ctx->cb_fn = cb_fn; 9955 ctx->cb_arg = cb_arg; 9956 9957 ctx->histogram = histogram; 9958 9959 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9960 bdev_histogram_get_channel_cb); 9961 } 9962 9963 void 9964 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9965 void *cb_arg) 9966 { 9967 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9968 int status = 0; 9969 9970 assert(cb_fn != NULL); 9971 9972 if (bdev_ch->histogram == NULL) { 9973 status = -EFAULT; 9974 } 9975 cb_fn(cb_arg, status, bdev_ch->histogram); 9976 } 9977 9978 size_t 9979 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9980 size_t max_events) 9981 { 9982 struct media_event_entry *entry; 9983 size_t num_events = 0; 9984 9985 for (; num_events < max_events; ++num_events) { 9986 entry = TAILQ_FIRST(&desc->pending_media_events); 9987 if (entry == NULL) { 9988 break; 9989 } 9990 9991 events[num_events] = entry->event; 9992 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9993 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9994 } 9995 9996 return num_events; 9997 } 9998 9999 int 10000 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 10001 size_t num_events) 10002 { 10003 struct spdk_bdev_desc *desc; 10004 struct media_event_entry *entry; 10005 size_t event_id; 10006 int rc = 0; 10007 10008 assert(bdev->media_events); 10009 10010 spdk_spin_lock(&bdev->internal.spinlock); 10011 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 10012 if (desc->write) { 10013 break; 10014 } 10015 } 10016 10017 if (desc == NULL || desc->media_events_buffer == NULL) { 10018 rc = -ENODEV; 10019 goto out; 10020 } 10021 10022 for (event_id = 0; event_id < num_events; ++event_id) { 10023 entry = TAILQ_FIRST(&desc->free_media_events); 10024 if (entry == NULL) { 10025 break; 10026 } 10027 10028 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 10029 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 10030 entry->event = events[event_id]; 10031 } 10032 10033 rc = event_id; 10034 out: 10035 spdk_spin_unlock(&bdev->internal.spinlock); 10036 return rc; 10037 } 10038 10039 static void 10040 _media_management_notify(void *arg) 10041 { 10042 struct spdk_bdev_desc *desc = arg; 10043 10044 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 10045 } 10046 10047 void 10048 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 10049 { 10050 struct spdk_bdev_desc *desc; 10051 10052 spdk_spin_lock(&bdev->internal.spinlock); 10053 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 10054 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 10055 event_notify(desc, _media_management_notify); 10056 } 10057 } 10058 spdk_spin_unlock(&bdev->internal.spinlock); 10059 } 10060 10061 struct locked_lba_range_ctx { 10062 struct lba_range range; 10063 struct 
lba_range *current_range; 10064 struct lba_range *owner_range; 10065 struct spdk_poller *poller; 10066 lock_range_cb cb_fn; 10067 void *cb_arg; 10068 }; 10069 10070 static void 10071 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10072 { 10073 struct locked_lba_range_ctx *ctx = _ctx; 10074 10075 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 10076 free(ctx); 10077 } 10078 10079 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 10080 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 10081 10082 static void 10083 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10084 { 10085 struct locked_lba_range_ctx *ctx = _ctx; 10086 10087 if (status == -ENOMEM) { 10088 /* One of the channels could not allocate a range object. 10089 * So we have to go back and clean up any ranges that were 10090 * allocated successfully before we return error status to 10091 * the caller. We can reuse the unlock function to do that 10092 * clean up. 10093 */ 10094 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10095 bdev_lock_error_cleanup_cb); 10096 return; 10097 } 10098 10099 /* All channels have locked this range and no I/O overlapping the range 10100 * are outstanding! Set the owner_ch for the range object for the 10101 * locking channel, so that this channel will know that it is allowed 10102 * to write to this range. 10103 */ 10104 if (ctx->owner_range != NULL) { 10105 ctx->owner_range->owner_ch = ctx->range.owner_ch; 10106 } 10107 10108 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10109 10110 /* Don't free the ctx here. Its range is in the bdev's global list of 10111 * locked ranges still, and will be removed and freed when this range 10112 * is later unlocked. 10113 */ 10114 } 10115 10116 static int 10117 bdev_lock_lba_range_check_io(void *_i) 10118 { 10119 struct spdk_bdev_channel_iter *i = _i; 10120 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 10121 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10122 struct locked_lba_range_ctx *ctx = i->ctx; 10123 struct lba_range *range = ctx->current_range; 10124 struct spdk_bdev_io *bdev_io; 10125 10126 spdk_poller_unregister(&ctx->poller); 10127 10128 /* The range is now in the locked_ranges, so no new IO can be submitted to this 10129 * range. But we need to wait until any outstanding IO overlapping with this range 10130 * are completed. 10131 */ 10132 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 10133 if (bdev_io_range_is_locked(bdev_io, range)) { 10134 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 10135 return SPDK_POLLER_BUSY; 10136 } 10137 } 10138 10139 spdk_bdev_for_each_channel_continue(i, 0); 10140 return SPDK_POLLER_BUSY; 10141 } 10142 10143 static void 10144 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10145 struct spdk_io_channel *_ch, void *_ctx) 10146 { 10147 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10148 struct locked_lba_range_ctx *ctx = _ctx; 10149 struct lba_range *range; 10150 10151 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10152 if (range->length == ctx->range.length && 10153 range->offset == ctx->range.offset && 10154 range->locked_ctx == ctx->range.locked_ctx) { 10155 /* This range already exists on this channel, so don't add 10156 * it again. This can happen when a new channel is created 10157 * while the for_each_channel operation is in progress. 
10158 * Do not check for outstanding I/O in that case, since the 10159 * range was locked before any I/O could be submitted to the 10160 * new channel. 10161 */ 10162 spdk_bdev_for_each_channel_continue(i, 0); 10163 return; 10164 } 10165 } 10166 10167 range = calloc(1, sizeof(*range)); 10168 if (range == NULL) { 10169 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 10170 return; 10171 } 10172 10173 range->length = ctx->range.length; 10174 range->offset = ctx->range.offset; 10175 range->locked_ctx = ctx->range.locked_ctx; 10176 range->quiesce = ctx->range.quiesce; 10177 ctx->current_range = range; 10178 if (ctx->range.owner_ch == ch) { 10179 /* This is the range object for the channel that will hold 10180 * the lock. Store it in the ctx object so that we can easily 10181 * set its owner_ch after the lock is finally acquired. 10182 */ 10183 ctx->owner_range = range; 10184 } 10185 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 10186 bdev_lock_lba_range_check_io(i); 10187 } 10188 10189 static void 10190 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 10191 { 10192 assert(spdk_get_thread() == ctx->range.owner_thread); 10193 assert(ctx->range.owner_ch == NULL || 10194 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 10195 10196 /* We will add a copy of this range to each channel now. */ 10197 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 10198 bdev_lock_lba_range_cb); 10199 } 10200 10201 static bool 10202 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 10203 { 10204 struct lba_range *r; 10205 10206 TAILQ_FOREACH(r, tailq, tailq) { 10207 if (bdev_lba_range_overlapped(range, r)) { 10208 return true; 10209 } 10210 } 10211 return false; 10212 } 10213 10214 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status); 10215 10216 static int 10217 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 10218 uint64_t offset, uint64_t length, 10219 lock_range_cb cb_fn, void *cb_arg) 10220 { 10221 struct locked_lba_range_ctx *ctx; 10222 10223 ctx = calloc(1, sizeof(*ctx)); 10224 if (ctx == NULL) { 10225 return -ENOMEM; 10226 } 10227 10228 ctx->range.offset = offset; 10229 ctx->range.length = length; 10230 ctx->range.owner_thread = spdk_get_thread(); 10231 ctx->range.owner_ch = ch; 10232 ctx->range.locked_ctx = cb_arg; 10233 ctx->range.bdev = bdev; 10234 ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked); 10235 ctx->cb_fn = cb_fn; 10236 ctx->cb_arg = cb_arg; 10237 10238 spdk_spin_lock(&bdev->internal.spinlock); 10239 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 10240 /* There is an active lock overlapping with this range. 10241 * Put it on the pending list until this range no 10242 * longer overlaps with another. 
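 * bdev_unlock_lba_range_cb() promotes pending ranges to locked_ranges once
 * the overlapping lock is released.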
10243 */ 10244 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 10245 } else { 10246 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 10247 bdev_lock_lba_range_ctx(bdev, ctx); 10248 } 10249 spdk_spin_unlock(&bdev->internal.spinlock); 10250 return 0; 10251 } 10252 10253 static int 10254 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10255 uint64_t offset, uint64_t length, 10256 lock_range_cb cb_fn, void *cb_arg) 10257 { 10258 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10259 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10260 10261 if (cb_arg == NULL) { 10262 SPDK_ERRLOG("cb_arg must not be NULL\n"); 10263 return -EINVAL; 10264 } 10265 10266 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 10267 } 10268 10269 static void 10270 bdev_lock_lba_range_ctx_msg(void *_ctx) 10271 { 10272 struct locked_lba_range_ctx *ctx = _ctx; 10273 10274 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 10275 } 10276 10277 static void 10278 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10279 { 10280 struct locked_lba_range_ctx *ctx = _ctx; 10281 struct locked_lba_range_ctx *pending_ctx; 10282 struct lba_range *range, *tmp; 10283 10284 spdk_spin_lock(&bdev->internal.spinlock); 10285 /* Check if there are any pending locked ranges that overlap with this range 10286 * that was just unlocked. If there are, check that it doesn't overlap with any 10287 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 10288 * the lock process. 10289 */ 10290 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 10291 if (bdev_lba_range_overlapped(range, &ctx->range) && 10292 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 10293 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 10294 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10295 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 10296 spdk_thread_send_msg(pending_ctx->range.owner_thread, 10297 bdev_lock_lba_range_ctx_msg, pending_ctx); 10298 } 10299 } 10300 spdk_spin_unlock(&bdev->internal.spinlock); 10301 10302 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10303 free(ctx); 10304 } 10305 10306 static void 10307 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10308 struct spdk_io_channel *_ch, void *_ctx) 10309 { 10310 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10311 struct locked_lba_range_ctx *ctx = _ctx; 10312 TAILQ_HEAD(, spdk_bdev_io) io_locked; 10313 struct spdk_bdev_io *bdev_io; 10314 struct lba_range *range; 10315 10316 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10317 if (ctx->range.offset == range->offset && 10318 ctx->range.length == range->length && 10319 ctx->range.locked_ctx == range->locked_ctx) { 10320 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 10321 free(range); 10322 break; 10323 } 10324 } 10325 10326 /* Note: we should almost always be able to assert that the range specified 10327 * was found. But there are some very rare corner cases where a new channel 10328 * gets created simultaneously with a range unlock, where this function 10329 * would execute on that new channel and wouldn't have the range. 10330 * We also use this to clean up range allocations when a later allocation 10331 * fails in the locking path. 10332 * So we can't actually assert() here. 
*/ 10334 10335 /* Swap the locked I/O into a temporary list, and then try to submit them again. 10336 * We could hyper-optimize this to only resubmit locked I/O that overlap 10337 * with the range that was just unlocked, but this isn't a performance path so 10338 * we go for simplicity here. 10339 */ 10340 TAILQ_INIT(&io_locked); 10341 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 10342 while (!TAILQ_EMPTY(&io_locked)) { 10343 bdev_io = TAILQ_FIRST(&io_locked); 10344 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 10345 bdev_io_submit(bdev_io); 10346 } 10347 10348 spdk_bdev_for_each_channel_continue(i, 0); 10349 } 10350 10351 static int 10352 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 10353 lock_range_cb cb_fn, void *cb_arg) 10354 { 10355 struct locked_lba_range_ctx *ctx; 10356 struct lba_range *range; 10357 10358 spdk_spin_lock(&bdev->internal.spinlock); 10359 /* To start the unlock process, we find the range in the bdev's locked_ranges 10360 * and remove it. This ensures new channels don't inherit the locked range. 10361 * Then we will send a message to each channel to remove the range from its 10362 * per-channel list. 10363 */ 10364 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 10365 if (range->offset == offset && range->length == length && 10366 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 10367 break; 10368 } 10369 } 10370 if (range == NULL) { 10371 assert(false); 10372 spdk_spin_unlock(&bdev->internal.spinlock); 10373 return -EINVAL; 10374 } 10375 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 10376 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10377 spdk_spin_unlock(&bdev->internal.spinlock); 10378 10379 ctx->cb_fn = cb_fn; 10380 ctx->cb_arg = cb_arg; 10381 10382 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10383 bdev_unlock_lba_range_cb); 10384 return 0; 10385 } 10386 10387 static int 10388 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10389 uint64_t offset, uint64_t length, 10390 lock_range_cb cb_fn, void *cb_arg) 10391 { 10392 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10393 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10394 struct lba_range *range; 10395 bool range_found = false; 10396 10397 /* Let's make sure the specified channel actually has a lock on 10398 * the specified range. Note that the range must match exactly.
10399 */ 10400 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10401 if (range->offset == offset && range->length == length && 10402 range->owner_ch == ch && range->locked_ctx == cb_arg) { 10403 range_found = true; 10404 break; 10405 } 10406 } 10407 10408 if (!range_found) { 10409 return -EINVAL; 10410 } 10411 10412 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 10413 } 10414 10415 struct bdev_quiesce_ctx { 10416 spdk_bdev_quiesce_cb cb_fn; 10417 void *cb_arg; 10418 }; 10419 10420 static void 10421 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 10422 { 10423 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10424 10425 if (quiesce_ctx->cb_fn != NULL) { 10426 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10427 } 10428 10429 free(quiesce_ctx); 10430 } 10431 10432 static void 10433 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 10434 { 10435 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10436 struct spdk_bdev_module *module = range->bdev->module; 10437 10438 if (status != 0) { 10439 if (quiesce_ctx->cb_fn != NULL) { 10440 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10441 } 10442 free(quiesce_ctx); 10443 return; 10444 } 10445 10446 spdk_spin_lock(&module->internal.spinlock); 10447 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 10448 spdk_spin_unlock(&module->internal.spinlock); 10449 10450 if (quiesce_ctx->cb_fn != NULL) { 10451 /* copy the context in case the range is unlocked by the callback */ 10452 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 10453 10454 quiesce_ctx->cb_fn = NULL; 10455 quiesce_ctx->cb_arg = NULL; 10456 10457 tmp.cb_fn(tmp.cb_arg, status); 10458 } 10459 /* quiesce_ctx will be freed on unquiesce */ 10460 } 10461 10462 static int 10463 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10464 uint64_t offset, uint64_t length, 10465 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 10466 bool unquiesce) 10467 { 10468 struct bdev_quiesce_ctx *quiesce_ctx; 10469 int rc; 10470 10471 if (module != bdev->module) { 10472 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 10473 return -EINVAL; 10474 } 10475 10476 if (!bdev_io_valid_blocks(bdev, offset, length)) { 10477 return -EINVAL; 10478 } 10479 10480 if (unquiesce) { 10481 struct lba_range *range; 10482 10483 /* Make sure the specified range is actually quiesced in the specified module and 10484 * then remove it from the list. Note that the range must match exactly. 
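 * Callers are expected to pair each spdk_bdev_quiesce()/spdk_bdev_quiesce_range()
 * with an unquiesce call that passes the same module, offset and length, e.g.
 * spdk_bdev_quiesce_range(bdev, &my_if, off, len, cb, ctx) followed later by
 * spdk_bdev_unquiesce_range(bdev, &my_if, off, len, cb, ctx), where my_if is
 * the module that registered the bdev.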
10485 */ 10486 spdk_spin_lock(&module->internal.spinlock); 10487 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10488 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10489 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10490 break; 10491 } 10492 } 10493 spdk_spin_unlock(&module->internal.spinlock); 10494 10495 if (range == NULL) { 10496 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10497 return -EINVAL; 10498 } 10499 10500 quiesce_ctx = range->locked_ctx; 10501 quiesce_ctx->cb_fn = cb_fn; 10502 quiesce_ctx->cb_arg = cb_arg; 10503 10504 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10505 } else { 10506 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10507 if (quiesce_ctx == NULL) { 10508 return -ENOMEM; 10509 } 10510 10511 quiesce_ctx->cb_fn = cb_fn; 10512 quiesce_ctx->cb_arg = cb_arg; 10513 10514 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10515 if (rc != 0) { 10516 free(quiesce_ctx); 10517 } 10518 } 10519 10520 return rc; 10521 } 10522 10523 int 10524 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10525 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10526 { 10527 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10528 } 10529 10530 int 10531 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10532 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10533 { 10534 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10535 } 10536 10537 int 10538 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10539 uint64_t offset, uint64_t length, 10540 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10541 { 10542 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10543 } 10544 10545 int 10546 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10547 uint64_t offset, uint64_t length, 10548 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10549 { 10550 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10551 } 10552 10553 int 10554 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10555 int array_size) 10556 { 10557 if (!bdev) { 10558 return -EINVAL; 10559 } 10560 10561 if (bdev->fn_table->get_memory_domains) { 10562 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10563 } 10564 10565 return 0; 10566 } 10567 10568 struct spdk_bdev_for_each_io_ctx { 10569 void *ctx; 10570 spdk_bdev_io_fn fn; 10571 spdk_bdev_for_each_io_cb cb; 10572 }; 10573 10574 static void 10575 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10576 struct spdk_io_channel *io_ch, void *_ctx) 10577 { 10578 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10579 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10580 struct spdk_bdev_io *bdev_io; 10581 int rc = 0; 10582 10583 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10584 rc = ctx->fn(ctx->ctx, bdev_io); 10585 if (rc != 0) { 10586 break; 10587 } 10588 } 10589 10590 spdk_bdev_for_each_channel_continue(i, rc); 10591 } 10592 10593 static void 10594 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10595 { 10596 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10597 10598 ctx->cb(ctx->ctx, status); 10599 10600 free(ctx); 10601 } 10602 10603 void 10604 
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10605 spdk_bdev_for_each_io_cb cb) 10606 { 10607 struct spdk_bdev_for_each_io_ctx *ctx; 10608 10609 assert(fn != NULL && cb != NULL); 10610 10611 ctx = calloc(1, sizeof(*ctx)); 10612 if (ctx == NULL) { 10613 SPDK_ERRLOG("Failed to allocate context.\n"); 10614 cb(_ctx, -ENOMEM); 10615 return; 10616 } 10617 10618 ctx->ctx = _ctx; 10619 ctx->fn = fn; 10620 ctx->cb = cb; 10621 10622 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10623 bdev_for_each_io_done); 10624 } 10625 10626 void 10627 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10628 { 10629 spdk_for_each_channel_continue(iter->i, status); 10630 } 10631 10632 static struct spdk_bdev * 10633 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10634 { 10635 void *io_device = spdk_io_channel_iter_get_io_device(i); 10636 10637 return __bdev_from_io_dev(io_device); 10638 } 10639 10640 static void 10641 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10642 { 10643 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10644 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10645 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10646 10647 iter->i = i; 10648 iter->fn(iter, bdev, ch, iter->ctx); 10649 } 10650 10651 static void 10652 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10653 { 10654 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10655 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10656 10657 iter->i = i; 10658 iter->cpl(bdev, iter->ctx, status); 10659 10660 free(iter); 10661 } 10662 10663 void 10664 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10665 void *ctx, spdk_bdev_for_each_channel_done cpl) 10666 { 10667 struct spdk_bdev_channel_iter *iter; 10668 10669 assert(bdev != NULL && fn != NULL && ctx != NULL); 10670 10671 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10672 if (iter == NULL) { 10673 SPDK_ERRLOG("Unable to allocate iterator\n"); 10674 assert(false); 10675 return; 10676 } 10677 10678 iter->fn = fn; 10679 iter->cpl = cpl; 10680 iter->ctx = ctx; 10681 10682 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10683 iter, bdev_each_channel_cpl); 10684 } 10685 10686 static void 10687 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10688 { 10689 struct spdk_bdev_io *parent_io = cb_arg; 10690 10691 spdk_bdev_free_io(bdev_io); 10692 10693 /* Check return status of write */ 10694 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10695 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10696 } 10697 10698 static void 10699 bdev_copy_do_write(void *_bdev_io) 10700 { 10701 struct spdk_bdev_io *bdev_io = _bdev_io; 10702 int rc; 10703 10704 /* Write blocks */ 10705 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10706 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10707 bdev_io->u.bdev.iovs[0].iov_base, 10708 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10709 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10710 10711 if (rc == -ENOMEM) { 10712 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10713 } else if (rc != 0) { 10714 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10715 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10716 } 10717 } 10718 10719 static void 10720 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10721 { 10722 struct spdk_bdev_io *parent_io = cb_arg; 10723 10724 spdk_bdev_free_io(bdev_io); 10725 10726 /* Check return status of read */ 10727 if (!success) { 10728 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10729 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10730 return; 10731 } 10732 10733 /* Do write */ 10734 bdev_copy_do_write(parent_io); 10735 } 10736 10737 static void 10738 bdev_copy_do_read(void *_bdev_io) 10739 { 10740 struct spdk_bdev_io *bdev_io = _bdev_io; 10741 int rc; 10742 10743 /* Read blocks */ 10744 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10745 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10746 bdev_io->u.bdev.iovs[0].iov_base, 10747 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10748 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10749 10750 if (rc == -ENOMEM) { 10751 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10752 } else if (rc != 0) { 10753 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10754 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10755 } 10756 } 10757 10758 static void 10759 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10760 { 10761 if (!success) { 10762 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10763 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10764 return; 10765 } 10766 10767 bdev_copy_do_read(bdev_io); 10768 } 10769 10770 int 10771 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10772 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10773 spdk_bdev_io_completion_cb cb, void *cb_arg) 10774 { 10775 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10776 struct spdk_bdev_io *bdev_io; 10777 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10778 10779 if (!desc->write) { 10780 return -EBADF; 10781 } 10782 10783 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10784 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10785 SPDK_DEBUGLOG(bdev, 10786 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10787 dst_offset_blocks, src_offset_blocks, num_blocks); 10788 return -EINVAL; 10789 } 10790 10791 bdev_io = bdev_channel_get_io(channel); 10792 if (!bdev_io) { 10793 return -ENOMEM; 10794 } 10795 10796 bdev_io->internal.ch = channel; 10797 bdev_io->internal.desc = desc; 10798 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10799 10800 
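/* For a copy, the destination LBA travels in the generic offset_blocks field
 * while the source LBA is carried separately in u.bdev.copy.src_offset_blocks,
 * set just below.
 */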
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10801 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10802 bdev_io->u.bdev.num_blocks = num_blocks; 10803 bdev_io->u.bdev.memory_domain = NULL; 10804 bdev_io->u.bdev.memory_domain_ctx = NULL; 10805 bdev_io->u.bdev.iovs = NULL; 10806 bdev_io->u.bdev.iovcnt = 0; 10807 bdev_io->u.bdev.md_buf = NULL; 10808 bdev_io->u.bdev.accel_sequence = NULL; 10809 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10810 10811 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 10812 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 10813 return 0; 10814 } 10815 10816 10817 /* If the copy size is large and should be split, use the generic split logic 10818 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10819 * 10820 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10821 * emulate it using regular read and write requests otherwise. 10822 */ 10823 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10824 bdev_io->internal.f.split) { 10825 bdev_io_submit(bdev_io); 10826 return 0; 10827 } 10828 10829 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10830 10831 return 0; 10832 } 10833 10834 SPDK_LOG_REGISTER_COMPONENT(bdev) 10835 10836 static void 10837 bdev_trace(void) 10838 { 10839 struct spdk_trace_tpoint_opts opts[] = { 10840 { 10841 "BDEV_IO_START", TRACE_BDEV_IO_START, 10842 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1, 10843 { 10844 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10845 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10846 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10847 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10848 } 10849 }, 10850 { 10851 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10852 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0, 10853 { 10854 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10855 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10856 } 10857 }, 10858 { 10859 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10860 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10861 { 10862 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10863 } 10864 }, 10865 { 10866 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10867 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10868 { 10869 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10870 } 10871 }, 10872 }; 10873 10874 10875 spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b'); 10876 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10877 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10878 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10879 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10880 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0); 10881 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0); 10882 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_START, OBJECT_BDEV_IO, 0); 10883 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_DONE, OBJECT_BDEV_IO, 0); 10884 } 10885 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10886
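/*
 * Illustrative, non-normative example of driving the copy path defined above.
 * It assumes `desc` and `ch` were obtained earlier via spdk_bdev_open_ext() and
 * spdk_bdev_get_io_channel(); `copy_done` is a caller-supplied completion.
 *
 *     static void
 *     copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *     {
 *             spdk_bdev_free_io(bdev_io);
 *             ...handle success or failure...
 *     }
 *
 *     rc = spdk_bdev_copy_blocks(desc, ch, dst_lba, src_lba, num_blocks,
 *                                copy_done, NULL);
 *     if (rc == -ENOMEM) {
 *             ...retry later, e.g. via spdk_bdev_queue_io_wait()...
 *     }
 */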