1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/accel.h" 12 #include "spdk/config.h" 13 #include "spdk/env.h" 14 #include "spdk/thread.h" 15 #include "spdk/likely.h" 16 #include "spdk/queue.h" 17 #include "spdk/nvme_spec.h" 18 #include "spdk/scsi_spec.h" 19 #include "spdk/notify.h" 20 #include "spdk/util.h" 21 #include "spdk/trace.h" 22 #include "spdk/dma.h" 23 24 #include "spdk/bdev_module.h" 25 #include "spdk/log.h" 26 #include "spdk/string.h" 27 28 #include "bdev_internal.h" 29 #include "spdk_internal/trace_defs.h" 30 #include "spdk_internal/assert.h" 31 32 #ifdef SPDK_CONFIG_VTUNE 33 #include "ittnotify.h" 34 #include "ittnotify_types.h" 35 int __itt_init_ittlib(const char *, __itt_group_id); 36 #endif 37 38 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 39 #define SPDK_BDEV_IO_CACHE_SIZE 256 40 #define SPDK_BDEV_AUTO_EXAMINE true 41 #define BUF_SMALL_CACHE_SIZE 128 42 #define BUF_LARGE_CACHE_SIZE 16 43 #define NOMEM_THRESHOLD_COUNT 8 44 45 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 46 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 47 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 48 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 49 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 50 #define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC (UINT64_MAX / (1024 * 1024)) 51 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 52 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 53 54 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 55 * when splitting into children requests at a time. 56 */ 57 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 58 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 59 60 /* The maximum number of children requests for a COPY command 61 * when splitting into children requests at a time. 
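 *
 * Rough illustration of what this cap means (the splitting logic itself lives
 * outside this excerpt): a large COPY that has to be split into, say, 32
 * child requests is only ever driven with at most
 * SPDK_BDEV_MAX_CHILDREN_COPY_REQS of them outstanding at once, the remaining
 * children being issued as earlier ones complete, conceptually:
 *
 *   in_flight = spdk_min(children_remaining, SPDK_BDEV_MAX_CHILDREN_COPY_REQS);
 *
 * This is an editorial sketch, not code taken from this file.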
62 */ 63 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 64 65 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 66 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 67 #ifdef DEBUG 68 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 69 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 70 #else 71 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 72 #endif 73 74 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 75 const char *detail, struct spdk_bdev *bdev); 76 77 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 78 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 79 }; 80 81 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 82 83 RB_HEAD(bdev_name_tree, spdk_bdev_name); 84 85 static int 86 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 87 { 88 return strcmp(name1->name, name2->name); 89 } 90 91 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 92 93 struct spdk_bdev_mgr { 94 struct spdk_mempool *bdev_io_pool; 95 96 void *zero_buffer; 97 98 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 99 100 struct spdk_bdev_list bdevs; 101 struct bdev_name_tree bdev_names; 102 103 bool init_complete; 104 bool module_init_complete; 105 106 struct spdk_spinlock spinlock; 107 108 TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens; 109 110 #ifdef SPDK_CONFIG_VTUNE 111 __itt_domain *domain; 112 #endif 113 }; 114 115 static struct spdk_bdev_mgr g_bdev_mgr = { 116 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 117 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 118 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 119 .init_complete = false, 120 .module_init_complete = false, 121 .async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens), 122 }; 123 124 static void 125 __attribute__((constructor)) 126 _bdev_init(void) 127 { 128 spdk_spin_init(&g_bdev_mgr.spinlock); 129 } 130 131 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status); 132 133 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 134 135 struct lba_range { 136 struct spdk_bdev *bdev; 137 uint64_t offset; 138 uint64_t length; 139 bool quiesce; 140 void *locked_ctx; 141 struct spdk_thread *owner_thread; 142 struct spdk_bdev_channel *owner_ch; 143 TAILQ_ENTRY(lba_range) tailq; 144 TAILQ_ENTRY(lba_range) tailq_module; 145 }; 146 147 static struct spdk_bdev_opts g_bdev_opts = { 148 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 149 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 150 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 151 .iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE, 152 .iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE, 153 }; 154 155 static spdk_bdev_init_cb g_init_cb_fn = NULL; 156 static void *g_init_cb_arg = NULL; 157 158 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 159 static void *g_fini_cb_arg = NULL; 160 static struct spdk_thread *g_fini_thread = NULL; 161 162 struct spdk_bdev_qos_limit { 163 /** IOs or bytes allowed per second (i.e., 1s). */ 164 uint64_t limit; 165 166 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 167 * For remaining bytes, allowed to run negative if an I/O is submitted when 168 * some bytes are remaining, but the I/O is bigger than that amount. The 169 * excess will be deducted from the next timeslice. 170 */ 171 int64_t remaining_this_timeslice; 172 173 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). 
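	 *
	 * Rough worked example (the exact computation is done elsewhere in this
	 * file): the per-timeslice quota is derived from the per-second limit, so
	 * with the 1 ms timeslice implied by SPDK_BDEV_QOS_TIMESLICE_IN_USEC an
	 * IOPS limit of 10000 allows about
	 *
	 *   10000 IO/s * 1000 us / 1000000 us = 10 IOs per timeslice.
	 *
	 * When the configured limit is so small that the derived quota would
	 * round down to zero, it is clamped up to this minimum so that some I/O
	 * still makes progress in every timeslice.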
*/ 174 uint32_t min_per_timeslice; 175 176 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 177 uint32_t max_per_timeslice; 178 179 /** Function to check whether to queue the IO. 180 * If The IO is allowed to pass, the quota will be reduced correspondingly. 181 */ 182 bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 183 184 /** Function to rewind the quota once the IO was allowed to be sent by this 185 * limit but queued due to one of the further limits. 186 */ 187 void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 188 }; 189 190 struct spdk_bdev_qos { 191 /** Types of structure of rate limits. */ 192 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 193 194 /** The channel that all I/O are funneled through. */ 195 struct spdk_bdev_channel *ch; 196 197 /** The thread on which the poller is running. */ 198 struct spdk_thread *thread; 199 200 /** Size of a timeslice in tsc ticks. */ 201 uint64_t timeslice_size; 202 203 /** Timestamp of start of last timeslice. */ 204 uint64_t last_timeslice; 205 206 /** Poller that processes queued I/O commands each time slice. */ 207 struct spdk_poller *poller; 208 }; 209 210 struct spdk_bdev_mgmt_channel { 211 /* 212 * Each thread keeps a cache of bdev_io - this allows 213 * bdev threads which are *not* DPDK threads to still 214 * benefit from a per-thread bdev_io cache. Without 215 * this, non-DPDK threads fetching from the mempool 216 * incur a cmpxchg on get and put. 217 */ 218 bdev_io_stailq_t per_thread_cache; 219 uint32_t per_thread_cache_count; 220 uint32_t bdev_io_cache_size; 221 222 struct spdk_iobuf_channel iobuf; 223 224 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 225 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 226 }; 227 228 /* 229 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 230 * will queue here their IO that awaits retry. It makes it possible to retry sending 231 * IO to one bdev after IO from other bdev completes. 232 */ 233 struct spdk_bdev_shared_resource { 234 /* The bdev management channel */ 235 struct spdk_bdev_mgmt_channel *mgmt_ch; 236 237 /* 238 * Count of I/O submitted to bdev module and waiting for completion. 239 * Incremented before submit_request() is called on an spdk_bdev_io. 240 */ 241 uint64_t io_outstanding; 242 243 /* 244 * Queue of IO awaiting retry because of a previous NOMEM status returned 245 * on this channel. 246 */ 247 bdev_io_tailq_t nomem_io; 248 249 /* 250 * Threshold which io_outstanding must drop to before retrying nomem_io. 251 */ 252 uint64_t nomem_threshold; 253 254 /* I/O channel allocated by a bdev module */ 255 struct spdk_io_channel *shared_ch; 256 257 struct spdk_poller *nomem_poller; 258 259 /* Refcount of bdev channels using this resource */ 260 uint32_t ref; 261 262 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 263 }; 264 265 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 266 #define BDEV_CH_QOS_ENABLED (1 << 1) 267 268 struct spdk_bdev_channel { 269 struct spdk_bdev *bdev; 270 271 /* The channel for the underlying device */ 272 struct spdk_io_channel *channel; 273 274 /* Accel channel */ 275 struct spdk_io_channel *accel_channel; 276 277 /* Per io_device per thread data */ 278 struct spdk_bdev_shared_resource *shared_resource; 279 280 struct spdk_bdev_io_stat *stat; 281 282 /* 283 * Count of I/O submitted to the underlying dev module through this channel 284 * and waiting for completion. 
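	 *
	 * Note: this per-channel counter and shared_resource->io_outstanding are
	 * bumped together by bdev_io_increment_outstanding() and
	 * bdev_io_decrement_outstanding() further down, both around real
	 * submissions to the module and around the memory-domain pull/push and
	 * accel-sequence steps tracked on io_memory_domain / io_accel_exec.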
285 */ 286 uint64_t io_outstanding; 287 288 /* 289 * List of all submitted I/Os including I/O that are generated via splitting. 290 */ 291 bdev_io_tailq_t io_submitted; 292 293 /* 294 * List of spdk_bdev_io that are currently queued because they write to a locked 295 * LBA range. 296 */ 297 bdev_io_tailq_t io_locked; 298 299 /* List of I/Os with accel sequence being currently executed */ 300 bdev_io_tailq_t io_accel_exec; 301 302 /* List of I/Os doing memory domain pull/push */ 303 bdev_io_tailq_t io_memory_domain; 304 305 uint32_t flags; 306 307 /* Counts number of bdev_io in the io_submitted TAILQ */ 308 uint16_t queue_depth; 309 310 uint16_t trace_id; 311 312 struct spdk_histogram_data *histogram; 313 314 #ifdef SPDK_CONFIG_VTUNE 315 uint64_t start_tsc; 316 uint64_t interval_tsc; 317 __itt_string_handle *handle; 318 struct spdk_bdev_io_stat *prev_stat; 319 #endif 320 321 lba_range_tailq_t locked_ranges; 322 323 /** List of I/Os queued by QoS. */ 324 bdev_io_tailq_t qos_queued_io; 325 }; 326 327 struct media_event_entry { 328 struct spdk_bdev_media_event event; 329 TAILQ_ENTRY(media_event_entry) tailq; 330 }; 331 332 #define MEDIA_EVENT_POOL_SIZE 64 333 334 struct spdk_bdev_desc { 335 struct spdk_bdev *bdev; 336 bool write; 337 bool memory_domains_supported; 338 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 339 struct spdk_bdev_open_opts opts; 340 struct spdk_thread *thread; 341 struct { 342 spdk_bdev_event_cb_t event_fn; 343 void *ctx; 344 } callback; 345 bool closed; 346 struct spdk_spinlock spinlock; 347 uint32_t refs; 348 TAILQ_HEAD(, media_event_entry) pending_media_events; 349 TAILQ_HEAD(, media_event_entry) free_media_events; 350 struct media_event_entry *media_events_buffer; 351 TAILQ_ENTRY(spdk_bdev_desc) link; 352 353 uint64_t timeout_in_sec; 354 spdk_bdev_io_timeout_cb cb_fn; 355 void *cb_arg; 356 struct spdk_poller *io_timeout_poller; 357 struct spdk_bdev_module_claim *claim; 358 }; 359 360 struct spdk_bdev_iostat_ctx { 361 struct spdk_bdev_io_stat *stat; 362 enum spdk_bdev_reset_stat_mode reset_mode; 363 spdk_bdev_get_device_stat_cb cb; 364 void *cb_arg; 365 }; 366 367 struct set_qos_limit_ctx { 368 void (*cb_fn)(void *cb_arg, int status); 369 void *cb_arg; 370 struct spdk_bdev *bdev; 371 }; 372 373 struct spdk_bdev_channel_iter { 374 spdk_bdev_for_each_channel_msg fn; 375 spdk_bdev_for_each_channel_done cpl; 376 struct spdk_io_channel_iter *i; 377 void *ctx; 378 }; 379 380 struct spdk_bdev_io_error_stat { 381 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 382 }; 383 384 enum bdev_io_retry_state { 385 BDEV_IO_RETRY_STATE_INVALID, 386 BDEV_IO_RETRY_STATE_PULL, 387 BDEV_IO_RETRY_STATE_PULL_MD, 388 BDEV_IO_RETRY_STATE_SUBMIT, 389 BDEV_IO_RETRY_STATE_PUSH, 390 BDEV_IO_RETRY_STATE_PUSH_MD, 391 }; 392 393 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 394 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 395 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 396 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 397 398 static inline void bdev_io_complete(void *ctx); 399 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 400 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io); 401 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io); 402 403 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 404 static int 
bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io); 405 406 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 407 struct spdk_io_channel *ch, void *_ctx); 408 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 409 410 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 411 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 412 uint64_t num_blocks, 413 struct spdk_memory_domain *domain, void *domain_ctx, 414 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 415 spdk_bdev_io_completion_cb cb, void *cb_arg); 416 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 417 struct iovec *iov, int iovcnt, void *md_buf, 418 uint64_t offset_blocks, uint64_t num_blocks, 419 struct spdk_memory_domain *domain, void *domain_ctx, 420 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 421 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 422 spdk_bdev_io_completion_cb cb, void *cb_arg); 423 424 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 425 uint64_t offset, uint64_t length, 426 lock_range_cb cb_fn, void *cb_arg); 427 428 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 429 uint64_t offset, uint64_t length, 430 lock_range_cb cb_fn, void *cb_arg); 431 432 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 433 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort); 434 435 static bool claim_type_is_v2(enum spdk_bdev_claim_type type); 436 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc); 437 static void claim_reset(struct spdk_bdev *bdev); 438 439 static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch); 440 441 static bool bdev_io_should_split(struct spdk_bdev_io *bdev_io); 442 443 #define bdev_get_ext_io_opt(opts, field, defval) \ 444 ((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval)) 445 446 static inline void 447 bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io) 448 { 449 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 450 bdev_io->internal.ch->queue_depth++; 451 } 452 453 static inline void 454 bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io) 455 { 456 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 457 bdev_io->internal.ch->queue_depth--; 458 } 459 460 void 461 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 462 { 463 if (!opts) { 464 SPDK_ERRLOG("opts should not be NULL\n"); 465 return; 466 } 467 468 if (!opts_size) { 469 SPDK_ERRLOG("opts_size should not be zero value\n"); 470 return; 471 } 472 473 opts->opts_size = opts_size; 474 475 #define SET_FIELD(field) \ 476 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 477 opts->field = g_bdev_opts.field; \ 478 } \ 479 480 SET_FIELD(bdev_io_pool_size); 481 SET_FIELD(bdev_io_cache_size); 482 SET_FIELD(bdev_auto_examine); 483 SET_FIELD(iobuf_small_cache_size); 484 SET_FIELD(iobuf_large_cache_size); 485 486 /* Do not remove this statement, you should always update this statement when you adding a new field, 487 * and do not forget to add the SET_FIELD statement for your added field. 
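	 *
	 * The SET_FIELD() pattern above is what keeps this ABI-tolerant: a field
	 * is copied only when it fits inside the opts_size the caller passed in,
	 * so a caller built against an older, smaller struct spdk_bdev_opts never
	 * has memory past its buffer written. A typical caller looks roughly like
	 * this (illustrative sketch, not code from this file):
	 *
	 *   struct spdk_bdev_opts opts = {};
	 *
	 *   spdk_bdev_get_opts(&opts, sizeof(opts));
	 *   opts.bdev_io_pool_size = 2 * opts.bdev_io_pool_size;
	 *   spdk_bdev_set_opts(&opts);
	 *
	 * Callers normally do this before the bdev layer is initialized, since
	 * that is when g_bdev_opts is consumed (an assumption stated here, not
	 * something these two functions enforce).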
*/ 488 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 489 490 #undef SET_FIELD 491 } 492 493 int 494 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 495 { 496 uint32_t min_pool_size; 497 498 if (!opts) { 499 SPDK_ERRLOG("opts cannot be NULL\n"); 500 return -1; 501 } 502 503 if (!opts->opts_size) { 504 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 505 return -1; 506 } 507 508 /* 509 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 510 * initialization. A second mgmt_ch will be created on the same thread when the application starts 511 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 512 */ 513 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 514 if (opts->bdev_io_pool_size < min_pool_size) { 515 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 516 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 517 spdk_thread_get_count()); 518 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 519 return -1; 520 } 521 522 #define SET_FIELD(field) \ 523 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 524 g_bdev_opts.field = opts->field; \ 525 } \ 526 527 SET_FIELD(bdev_io_pool_size); 528 SET_FIELD(bdev_io_cache_size); 529 SET_FIELD(bdev_auto_examine); 530 SET_FIELD(iobuf_small_cache_size); 531 SET_FIELD(iobuf_large_cache_size); 532 533 g_bdev_opts.opts_size = opts->opts_size; 534 535 #undef SET_FIELD 536 537 return 0; 538 } 539 540 static struct spdk_bdev * 541 bdev_get_by_name(const char *bdev_name) 542 { 543 struct spdk_bdev_name find; 544 struct spdk_bdev_name *res; 545 546 find.name = (char *)bdev_name; 547 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 548 if (res != NULL) { 549 return res->bdev; 550 } 551 552 return NULL; 553 } 554 555 struct spdk_bdev * 556 spdk_bdev_get_by_name(const char *bdev_name) 557 { 558 struct spdk_bdev *bdev; 559 560 spdk_spin_lock(&g_bdev_mgr.spinlock); 561 bdev = bdev_get_by_name(bdev_name); 562 spdk_spin_unlock(&g_bdev_mgr.spinlock); 563 564 return bdev; 565 } 566 567 struct bdev_io_status_string { 568 enum spdk_bdev_io_status status; 569 const char *str; 570 }; 571 572 static const struct bdev_io_status_string bdev_io_status_strings[] = { 573 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 574 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 575 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 576 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 577 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 578 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 579 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 580 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 581 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 582 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 583 }; 584 585 static const char * 586 bdev_io_status_get_string(enum spdk_bdev_io_status status) 587 { 588 uint32_t i; 589 590 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 591 if (bdev_io_status_strings[i].status == status) { 592 return bdev_io_status_strings[i].str; 593 } 594 } 595 596 return "reserved"; 597 } 598 599 struct spdk_bdev_wait_for_examine_ctx { 600 struct spdk_poller *poller; 601 spdk_bdev_wait_for_examine_cb cb_fn; 602 void *cb_arg; 603 }; 604 605 static bool bdev_module_all_actions_completed(void); 606 607 static int 608 bdev_wait_for_examine_cb(void *arg) 609 { 610 struct 
spdk_bdev_wait_for_examine_ctx *ctx = arg; 611 612 if (!bdev_module_all_actions_completed()) { 613 return SPDK_POLLER_IDLE; 614 } 615 616 spdk_poller_unregister(&ctx->poller); 617 ctx->cb_fn(ctx->cb_arg); 618 free(ctx); 619 620 return SPDK_POLLER_BUSY; 621 } 622 623 int 624 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 625 { 626 struct spdk_bdev_wait_for_examine_ctx *ctx; 627 628 ctx = calloc(1, sizeof(*ctx)); 629 if (ctx == NULL) { 630 return -ENOMEM; 631 } 632 ctx->cb_fn = cb_fn; 633 ctx->cb_arg = cb_arg; 634 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 635 636 return 0; 637 } 638 639 struct spdk_bdev_examine_item { 640 char *name; 641 TAILQ_ENTRY(spdk_bdev_examine_item) link; 642 }; 643 644 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 645 646 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 647 g_bdev_examine_allowlist); 648 649 static inline bool 650 bdev_examine_allowlist_check(const char *name) 651 { 652 struct spdk_bdev_examine_item *item; 653 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 654 if (strcmp(name, item->name) == 0) { 655 return true; 656 } 657 } 658 return false; 659 } 660 661 static inline void 662 bdev_examine_allowlist_remove(const char *name) 663 { 664 struct spdk_bdev_examine_item *item; 665 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 666 if (strcmp(name, item->name) == 0) { 667 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 668 free(item->name); 669 free(item); 670 break; 671 } 672 } 673 } 674 675 static inline void 676 bdev_examine_allowlist_free(void) 677 { 678 struct spdk_bdev_examine_item *item; 679 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 680 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 681 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 682 free(item->name); 683 free(item); 684 } 685 } 686 687 static inline bool 688 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 689 { 690 struct spdk_bdev_alias *tmp; 691 if (bdev_examine_allowlist_check(bdev->name)) { 692 return true; 693 } 694 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 695 if (bdev_examine_allowlist_check(tmp->alias.name)) { 696 return true; 697 } 698 } 699 return false; 700 } 701 702 static inline bool 703 bdev_ok_to_examine(struct spdk_bdev *bdev) 704 { 705 /* Some bdevs may not support the READ command. 706 * Do not try to examine them. 
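	 *
	 * Beyond that, examine is gated by the auto-examine setting: with
	 * bdev_auto_examine enabled every readable bdev is examined, otherwise
	 * only bdevs (or their aliases) that were put on the allowlist via
	 * spdk_bdev_examine() are. A minimal manual-examine flow might look like
	 * this sketch ("malloc0" is a hypothetical bdev name):
	 *
	 *   struct spdk_bdev_opts opts = {};
	 *
	 *   spdk_bdev_get_opts(&opts, sizeof(opts));
	 *   opts.bdev_auto_examine = false;
	 *   spdk_bdev_set_opts(&opts);
	 *   ...
	 *   spdk_bdev_examine("malloc0");
	 *
	 * spdk_bdev_examine() must be called on the app thread and only while
	 * auto examine is disabled, as enforced further down in this file.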
707 */ 708 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) { 709 return false; 710 } 711 712 if (g_bdev_opts.bdev_auto_examine) { 713 return true; 714 } else { 715 return bdev_in_examine_allowlist(bdev); 716 } 717 } 718 719 static void 720 bdev_examine(struct spdk_bdev *bdev) 721 { 722 struct spdk_bdev_module *module; 723 struct spdk_bdev_module_claim *claim, *tmpclaim; 724 uint32_t action; 725 726 if (!bdev_ok_to_examine(bdev)) { 727 return; 728 } 729 730 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 731 if (module->examine_config) { 732 spdk_spin_lock(&module->internal.spinlock); 733 action = module->internal.action_in_progress; 734 module->internal.action_in_progress++; 735 spdk_spin_unlock(&module->internal.spinlock); 736 module->examine_config(bdev); 737 if (action != module->internal.action_in_progress) { 738 SPDK_ERRLOG("examine_config for module %s did not call " 739 "spdk_bdev_module_examine_done()\n", module->name); 740 } 741 } 742 } 743 744 spdk_spin_lock(&bdev->internal.spinlock); 745 746 switch (bdev->internal.claim_type) { 747 case SPDK_BDEV_CLAIM_NONE: 748 /* Examine by all bdev modules */ 749 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 750 if (module->examine_disk) { 751 spdk_spin_lock(&module->internal.spinlock); 752 module->internal.action_in_progress++; 753 spdk_spin_unlock(&module->internal.spinlock); 754 spdk_spin_unlock(&bdev->internal.spinlock); 755 module->examine_disk(bdev); 756 spdk_spin_lock(&bdev->internal.spinlock); 757 } 758 } 759 break; 760 case SPDK_BDEV_CLAIM_EXCL_WRITE: 761 /* Examine by the one bdev module with a v1 claim */ 762 module = bdev->internal.claim.v1.module; 763 if (module->examine_disk) { 764 spdk_spin_lock(&module->internal.spinlock); 765 module->internal.action_in_progress++; 766 spdk_spin_unlock(&module->internal.spinlock); 767 spdk_spin_unlock(&bdev->internal.spinlock); 768 module->examine_disk(bdev); 769 return; 770 } 771 break; 772 default: 773 /* Examine by all bdev modules with a v2 claim */ 774 assert(claim_type_is_v2(bdev->internal.claim_type)); 775 /* 776 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 777 * list, perhaps accessing freed memory. Without protection, this could happen 778 * while the lock is dropped during the examine callback. 779 */ 780 bdev->internal.examine_in_progress++; 781 782 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 783 module = claim->module; 784 785 if (module == NULL) { 786 /* This is a vestigial claim, held by examine_count */ 787 continue; 788 } 789 790 if (module->examine_disk == NULL) { 791 continue; 792 } 793 794 spdk_spin_lock(&module->internal.spinlock); 795 module->internal.action_in_progress++; 796 spdk_spin_unlock(&module->internal.spinlock); 797 798 /* Call examine_disk without holding internal.spinlock. 
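			 * The claim list can change while the spinlock is dropped
			 * around examine_disk(), which is why claims released during
			 * the callback (claim->desc == NULL) are not freed here but
			 * only swept once examine_in_progress drops back to zero a
			 * few lines below.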
*/ 799 spdk_spin_unlock(&bdev->internal.spinlock); 800 module->examine_disk(bdev); 801 spdk_spin_lock(&bdev->internal.spinlock); 802 } 803 804 assert(bdev->internal.examine_in_progress > 0); 805 bdev->internal.examine_in_progress--; 806 if (bdev->internal.examine_in_progress == 0) { 807 /* Remove any claims that were released during examine_disk */ 808 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 809 if (claim->desc != NULL) { 810 continue; 811 } 812 813 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 814 free(claim); 815 } 816 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 817 claim_reset(bdev); 818 } 819 } 820 } 821 822 spdk_spin_unlock(&bdev->internal.spinlock); 823 } 824 825 int 826 spdk_bdev_examine(const char *name) 827 { 828 struct spdk_bdev *bdev; 829 struct spdk_bdev_examine_item *item; 830 struct spdk_thread *thread = spdk_get_thread(); 831 832 if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) { 833 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 834 thread ? spdk_thread_get_name(thread) : "null"); 835 return -EINVAL; 836 } 837 838 if (g_bdev_opts.bdev_auto_examine) { 839 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n"); 840 return -EINVAL; 841 } 842 843 if (bdev_examine_allowlist_check(name)) { 844 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 845 return -EEXIST; 846 } 847 848 item = calloc(1, sizeof(*item)); 849 if (!item) { 850 return -ENOMEM; 851 } 852 item->name = strdup(name); 853 if (!item->name) { 854 free(item); 855 return -ENOMEM; 856 } 857 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 858 859 bdev = spdk_bdev_get_by_name(name); 860 if (bdev) { 861 bdev_examine(bdev); 862 } 863 return 0; 864 } 865 866 static inline void 867 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 868 { 869 struct spdk_bdev_examine_item *item; 870 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 871 spdk_json_write_object_begin(w); 872 spdk_json_write_named_string(w, "method", "bdev_examine"); 873 spdk_json_write_named_object_begin(w, "params"); 874 spdk_json_write_named_string(w, "name", item->name); 875 spdk_json_write_object_end(w); 876 spdk_json_write_object_end(w); 877 } 878 } 879 880 struct spdk_bdev * 881 spdk_bdev_first(void) 882 { 883 struct spdk_bdev *bdev; 884 885 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 886 if (bdev) { 887 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 888 } 889 890 return bdev; 891 } 892 893 struct spdk_bdev * 894 spdk_bdev_next(struct spdk_bdev *prev) 895 { 896 struct spdk_bdev *bdev; 897 898 bdev = TAILQ_NEXT(prev, internal.link); 899 if (bdev) { 900 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 901 } 902 903 return bdev; 904 } 905 906 static struct spdk_bdev * 907 _bdev_next_leaf(struct spdk_bdev *bdev) 908 { 909 while (bdev != NULL) { 910 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 911 return bdev; 912 } else { 913 bdev = TAILQ_NEXT(bdev, internal.link); 914 } 915 } 916 917 return bdev; 918 } 919 920 struct spdk_bdev * 921 spdk_bdev_first_leaf(void) 922 { 923 struct spdk_bdev *bdev; 924 925 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 926 927 if (bdev) { 928 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 929 } 930 931 return bdev; 932 } 933 934 struct spdk_bdev * 935 spdk_bdev_next_leaf(struct spdk_bdev *prev) 936 { 937 struct spdk_bdev *bdev; 938 939 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 940 941 if 
(bdev) { 942 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 943 } 944 945 return bdev; 946 } 947 948 static inline bool 949 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 950 { 951 return bdev_io->internal.f.has_memory_domain; 952 } 953 954 static inline bool 955 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 956 { 957 return bdev_io->internal.f.has_accel_sequence; 958 } 959 960 static inline void 961 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource, 962 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 963 { 964 /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io. 965 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth 966 * channels we will instead wait for half to complete. 967 */ 968 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 969 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 970 971 assert(state != BDEV_IO_RETRY_STATE_INVALID); 972 bdev_io->internal.retry_state = state; 973 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 974 } 975 976 static inline void 977 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource, 978 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 979 { 980 /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while 981 * the queue isn't empty, so we don't need to update the nomem_threshold here */ 982 assert(!TAILQ_EMPTY(&shared_resource->nomem_io)); 983 984 assert(state != BDEV_IO_RETRY_STATE_INVALID); 985 bdev_io->internal.retry_state = state; 986 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 987 } 988 989 void 990 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 991 { 992 struct iovec *iovs; 993 994 if (bdev_io->u.bdev.iovs == NULL) { 995 bdev_io->u.bdev.iovs = &bdev_io->iov; 996 bdev_io->u.bdev.iovcnt = 1; 997 } 998 999 iovs = bdev_io->u.bdev.iovs; 1000 1001 assert(iovs != NULL); 1002 assert(bdev_io->u.bdev.iovcnt >= 1); 1003 1004 iovs[0].iov_base = buf; 1005 iovs[0].iov_len = len; 1006 } 1007 1008 void 1009 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1010 { 1011 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 1012 bdev_io->u.bdev.md_buf = md_buf; 1013 } 1014 1015 static bool 1016 _is_buf_allocated(const struct iovec *iovs) 1017 { 1018 if (iovs == NULL) { 1019 return false; 1020 } 1021 1022 return iovs[0].iov_base != NULL; 1023 } 1024 1025 static bool 1026 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 1027 { 1028 int i; 1029 uintptr_t iov_base; 1030 1031 if (spdk_likely(alignment == 1)) { 1032 return true; 1033 } 1034 1035 for (i = 0; i < iovcnt; i++) { 1036 iov_base = (uintptr_t)iovs[i].iov_base; 1037 if ((iov_base & (alignment - 1)) != 0) { 1038 return false; 1039 } 1040 } 1041 1042 return true; 1043 } 1044 1045 static inline bool 1046 bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 1047 { 1048 if (!bdev_io_use_accel_sequence(bdev_io)) { 1049 return false; 1050 } 1051 1052 /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if 1053 * bdev module didn't support accel sequences */ 1054 return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split; 1055 } 1056 1057 static inline void 1058 
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch, 1059 struct spdk_bdev_shared_resource *shared_resource) 1060 { 1061 bdev_ch->io_outstanding++; 1062 shared_resource->io_outstanding++; 1063 } 1064 1065 static inline void 1066 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1067 struct spdk_bdev_shared_resource *shared_resource) 1068 { 1069 assert(bdev_ch->io_outstanding > 0); 1070 assert(shared_resource->io_outstanding > 0); 1071 bdev_ch->io_outstanding--; 1072 shared_resource->io_outstanding--; 1073 } 1074 1075 static void 1076 bdev_io_submit_sequence_cb(void *ctx, int status) 1077 { 1078 struct spdk_bdev_io *bdev_io = ctx; 1079 1080 assert(bdev_io_use_accel_sequence(bdev_io)); 1081 1082 bdev_io->u.bdev.accel_sequence = NULL; 1083 bdev_io->internal.f.has_accel_sequence = false; 1084 1085 if (spdk_unlikely(status != 0)) { 1086 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 1087 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1088 bdev_io_complete_unsubmitted(bdev_io); 1089 return; 1090 } 1091 1092 bdev_io_submit(bdev_io); 1093 } 1094 1095 static void 1096 bdev_io_exec_sequence_cb(void *ctx, int status) 1097 { 1098 struct spdk_bdev_io *bdev_io = ctx; 1099 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1100 1101 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1102 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1103 1104 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1105 bdev_ch_retry_io(ch); 1106 } 1107 1108 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1109 } 1110 1111 static void 1112 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status)) 1113 { 1114 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1115 1116 assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1117 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1118 assert(bdev_io_use_accel_sequence(bdev_io)); 1119 1120 /* Since the operations are appended during submission, they're in the opposite order than 1121 * how we want to execute them for reads (i.e. we need to execute the most recently added 1122 * operation first), so reverse the sequence before executing it. 
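	 * As a tiny illustration: if operation A was appended first and B second
	 * while building a read, the stored order is A, B, but B must run first
	 * once the data comes back, so the sequence is reversed to B, A before
	 * spdk_accel_sequence_finish() is called below.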
1123 */ 1124 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1125 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1126 } 1127 1128 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1129 bdev_io_increment_outstanding(ch, ch->shared_resource); 1130 bdev_io->internal.data_transfer_cpl = cb_fn; 1131 1132 spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, 1133 bdev_io_exec_sequence_cb, bdev_io); 1134 } 1135 1136 static void 1137 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1138 { 1139 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1140 void *buf; 1141 1142 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1143 buf = bdev_io->internal.buf.ptr; 1144 bdev_io->internal.buf.ptr = NULL; 1145 bdev_io->internal.f.has_buf = false; 1146 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1147 bdev_io->internal.get_aux_buf_cb = NULL; 1148 } else { 1149 assert(bdev_io->internal.get_buf_cb != NULL); 1150 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1151 bdev_io->internal.get_buf_cb = NULL; 1152 } 1153 } 1154 1155 static void 1156 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1157 { 1158 struct spdk_bdev_io *bdev_io = ctx; 1159 1160 if (rc) { 1161 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1162 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1163 } 1164 bdev_io_get_buf_complete(bdev_io, !rc); 1165 } 1166 1167 static void 1168 bdev_io_pull_md_buf_done(void *ctx, int status) 1169 { 1170 struct spdk_bdev_io *bdev_io = ctx; 1171 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1172 1173 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1174 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1175 1176 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1177 bdev_ch_retry_io(ch); 1178 } 1179 1180 assert(bdev_io->internal.data_transfer_cpl); 1181 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1182 } 1183 1184 static void 1185 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io) 1186 { 1187 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1188 int rc = 0; 1189 1190 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1191 assert(bdev_io->internal.f.has_bounce_buf); 1192 if (bdev_io_use_memory_domain(bdev_io)) { 1193 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1194 bdev_io_increment_outstanding(ch, ch->shared_resource); 1195 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1196 bdev_io->internal.memory_domain_ctx, 1197 &bdev_io->internal.bounce_buf.orig_md_iov, 1, 1198 &bdev_io->internal.bounce_buf.md_iov, 1, 1199 bdev_io_pull_md_buf_done, bdev_io); 1200 if (rc == 0) { 1201 /* Continue to submit IO in completion callback */ 1202 return; 1203 } 1204 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1205 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1206 if (rc != -ENOMEM) { 1207 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1208 spdk_memory_domain_get_dma_device_id( 1209 bdev_io->internal.memory_domain), rc); 1210 } 1211 } else { 1212 memcpy(bdev_io->internal.bounce_buf.md_iov.iov_base, 1213 bdev_io->internal.bounce_buf.orig_md_iov.iov_base, 1214 bdev_io->internal.bounce_buf.orig_md_iov.iov_len); 1215 } 1216 } 1217 1218 if (spdk_unlikely(rc == -ENOMEM)) { 1219 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD); 1220 } else { 1221 assert(bdev_io->internal.data_transfer_cpl); 1222 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1223 } 
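	/*
	 * Note on the -ENOMEM path above: the I/O is parked on the shared
	 * resource's nomem_io queue with retry_state set to
	 * BDEV_IO_RETRY_STATE_PULL_MD, and bdev_shared_ch_retry_io() later
	 * re-enters this function through its switch on internal.retry_state:
	 *
	 *   case BDEV_IO_RETRY_STATE_PULL_MD:
	 *           bdev_io_pull_md_buf(bdev_io);
	 *           break;
	 */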
1224 } 1225 1226 static void 1227 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1228 { 1229 assert(bdev_io->internal.f.has_bounce_buf); 1230 1231 /* save original md_buf */ 1232 bdev_io->internal.bounce_buf.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1233 bdev_io->internal.bounce_buf.orig_md_iov.iov_len = len; 1234 bdev_io->internal.bounce_buf.md_iov.iov_base = md_buf; 1235 bdev_io->internal.bounce_buf.md_iov.iov_len = len; 1236 /* set bounce md_buf */ 1237 bdev_io->u.bdev.md_buf = md_buf; 1238 1239 bdev_io_pull_md_buf(bdev_io); 1240 } 1241 1242 static void 1243 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1244 { 1245 struct spdk_bdev *bdev = bdev_io->bdev; 1246 uint64_t md_len; 1247 void *buf; 1248 1249 if (spdk_bdev_is_md_separate(bdev)) { 1250 assert(!bdev_io_use_accel_sequence(bdev_io)); 1251 1252 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1253 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1254 1255 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1256 1257 if (bdev_io->u.bdev.md_buf != NULL) { 1258 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1259 return; 1260 } else { 1261 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1262 } 1263 } 1264 1265 bdev_io_get_buf_complete(bdev_io, true); 1266 } 1267 1268 static inline void 1269 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc) 1270 { 1271 if (rc) { 1272 SPDK_ERRLOG("Failed to get data buffer\n"); 1273 assert(bdev_io->internal.data_transfer_cpl); 1274 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1275 return; 1276 } 1277 1278 _bdev_io_set_md_buf(bdev_io); 1279 } 1280 1281 static void 1282 bdev_io_pull_data_done_and_track(void *ctx, int status) 1283 { 1284 struct spdk_bdev_io *bdev_io = ctx; 1285 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1286 1287 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1288 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1289 1290 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1291 bdev_ch_retry_io(ch); 1292 } 1293 1294 bdev_io_pull_data_done(bdev_io, status); 1295 } 1296 1297 static void 1298 bdev_io_pull_data(struct spdk_bdev_io *bdev_io) 1299 { 1300 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1301 int rc = 0; 1302 1303 /* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a 1304 * sequence, append a copy operation making accel change the src/dst buffers of the previous 1305 * operation */ 1306 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) || 1307 (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) { 1308 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1309 assert(bdev_io_use_accel_sequence(bdev_io)); 1310 assert(bdev_io->internal.f.has_bounce_buf); 1311 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1312 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1313 NULL, NULL, 1314 bdev_io->internal.bounce_buf.orig_iovs, 1315 bdev_io->internal.bounce_buf.orig_iovcnt, 1316 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 1317 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 1318 NULL, NULL); 1319 } else { 1320 /* We need to reverse the src/dst for reads */ 1321 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1322 assert(bdev_io_use_accel_sequence(bdev_io)); 1323 assert(bdev_io->internal.f.has_bounce_buf); 1324 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1325 bdev_io->internal.bounce_buf.orig_iovs, 1326 bdev_io->internal.bounce_buf.orig_iovcnt, 1327 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 1328 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 1329 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1330 NULL, NULL, NULL, NULL); 1331 } 1332 1333 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 1334 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1335 bdev_io->internal.accel_sequence); 1336 } 1337 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1338 /* if this is write path, copy data from original buffer to bounce buffer */ 1339 if (bdev_io_use_memory_domain(bdev_io)) { 1340 assert(bdev_io->internal.f.has_bounce_buf); 1341 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1342 bdev_io_increment_outstanding(ch, ch->shared_resource); 1343 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1344 bdev_io->internal.memory_domain_ctx, 1345 bdev_io->internal.bounce_buf.orig_iovs, 1346 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1347 bdev_io->u.bdev.iovs, 1, 1348 bdev_io_pull_data_done_and_track, 1349 bdev_io); 1350 if (rc == 0) { 1351 /* Continue to submit IO in completion callback */ 1352 return; 1353 } 1354 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1355 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1356 if (rc != -ENOMEM) { 1357 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1358 spdk_memory_domain_get_dma_device_id( 1359 bdev_io->internal.memory_domain)); 1360 } 1361 } else { 1362 assert(bdev_io->u.bdev.iovcnt == 1); 1363 assert(bdev_io->internal.f.has_bounce_buf); 1364 spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base, 1365 bdev_io->u.bdev.iovs[0].iov_len, 1366 bdev_io->internal.bounce_buf.orig_iovs, 1367 bdev_io->internal.bounce_buf.orig_iovcnt); 1368 } 1369 } 1370 1371 if (spdk_unlikely(rc == -ENOMEM)) { 1372 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1373 } else { 1374 bdev_io_pull_data_done(bdev_io, rc); 1375 } 1376 } 1377 1378 static void 1379 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1380 bdev_copy_bounce_buffer_cpl cpl_cb) 1381 { 1382 struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource; 1383 1384 assert(bdev_io->internal.f.has_bounce_buf == false); 1385 1386 bdev_io->internal.data_transfer_cpl = cpl_cb; 1387 bdev_io->internal.f.has_bounce_buf = true; 1388 /* save original iovec */ 1389 bdev_io->internal.bounce_buf.orig_iovs = bdev_io->u.bdev.iovs; 1390 bdev_io->internal.bounce_buf.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1391 /* zero the other data members */ 1392 bdev_io->internal.bounce_buf.iov.iov_base = NULL; 1393 bdev_io->internal.bounce_buf.md_iov.iov_base = NULL; 1394 bdev_io->internal.bounce_buf.orig_md_iov.iov_base = NULL; 1395 /* set bounce iov */ 1396 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_buf.iov; 1397 bdev_io->u.bdev.iovcnt = 1; 1398 /* set bounce buffer for this operation */ 1399 bdev_io->u.bdev.iovs[0].iov_base = buf; 1400 bdev_io->u.bdev.iovs[0].iov_len = len; 1401 /* Now we 
use 1 iov, the split condition could have been changed */ 1402 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 1403 1404 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1405 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1406 } else { 1407 bdev_io_pull_data(bdev_io); 1408 } 1409 } 1410 1411 static void 1412 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1413 { 1414 struct spdk_bdev *bdev = bdev_io->bdev; 1415 bool buf_allocated; 1416 uint64_t alignment; 1417 void *aligned_buf; 1418 1419 bdev_io->internal.buf.ptr = buf; 1420 bdev_io->internal.f.has_buf = true; 1421 1422 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1423 bdev_io_get_buf_complete(bdev_io, true); 1424 return; 1425 } 1426 1427 alignment = spdk_bdev_get_buf_align(bdev); 1428 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1429 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1430 1431 if (buf_allocated) { 1432 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1433 /* Continue in completion callback */ 1434 return; 1435 } else { 1436 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1437 } 1438 1439 _bdev_io_set_md_buf(bdev_io); 1440 } 1441 1442 static inline uint64_t 1443 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1444 { 1445 struct spdk_bdev *bdev = bdev_io->bdev; 1446 uint64_t md_len, alignment; 1447 1448 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1449 1450 /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */ 1451 alignment = spdk_bdev_get_buf_align(bdev) - 1; 1452 1453 return len + alignment + md_len; 1454 } 1455 1456 static void 1457 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1458 { 1459 struct spdk_bdev_mgmt_channel *ch; 1460 1461 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1462 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1463 } 1464 1465 static void 1466 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1467 { 1468 assert(bdev_io->internal.f.has_buf); 1469 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr, bdev_io->internal.buf.len); 1470 bdev_io->internal.buf.ptr = NULL; 1471 bdev_io->internal.f.has_buf = false; 1472 } 1473 1474 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_put_aux_buf, 1475 "spdk_bdev_io_put_aux_buf is deprecated", "v25.01", 0); 1476 1477 void 1478 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1479 { 1480 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1481 1482 SPDK_LOG_DEPRECATED(spdk_bdev_io_put_aux_buf); 1483 1484 assert(buf != NULL); 1485 _bdev_io_put_buf(bdev_io, buf, len); 1486 } 1487 1488 static inline void 1489 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, 1490 struct spdk_bdev_io *bdev_io) 1491 { 1492 /* After a request is submitted to a bdev module, the ownership of an accel sequence 1493 * associated with that bdev_io is transferred to the bdev module. So, clear the internal 1494 * sequence pointer to make sure we won't touch it anymore. 
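	 * The hand-off is undone if the module completes the I/O with
	 * SPDK_BDEV_IO_STATUS_NOMEM: in that case _bdev_io_handle_no_mem() takes
	 * the sequence back from u.bdev.accel_sequence and restores
	 * internal.accel_sequence plus the has_accel_sequence flag before the
	 * I/O is requeued.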
*/ 1495 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || 1496 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { 1497 assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1498 bdev_io->internal.f.has_accel_sequence = false; 1499 } 1500 1501 bdev->fn_table->submit_request(ioch, bdev_io); 1502 } 1503 1504 static inline void 1505 bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io) 1506 { 1507 struct spdk_bdev *bdev = bdev_io->bdev; 1508 1509 bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource); 1510 bdev_io->internal.error.nvme.cdw0 = 0; 1511 bdev_io->num_retries++; 1512 bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1513 } 1514 1515 static void 1516 bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource) 1517 { 1518 struct spdk_bdev_io *bdev_io; 1519 1520 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1521 /* 1522 * Allow some more I/O to complete before retrying the nomem_io queue. 1523 * Some drivers (such as nvme) cannot immediately take a new I/O in 1524 * the context of a completion, because the resources for the I/O are 1525 * not released until control returns to the bdev poller. Also, we 1526 * may require several small I/O to complete before a larger I/O 1527 * (that requires splitting) can be submitted. 1528 */ 1529 return; 1530 } 1531 1532 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1533 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1534 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1535 1536 switch (bdev_io->internal.retry_state) { 1537 case BDEV_IO_RETRY_STATE_SUBMIT: 1538 bdev_ch_resubmit_io(shared_resource, bdev_io); 1539 break; 1540 case BDEV_IO_RETRY_STATE_PULL: 1541 bdev_io_pull_data(bdev_io); 1542 break; 1543 case BDEV_IO_RETRY_STATE_PULL_MD: 1544 bdev_io_pull_md_buf(bdev_io); 1545 break; 1546 case BDEV_IO_RETRY_STATE_PUSH: 1547 bdev_io_push_bounce_data(bdev_io); 1548 break; 1549 case BDEV_IO_RETRY_STATE_PUSH_MD: 1550 bdev_io_push_bounce_md_buf(bdev_io); 1551 break; 1552 default: 1553 assert(0 && "invalid retry state"); 1554 break; 1555 } 1556 1557 if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) { 1558 /* This IO completed again with NOMEM status, so break the loop and 1559 * don't try anymore. Note that a bdev_io that fails with NOMEM 1560 * always gets requeued at the front of the list, to maintain 1561 * ordering. 
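			 * That is also what the comparison against
			 * TAILQ_FIRST(&shared_resource->nomem_io) above detects: if
			 * the I/O we just resubmitted is already back at the head of
			 * the queue, it hit NOMEM again during the retry, so
			 * continuing the loop right now would only spin.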
1562 */ 1563 break; 1564 } 1565 } 1566 } 1567 1568 static void 1569 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1570 { 1571 bdev_shared_ch_retry_io(bdev_ch->shared_resource); 1572 } 1573 1574 static int 1575 bdev_no_mem_poller(void *ctx) 1576 { 1577 struct spdk_bdev_shared_resource *shared_resource = ctx; 1578 1579 spdk_poller_unregister(&shared_resource->nomem_poller); 1580 1581 if (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1582 bdev_shared_ch_retry_io(shared_resource); 1583 } 1584 /* the retry cb may re-register the poller so double check */ 1585 if (!TAILQ_EMPTY(&shared_resource->nomem_io) && 1586 shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) { 1587 /* No IOs were submitted, try again */ 1588 shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource, 1589 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10); 1590 } 1591 1592 return SPDK_POLLER_BUSY; 1593 } 1594 1595 static inline bool 1596 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 1597 { 1598 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1599 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1600 1601 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1602 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1603 bdev_queue_nomem_io_head(shared_resource, bdev_io, state); 1604 1605 if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) { 1606 /* Special case when we have nomem IOs and no outstanding IOs which completions 1607 * could trigger retry of queued IOs 1608 * Any IOs submitted may trigger retry of queued IOs. This poller handles a case when no 1609 * new IOs submitted, e.g. qd==1 */ 1610 shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource, 1611 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10); 1612 } 1613 /* If bdev module completed an I/O that has an accel sequence with NOMEM status, the 1614 * ownership of that sequence is transferred back to the bdev layer, so we need to 1615 * restore internal.accel_sequence to make sure that the sequence is handled 1616 * correctly in case the I/O is later aborted. */ 1617 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1618 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1619 assert(!bdev_io_use_accel_sequence(bdev_io)); 1620 bdev_io->internal.f.has_accel_sequence = true; 1621 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1622 } 1623 1624 return true; 1625 } 1626 1627 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1628 bdev_ch_retry_io(bdev_ch); 1629 } 1630 1631 return false; 1632 } 1633 1634 static void 1635 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1636 { 1637 struct spdk_bdev_io *bdev_io = ctx; 1638 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1639 1640 if (rc) { 1641 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1642 } 1643 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1644 * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()). 
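	 * bdev_io_put_buf() below returns the buffer to the management channel's
	 * spdk_iobuf cache (see the iobuf_small_cache_size /
	 * iobuf_large_cache_size opts near the top of this file), so the buffer
	 * becomes reusable immediately instead of staying attached to the
	 * bdev_io until spdk_bdev_free_io().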
1645 */ 1646 bdev_io_put_buf(bdev_io); 1647 1648 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1649 bdev_ch_retry_io(ch); 1650 } 1651 1652 /* Continue with IO completion flow */ 1653 bdev_io_complete(bdev_io); 1654 } 1655 1656 static void 1657 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1658 { 1659 struct spdk_bdev_io *bdev_io = ctx; 1660 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1661 1662 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1663 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1664 bdev_io->internal.f.has_bounce_buf = false; 1665 1666 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1667 bdev_ch_retry_io(ch); 1668 } 1669 1670 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1671 } 1672 1673 static inline void 1674 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1675 { 1676 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1677 int rc = 0; 1678 1679 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1680 assert(bdev_io->internal.f.has_bounce_buf); 1681 1682 /* do the same for metadata buffer */ 1683 if (spdk_unlikely(bdev_io->internal.bounce_buf.orig_md_iov.iov_base != NULL)) { 1684 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1685 1686 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1687 if (bdev_io_use_memory_domain(bdev_io)) { 1688 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1689 bdev_io_increment_outstanding(ch, ch->shared_resource); 1690 /* If memory domain is used then we need to call async push function */ 1691 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1692 bdev_io->internal.memory_domain_ctx, 1693 &bdev_io->internal.bounce_buf.orig_md_iov, 1694 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1695 &bdev_io->internal.bounce_buf.md_iov, 1, 1696 bdev_io_push_bounce_md_buf_done, 1697 bdev_io); 1698 if (rc == 0) { 1699 /* Continue IO completion in async callback */ 1700 return; 1701 } 1702 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1703 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1704 if (rc != -ENOMEM) { 1705 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1706 spdk_memory_domain_get_dma_device_id( 1707 bdev_io->internal.memory_domain)); 1708 } 1709 } else { 1710 memcpy(bdev_io->internal.bounce_buf.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1711 bdev_io->internal.bounce_buf.orig_md_iov.iov_len); 1712 } 1713 } 1714 } 1715 1716 if (spdk_unlikely(rc == -ENOMEM)) { 1717 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1718 } else { 1719 assert(bdev_io->internal.data_transfer_cpl); 1720 bdev_io->internal.f.has_bounce_buf = false; 1721 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1722 } 1723 } 1724 1725 static inline void 1726 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc) 1727 { 1728 assert(bdev_io->internal.data_transfer_cpl); 1729 if (rc) { 1730 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1731 return; 1732 } 1733 1734 /* set original buffer for this io */ 1735 bdev_io->u.bdev.iovcnt = bdev_io->internal.bounce_buf.orig_iovcnt; 1736 bdev_io->u.bdev.iovs = bdev_io->internal.bounce_buf.orig_iovs; 1737 1738 /* We don't set bdev_io->internal.f.has_bounce_buf to false here because 1739 * we still need to clear the md buf */ 1740 1741 bdev_io_push_bounce_md_buf(bdev_io); 1742 } 1743 1744 static void 1745 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1746 { 1747 struct spdk_bdev_io *bdev_io = ctx; 1748 struct 
spdk_bdev_channel *ch = bdev_io->internal.ch; 1749 1750 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1751 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1752 1753 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1754 bdev_ch_retry_io(ch); 1755 } 1756 1757 bdev_io_push_bounce_data_done(bdev_io, status); 1758 } 1759 1760 static inline void 1761 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1762 { 1763 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1764 int rc = 0; 1765 1766 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1767 assert(!bdev_io_use_accel_sequence(bdev_io)); 1768 assert(bdev_io->internal.f.has_bounce_buf); 1769 1770 /* if this is read path, copy data from bounce buffer to original buffer */ 1771 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1772 if (bdev_io_use_memory_domain(bdev_io)) { 1773 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1774 bdev_io_increment_outstanding(ch, ch->shared_resource); 1775 /* If memory domain is used then we need to call async push function */ 1776 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1777 bdev_io->internal.memory_domain_ctx, 1778 bdev_io->internal.bounce_buf.orig_iovs, 1779 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1780 &bdev_io->internal.bounce_buf.iov, 1, 1781 bdev_io_push_bounce_data_done_and_track, 1782 bdev_io); 1783 if (rc == 0) { 1784 /* Continue IO completion in async callback */ 1785 return; 1786 } 1787 1788 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1789 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1790 if (rc != -ENOMEM) { 1791 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1792 spdk_memory_domain_get_dma_device_id( 1793 bdev_io->internal.memory_domain)); 1794 } 1795 } else { 1796 spdk_copy_buf_to_iovs(bdev_io->internal.bounce_buf.orig_iovs, 1797 bdev_io->internal.bounce_buf.orig_iovcnt, 1798 bdev_io->internal.bounce_buf.iov.iov_base, 1799 bdev_io->internal.bounce_buf.iov.iov_len); 1800 } 1801 } 1802 1803 if (spdk_unlikely(rc == -ENOMEM)) { 1804 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1805 } else { 1806 bdev_io_push_bounce_data_done(bdev_io, rc); 1807 } 1808 } 1809 1810 static inline void 1811 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1812 { 1813 bdev_io->internal.data_transfer_cpl = cpl_cb; 1814 bdev_io_push_bounce_data(bdev_io); 1815 } 1816 1817 static void 1818 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1819 { 1820 struct spdk_bdev_io *bdev_io; 1821 1822 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1823 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len); 1824 } 1825 1826 static void 1827 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1828 { 1829 struct spdk_bdev_mgmt_channel *mgmt_ch; 1830 uint64_t max_len; 1831 void *buf; 1832 1833 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1834 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1835 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1836 1837 if (spdk_unlikely(max_len > mgmt_ch->iobuf.cache[0].large.bufsize)) { 1838 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1839 bdev_io_get_buf_complete(bdev_io, false); 1840 return; 1841 } 1842 1843 bdev_io->internal.buf.len = len; 1844 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1845 bdev_io_get_iobuf_cb); 1846 if (buf != NULL) { 1847 
_bdev_io_set_buf(bdev_io, buf, len); 1848 } 1849 } 1850 1851 void 1852 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1853 { 1854 struct spdk_bdev *bdev = bdev_io->bdev; 1855 uint64_t alignment; 1856 1857 assert(cb != NULL); 1858 bdev_io->internal.get_buf_cb = cb; 1859 1860 alignment = spdk_bdev_get_buf_align(bdev); 1861 1862 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1863 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1864 /* Buffer already present and aligned */ 1865 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1866 return; 1867 } 1868 1869 bdev_io_get_buf(bdev_io, len); 1870 } 1871 1872 static void 1873 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1874 bool success) 1875 { 1876 if (!success) { 1877 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1878 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1879 bdev_io_complete_unsubmitted(bdev_io); 1880 return; 1881 } 1882 1883 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1884 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1885 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1886 return; 1887 } 1888 /* For reads we'll execute the sequence after the data is read, so, for now, only 1889 * clear out accel_sequence pointer and submit the IO */ 1890 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1891 bdev_io->u.bdev.accel_sequence = NULL; 1892 } 1893 1894 bdev_io_submit(bdev_io); 1895 } 1896 1897 static void 1898 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1899 uint64_t len) 1900 { 1901 assert(cb != NULL); 1902 bdev_io->internal.get_buf_cb = cb; 1903 1904 bdev_io_get_buf(bdev_io, len); 1905 } 1906 1907 1908 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_get_aux_buf, 1909 "spdk_bdev_io_get_aux_buf is deprecated", "v25.01", 0); 1910 1911 void 1912 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1913 { 1914 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1915 1916 SPDK_LOG_DEPRECATED(spdk_bdev_io_get_aux_buf); 1917 1918 assert(cb != NULL); 1919 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1920 bdev_io->internal.get_aux_buf_cb = cb; 1921 bdev_io_get_buf(bdev_io, len); 1922 } 1923 1924 static int 1925 bdev_module_get_max_ctx_size(void) 1926 { 1927 struct spdk_bdev_module *bdev_module; 1928 int max_bdev_module_size = 0; 1929 1930 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1931 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1932 max_bdev_module_size = bdev_module->get_ctx_size(); 1933 } 1934 } 1935 1936 return max_bdev_module_size; 1937 } 1938 1939 static void 1940 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1941 { 1942 if (!bdev->internal.histogram_enabled) { 1943 return; 1944 } 1945 1946 spdk_json_write_object_begin(w); 1947 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 1948 1949 spdk_json_write_named_object_begin(w, "params"); 1950 spdk_json_write_named_string(w, "name", bdev->name); 1951 1952 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1953 1954 if (bdev->internal.histogram_io_type) { 1955 spdk_json_write_named_string(w, "opc", 1956 spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type)); 1957 } 1958 1959 spdk_json_write_object_end(w); 1960 1961 spdk_json_write_object_end(w); 1962 } 1963 
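/*
 * bdev_qos_config_json() below replays a bdev's QoS settings as a single
 * "bdev_set_qos_limit" RPC object so that a saved configuration restores the
 * same limits. As an illustration only (the bdev name and the numeric value
 * are hypothetical, not taken from this file), the emitted entry resembles:
 *
 *   {
 *     "method": "bdev_set_qos_limit",
 *     "params": { "name": "Malloc0", "rw_ios_per_sec": 20000 }
 *   }
 *
 * The parameter keys come from the qos_rpc_type[] table used below; limits
 * that are unset (zero) are omitted from "params".
 */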
1964 static void 1965 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1966 { 1967 int i; 1968 struct spdk_bdev_qos *qos = bdev->internal.qos; 1969 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1970 1971 if (!qos) { 1972 return; 1973 } 1974 1975 spdk_bdev_get_qos_rate_limits(bdev, limits); 1976 1977 spdk_json_write_object_begin(w); 1978 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1979 1980 spdk_json_write_named_object_begin(w, "params"); 1981 spdk_json_write_named_string(w, "name", bdev->name); 1982 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1983 if (limits[i] > 0) { 1984 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1985 } 1986 } 1987 spdk_json_write_object_end(w); 1988 1989 spdk_json_write_object_end(w); 1990 } 1991 1992 void 1993 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1994 { 1995 struct spdk_bdev_module *bdev_module; 1996 struct spdk_bdev *bdev; 1997 1998 assert(w != NULL); 1999 2000 spdk_json_write_array_begin(w); 2001 2002 spdk_json_write_object_begin(w); 2003 spdk_json_write_named_string(w, "method", "bdev_set_options"); 2004 spdk_json_write_named_object_begin(w, "params"); 2005 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 2006 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 2007 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 2008 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 2009 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 2010 spdk_json_write_object_end(w); 2011 spdk_json_write_object_end(w); 2012 2013 bdev_examine_allowlist_config_json(w); 2014 2015 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2016 if (bdev_module->config_json) { 2017 bdev_module->config_json(w); 2018 } 2019 } 2020 2021 spdk_spin_lock(&g_bdev_mgr.spinlock); 2022 2023 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 2024 if (bdev->fn_table->write_config_json) { 2025 bdev->fn_table->write_config_json(bdev, w); 2026 } 2027 2028 bdev_qos_config_json(bdev, w); 2029 bdev_enable_histogram_config_json(bdev, w); 2030 } 2031 2032 spdk_spin_unlock(&g_bdev_mgr.spinlock); 2033 2034 /* This has to be last RPC in array to make sure all bdevs finished examine */ 2035 spdk_json_write_object_begin(w); 2036 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 2037 spdk_json_write_object_end(w); 2038 2039 spdk_json_write_array_end(w); 2040 } 2041 2042 static void 2043 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 2044 { 2045 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2046 struct spdk_bdev_io *bdev_io; 2047 2048 spdk_iobuf_channel_fini(&ch->iobuf); 2049 2050 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 2051 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2052 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2053 ch->per_thread_cache_count--; 2054 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2055 } 2056 2057 assert(ch->per_thread_cache_count == 0); 2058 } 2059 2060 static int 2061 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 2062 { 2063 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2064 struct spdk_bdev_io *bdev_io; 2065 uint32_t i; 2066 int rc; 2067 2068 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 2069 g_bdev_opts.iobuf_small_cache_size, 2070 g_bdev_opts.iobuf_large_cache_size); 2071 if (rc != 0) { 2072 
SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 2073 return -1; 2074 } 2075 2076 STAILQ_INIT(&ch->per_thread_cache); 2077 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 2078 2079 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 2080 ch->per_thread_cache_count = 0; 2081 for (i = 0; i < ch->bdev_io_cache_size; i++) { 2082 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2083 if (bdev_io == NULL) { 2084 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 2085 assert(false); 2086 bdev_mgmt_channel_destroy(io_device, ctx_buf); 2087 return -1; 2088 } 2089 ch->per_thread_cache_count++; 2090 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2091 } 2092 2093 TAILQ_INIT(&ch->shared_resources); 2094 TAILQ_INIT(&ch->io_wait_queue); 2095 2096 return 0; 2097 } 2098 2099 static void 2100 bdev_init_complete(int rc) 2101 { 2102 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2103 void *cb_arg = g_init_cb_arg; 2104 struct spdk_bdev_module *m; 2105 2106 g_bdev_mgr.init_complete = true; 2107 g_init_cb_fn = NULL; 2108 g_init_cb_arg = NULL; 2109 2110 /* 2111 * For modules that need to know when subsystem init is complete, 2112 * inform them now. 2113 */ 2114 if (rc == 0) { 2115 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2116 if (m->init_complete) { 2117 m->init_complete(); 2118 } 2119 } 2120 } 2121 2122 cb_fn(cb_arg, rc); 2123 } 2124 2125 static bool 2126 bdev_module_all_actions_completed(void) 2127 { 2128 struct spdk_bdev_module *m; 2129 2130 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2131 if (m->internal.action_in_progress > 0) { 2132 return false; 2133 } 2134 } 2135 return true; 2136 } 2137 2138 static void 2139 bdev_module_action_complete(void) 2140 { 2141 /* 2142 * Don't finish bdev subsystem initialization if 2143 * module pre-initialization is still in progress, or 2144 * the subsystem has already been initialized. 2145 */ 2146 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2147 return; 2148 } 2149 2150 /* 2151 * Check all bdev modules for inits/examinations in progress. If any 2152 * exist, return immediately since we cannot finish bdev subsystem 2153 * initialization until all are completed. 2154 */ 2155 if (!bdev_module_all_actions_completed()) { 2156 return; 2157 } 2158 2159 /* 2160 * Modules have already finished initialization - now that all 2161 * the bdev modules have finished their asynchronous I/O 2162 * processing, the entire bdev layer can be marked as complete.
2163 */ 2164 bdev_init_complete(0); 2165 } 2166 2167 static void 2168 bdev_module_action_done(struct spdk_bdev_module *module) 2169 { 2170 spdk_spin_lock(&module->internal.spinlock); 2171 assert(module->internal.action_in_progress > 0); 2172 module->internal.action_in_progress--; 2173 spdk_spin_unlock(&module->internal.spinlock); 2174 bdev_module_action_complete(); 2175 } 2176 2177 void 2178 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2179 { 2180 assert(module->async_init); 2181 bdev_module_action_done(module); 2182 } 2183 2184 void 2185 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2186 { 2187 bdev_module_action_done(module); 2188 } 2189 2190 /** The last initialized bdev module */ 2191 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2192 2193 static void 2194 bdev_init_failed(void *cb_arg) 2195 { 2196 struct spdk_bdev_module *module = cb_arg; 2197 2198 spdk_spin_lock(&module->internal.spinlock); 2199 assert(module->internal.action_in_progress > 0); 2200 module->internal.action_in_progress--; 2201 spdk_spin_unlock(&module->internal.spinlock); 2202 bdev_init_complete(-1); 2203 } 2204 2205 static int 2206 bdev_modules_init(void) 2207 { 2208 struct spdk_bdev_module *module; 2209 int rc = 0; 2210 2211 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2212 g_resume_bdev_module = module; 2213 if (module->async_init) { 2214 spdk_spin_lock(&module->internal.spinlock); 2215 module->internal.action_in_progress = 1; 2216 spdk_spin_unlock(&module->internal.spinlock); 2217 } 2218 rc = module->module_init(); 2219 if (rc != 0) { 2220 /* Bump action_in_progress to prevent other modules from completion of modules_init 2221 * Send message to defer application shutdown until resources are cleaned up */ 2222 spdk_spin_lock(&module->internal.spinlock); 2223 module->internal.action_in_progress = 1; 2224 spdk_spin_unlock(&module->internal.spinlock); 2225 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2226 return rc; 2227 } 2228 } 2229 2230 g_resume_bdev_module = NULL; 2231 return 0; 2232 } 2233 2234 void 2235 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2236 { 2237 int rc = 0; 2238 char mempool_name[32]; 2239 2240 assert(cb_fn != NULL); 2241 2242 g_init_cb_fn = cb_fn; 2243 g_init_cb_arg = cb_arg; 2244 2245 spdk_notify_type_register("bdev_register"); 2246 spdk_notify_type_register("bdev_unregister"); 2247 2248 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2249 2250 rc = spdk_iobuf_register_module("bdev"); 2251 if (rc != 0) { 2252 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2253 bdev_init_complete(-1); 2254 return; 2255 } 2256 2257 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2258 g_bdev_opts.bdev_io_pool_size, 2259 sizeof(struct spdk_bdev_io) + 2260 bdev_module_get_max_ctx_size(), 2261 0, 2262 SPDK_ENV_NUMA_ID_ANY); 2263 2264 if (g_bdev_mgr.bdev_io_pool == NULL) { 2265 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2266 bdev_init_complete(-1); 2267 return; 2268 } 2269 2270 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2271 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2272 if (!g_bdev_mgr.zero_buffer) { 2273 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2274 bdev_init_complete(-1); 2275 return; 2276 } 2277 2278 #ifdef SPDK_CONFIG_VTUNE 2279 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2280 #endif 2281 2282 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2283 
bdev_mgmt_channel_destroy, 2284 sizeof(struct spdk_bdev_mgmt_channel), 2285 "bdev_mgr"); 2286 2287 rc = bdev_modules_init(); 2288 g_bdev_mgr.module_init_complete = true; 2289 if (rc != 0) { 2290 SPDK_ERRLOG("bdev modules init failed\n"); 2291 return; 2292 } 2293 2294 bdev_module_action_complete(); 2295 } 2296 2297 static void 2298 bdev_mgr_unregister_cb(void *io_device) 2299 { 2300 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2301 2302 if (g_bdev_mgr.bdev_io_pool) { 2303 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2304 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2305 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2306 g_bdev_opts.bdev_io_pool_size); 2307 } 2308 2309 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2310 } 2311 2312 spdk_free(g_bdev_mgr.zero_buffer); 2313 2314 bdev_examine_allowlist_free(); 2315 2316 cb_fn(g_fini_cb_arg); 2317 g_fini_cb_fn = NULL; 2318 g_fini_cb_arg = NULL; 2319 g_bdev_mgr.init_complete = false; 2320 g_bdev_mgr.module_init_complete = false; 2321 } 2322 2323 static void 2324 bdev_module_fini_iter(void *arg) 2325 { 2326 struct spdk_bdev_module *bdev_module; 2327 2328 /* FIXME: Handling initialization failures is broken now, 2329 * so we won't even try cleaning up after successfully 2330 * initialized modules. if module_init_complete is false, 2331 * just call spdk_bdev_mgr_unregister_cb 2332 */ 2333 if (!g_bdev_mgr.module_init_complete) { 2334 bdev_mgr_unregister_cb(NULL); 2335 return; 2336 } 2337 2338 /* Start iterating from the last touched module */ 2339 if (!g_resume_bdev_module) { 2340 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2341 } else { 2342 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2343 internal.tailq); 2344 } 2345 2346 while (bdev_module) { 2347 if (bdev_module->async_fini) { 2348 /* Save our place so we can resume later. We must 2349 * save the variable here, before calling module_fini() 2350 * below, because in some cases the module may immediately 2351 * call spdk_bdev_module_fini_done() and re-enter 2352 * this function to continue iterating. */ 2353 g_resume_bdev_module = bdev_module; 2354 } 2355 2356 if (bdev_module->module_fini) { 2357 bdev_module->module_fini(); 2358 } 2359 2360 if (bdev_module->async_fini) { 2361 return; 2362 } 2363 2364 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2365 internal.tailq); 2366 } 2367 2368 g_resume_bdev_module = NULL; 2369 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2370 } 2371 2372 void 2373 spdk_bdev_module_fini_done(void) 2374 { 2375 if (spdk_get_thread() != g_fini_thread) { 2376 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2377 } else { 2378 bdev_module_fini_iter(NULL); 2379 } 2380 } 2381 2382 static void 2383 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2384 { 2385 struct spdk_bdev *bdev = cb_arg; 2386 2387 if (bdeverrno && bdev) { 2388 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2389 bdev->name); 2390 2391 /* 2392 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2393 * bdev; try to continue by manually removing this bdev from the list and continue 2394 * with the next bdev in the list. 
2395 */ 2396 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2397 } 2398 2399 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2400 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2401 /* 2402 * Bdev module finish need to be deferred as we might be in the middle of some context 2403 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2404 * after returning. 2405 */ 2406 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2407 return; 2408 } 2409 2410 /* 2411 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2412 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2413 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2414 * base bdevs. 2415 * 2416 * Also, walk the list in the reverse order. 2417 */ 2418 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2419 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2420 spdk_spin_lock(&bdev->internal.spinlock); 2421 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2422 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2423 spdk_spin_unlock(&bdev->internal.spinlock); 2424 continue; 2425 } 2426 spdk_spin_unlock(&bdev->internal.spinlock); 2427 2428 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2429 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2430 return; 2431 } 2432 2433 /* 2434 * If any bdev fails to unclaim underlying bdev properly, we may face the 2435 * case of bdev list consisting of claimed bdevs only (if claims are managed 2436 * correctly, this would mean there's a loop in the claims graph which is 2437 * clearly impossible). Warn and unregister last bdev on the list then. 2438 */ 2439 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2440 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2441 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2442 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2443 return; 2444 } 2445 } 2446 2447 static void 2448 bdev_module_fini_start_iter(void *arg) 2449 { 2450 struct spdk_bdev_module *bdev_module; 2451 2452 if (!g_resume_bdev_module) { 2453 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2454 } else { 2455 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2456 } 2457 2458 while (bdev_module) { 2459 if (bdev_module->async_fini_start) { 2460 /* Save our place so we can resume later. We must 2461 * save the variable here, before calling fini_start() 2462 * below, because in some cases the module may immediately 2463 * call spdk_bdev_module_fini_start_done() and re-enter 2464 * this function to continue iterating. 
*/ 2465 g_resume_bdev_module = bdev_module; 2466 } 2467 2468 if (bdev_module->fini_start) { 2469 bdev_module->fini_start(); 2470 } 2471 2472 if (bdev_module->async_fini_start) { 2473 return; 2474 } 2475 2476 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2477 } 2478 2479 g_resume_bdev_module = NULL; 2480 2481 bdev_finish_unregister_bdevs_iter(NULL, 0); 2482 } 2483 2484 void 2485 spdk_bdev_module_fini_start_done(void) 2486 { 2487 if (spdk_get_thread() != g_fini_thread) { 2488 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2489 } else { 2490 bdev_module_fini_start_iter(NULL); 2491 } 2492 } 2493 2494 static void 2495 bdev_finish_wait_for_examine_done(void *cb_arg) 2496 { 2497 bdev_module_fini_start_iter(NULL); 2498 } 2499 2500 static void bdev_open_async_fini(void); 2501 2502 void 2503 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2504 { 2505 int rc; 2506 2507 assert(cb_fn != NULL); 2508 2509 g_fini_thread = spdk_get_thread(); 2510 2511 g_fini_cb_fn = cb_fn; 2512 g_fini_cb_arg = cb_arg; 2513 2514 bdev_open_async_fini(); 2515 2516 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2517 if (rc != 0) { 2518 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2519 bdev_finish_wait_for_examine_done(NULL); 2520 } 2521 } 2522 2523 struct spdk_bdev_io * 2524 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2525 { 2526 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2527 struct spdk_bdev_io *bdev_io; 2528 2529 if (ch->per_thread_cache_count > 0) { 2530 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2531 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2532 ch->per_thread_cache_count--; 2533 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2534 /* 2535 * Don't try to look for bdev_ios in the global pool if there are 2536 * waiters on bdev_ios - we don't want this caller to jump the line. 2537 */ 2538 bdev_io = NULL; 2539 } else { 2540 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2541 } 2542 2543 return bdev_io; 2544 } 2545 2546 void 2547 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2548 { 2549 struct spdk_bdev_mgmt_channel *ch; 2550 2551 assert(bdev_io != NULL); 2552 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2553 2554 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2555 2556 if (bdev_io->internal.f.has_buf) { 2557 bdev_io_put_buf(bdev_io); 2558 } 2559 2560 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2561 ch->per_thread_cache_count++; 2562 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2563 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2564 struct spdk_bdev_io_wait_entry *entry; 2565 2566 entry = TAILQ_FIRST(&ch->io_wait_queue); 2567 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2568 entry->cb_fn(entry->cb_arg); 2569 } 2570 } else { 2571 /* We should never have a full cache with entries on the io wait queue. 
*/ 2572 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2573 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2574 } 2575 } 2576 2577 static bool 2578 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2579 { 2580 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2581 2582 switch (limit) { 2583 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2584 return true; 2585 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2586 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2587 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2588 return false; 2589 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2590 default: 2591 return false; 2592 } 2593 } 2594 2595 static bool 2596 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2597 { 2598 switch (bdev_io->type) { 2599 case SPDK_BDEV_IO_TYPE_NVME_IO: 2600 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2601 case SPDK_BDEV_IO_TYPE_READ: 2602 case SPDK_BDEV_IO_TYPE_WRITE: 2603 return true; 2604 case SPDK_BDEV_IO_TYPE_ZCOPY: 2605 if (bdev_io->u.bdev.zcopy.start) { 2606 return true; 2607 } else { 2608 return false; 2609 } 2610 default: 2611 return false; 2612 } 2613 } 2614 2615 static bool 2616 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2617 { 2618 switch (bdev_io->type) { 2619 case SPDK_BDEV_IO_TYPE_NVME_IO: 2620 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2621 /* Bit 1 (0x2) set for read operation */ 2622 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2623 return true; 2624 } else { 2625 return false; 2626 } 2627 case SPDK_BDEV_IO_TYPE_READ: 2628 return true; 2629 case SPDK_BDEV_IO_TYPE_ZCOPY: 2630 /* Populate to read from disk */ 2631 if (bdev_io->u.bdev.zcopy.populate) { 2632 return true; 2633 } else { 2634 return false; 2635 } 2636 default: 2637 return false; 2638 } 2639 } 2640 2641 static uint64_t 2642 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2643 { 2644 struct spdk_bdev *bdev = bdev_io->bdev; 2645 2646 switch (bdev_io->type) { 2647 case SPDK_BDEV_IO_TYPE_NVME_IO: 2648 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2649 return bdev_io->u.nvme_passthru.nbytes; 2650 case SPDK_BDEV_IO_TYPE_READ: 2651 case SPDK_BDEV_IO_TYPE_WRITE: 2652 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2653 case SPDK_BDEV_IO_TYPE_ZCOPY: 2654 /* Track the data in the start phase only */ 2655 if (bdev_io->u.bdev.zcopy.start) { 2656 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2657 } else { 2658 return 0; 2659 } 2660 default: 2661 return 0; 2662 } 2663 } 2664 2665 static inline bool 2666 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2667 { 2668 int64_t remaining_this_timeslice; 2669 2670 if (!limit->max_per_timeslice) { 2671 /* The QoS is disabled */ 2672 return false; 2673 } 2674 2675 remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta, 2676 __ATOMIC_RELAXED); 2677 if (remaining_this_timeslice + (int64_t)delta > 0) { 2678 /* There was still a quota for this delta -> the IO shouldn't be queued 2679 * 2680 * We allow a slight quota overrun here so an IO bigger than the per-timeslice 2681 * quota can be allowed once in a while. Such an overrun is then taken into account in 2682 * the QoS poller, where the next timeslice quota is calculated. 2683 */ 2684 return false; 2685 } 2686 2687 /* There was no quota for this delta -> the IO should be queued 2688 * The remaining_this_timeslice must be rewound so it reflects the real 2689 * amount of IOs or bytes allowed.
2690 */ 2691 __atomic_add_fetch( 2692 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2693 return true; 2694 } 2695 2696 static inline void 2697 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2698 { 2699 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2700 } 2701 2702 static bool 2703 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2704 { 2705 return bdev_qos_rw_queue_io(limit, io, 1); 2706 } 2707 2708 static void 2709 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2710 { 2711 bdev_qos_rw_rewind_io(limit, io, 1); 2712 } 2713 2714 static bool 2715 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2716 { 2717 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2718 } 2719 2720 static void 2721 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2722 { 2723 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2724 } 2725 2726 static bool 2727 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2728 { 2729 if (bdev_is_read_io(io) == false) { 2730 return false; 2731 } 2732 2733 return bdev_qos_rw_bps_queue(limit, io); 2734 } 2735 2736 static void 2737 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2738 { 2739 if (bdev_is_read_io(io) != false) { 2740 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2741 } 2742 } 2743 2744 static bool 2745 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2746 { 2747 if (bdev_is_read_io(io) == true) { 2748 return false; 2749 } 2750 2751 return bdev_qos_rw_bps_queue(limit, io); 2752 } 2753 2754 static void 2755 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2756 { 2757 if (bdev_is_read_io(io) != true) { 2758 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2759 } 2760 } 2761 2762 static void 2763 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2764 { 2765 int i; 2766 2767 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2768 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2769 qos->rate_limits[i].queue_io = NULL; 2770 continue; 2771 } 2772 2773 switch (i) { 2774 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2775 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2776 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2777 break; 2778 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2779 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2780 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2781 break; 2782 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2783 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2784 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2785 break; 2786 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2787 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2788 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2789 break; 2790 default: 2791 break; 2792 } 2793 } 2794 } 2795 2796 static void 2797 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2798 struct spdk_bdev_io *bdev_io, 2799 enum spdk_bdev_io_status status) 2800 { 2801 bdev_io->internal.f.in_submit_request = true; 2802 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2803 spdk_bdev_io_complete(bdev_io, status); 2804 bdev_io->internal.f.in_submit_request = false; 
2805 } 2806 2807 static inline void 2808 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2809 { 2810 struct spdk_bdev *bdev = bdev_io->bdev; 2811 struct spdk_io_channel *ch = bdev_ch->channel; 2812 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2813 2814 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2815 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2816 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2817 2818 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2819 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2820 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2821 SPDK_BDEV_IO_STATUS_SUCCESS); 2822 return; 2823 } 2824 } 2825 2826 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2827 bdev_io->bdev->split_on_write_unit && 2828 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2829 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2830 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2831 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2832 return; 2833 } 2834 2835 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2836 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2837 bdev_io->internal.f.in_submit_request = true; 2838 bdev_submit_request(bdev, ch, bdev_io); 2839 bdev_io->internal.f.in_submit_request = false; 2840 } else { 2841 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2842 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2843 /* Special case when we have nomem IOs and no outstanding IOs which completions 2844 * could trigger retry of queued IOs */ 2845 bdev_shared_ch_retry_io(shared_resource); 2846 } 2847 } 2848 } 2849 2850 static bool 2851 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2852 { 2853 int i; 2854 2855 if (bdev_qos_io_to_limit(bdev_io) == true) { 2856 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2857 if (!qos->rate_limits[i].queue_io) { 2858 continue; 2859 } 2860 2861 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2862 bdev_io) == true) { 2863 for (i -= 1; i >= 0 ; i--) { 2864 if (!qos->rate_limits[i].queue_io) { 2865 continue; 2866 } 2867 2868 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2869 } 2870 return true; 2871 } 2872 } 2873 } 2874 2875 return false; 2876 } 2877 2878 static int 2879 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2880 { 2881 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2882 int submitted_ios = 0; 2883 2884 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2885 if (!bdev_qos_queue_io(qos, bdev_io)) { 2886 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 2887 bdev_io_do_submit(ch, bdev_io); 2888 2889 submitted_ios++; 2890 } 2891 } 2892 2893 return submitted_ios; 2894 } 2895 2896 static void 2897 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2898 { 2899 int rc; 2900 2901 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2902 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2903 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2904 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2905 &bdev_io->internal.waitq_entry); 2906 if (rc != 0) { 2907 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2908 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 2909 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2910 } 2911 } 2912 2913 static bool 2914 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2915 { 2916 uint32_t io_boundary; 2917 struct spdk_bdev *bdev = bdev_io->bdev; 2918 uint32_t max_segment_size = bdev->max_segment_size; 2919 uint32_t max_size = bdev->max_rw_size; 2920 int max_segs = bdev->max_num_segments; 2921 2922 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2923 io_boundary = bdev->write_unit_size; 2924 } else if (bdev->split_on_optimal_io_boundary) { 2925 io_boundary = bdev->optimal_io_boundary; 2926 } else { 2927 io_boundary = 0; 2928 } 2929 2930 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 2931 return false; 2932 } 2933 2934 if (io_boundary) { 2935 uint64_t start_stripe, end_stripe; 2936 2937 start_stripe = bdev_io->u.bdev.offset_blocks; 2938 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2939 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2940 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2941 start_stripe >>= spdk_u32log2(io_boundary); 2942 end_stripe >>= spdk_u32log2(io_boundary); 2943 } else { 2944 start_stripe /= io_boundary; 2945 end_stripe /= io_boundary; 2946 } 2947 2948 if (start_stripe != end_stripe) { 2949 return true; 2950 } 2951 } 2952 2953 if (max_segs) { 2954 if (bdev_io->u.bdev.iovcnt > max_segs) { 2955 return true; 2956 } 2957 } 2958 2959 if (max_segment_size) { 2960 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2961 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 2962 return true; 2963 } 2964 } 2965 } 2966 2967 if (max_size) { 2968 if (bdev_io->u.bdev.num_blocks > max_size) { 2969 return true; 2970 } 2971 } 2972 2973 return false; 2974 } 2975 2976 static bool 2977 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2978 { 2979 uint32_t num_unmap_segments; 2980 2981 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2982 return false; 2983 } 2984 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2985 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2986 return true; 2987 } 2988 2989 return false; 2990 } 2991 2992 static bool 2993 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2994 { 2995 if (!bdev_io->bdev->max_write_zeroes) { 2996 return false; 2997 } 2998 2999 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 3000 return true; 3001 } 3002 3003 return false; 3004 } 3005 3006 static bool 3007 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 3008 { 3009 if (bdev_io->bdev->max_copy != 0 && 3010 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 3011 return true; 3012 } 3013 3014 return false; 3015 } 3016 3017 static bool 3018 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 3019 { 3020 switch (bdev_io->type) { 3021 case SPDK_BDEV_IO_TYPE_READ: 3022 case SPDK_BDEV_IO_TYPE_WRITE: 3023 return bdev_rw_should_split(bdev_io); 3024 case SPDK_BDEV_IO_TYPE_UNMAP: 3025 return bdev_unmap_should_split(bdev_io); 3026 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3027 return bdev_write_zeroes_should_split(bdev_io); 3028 case SPDK_BDEV_IO_TYPE_COPY: 3029 return bdev_copy_should_split(bdev_io); 3030 default: 3031 return false; 3032 } 3033 } 3034 3035 static uint32_t 3036 _to_next_boundary(uint64_t offset, uint32_t boundary) 3037 { 3038 return (boundary - (offset % boundary)); 3039 } 3040 3041 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 3042 3043 static void _bdev_rw_split(void *_bdev_io); 3044 3045 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 3046 3047 static void 3048 _bdev_unmap_split(void *_bdev_io) 3049 { 3050 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 3051 } 3052 3053 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 3054 3055 static void 3056 _bdev_write_zeroes_split(void *_bdev_io) 3057 { 3058 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 3059 } 3060 3061 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 3062 3063 static void 3064 _bdev_copy_split(void *_bdev_io) 3065 { 3066 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 3067 } 3068 3069 static int 3070 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 3071 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 3072 { 3073 int rc; 3074 uint64_t current_offset, current_remaining, current_src_offset; 3075 spdk_bdev_io_wait_cb io_wait_fn; 3076 3077 current_offset = *offset; 3078 current_remaining = *remaining; 3079 3080 assert(bdev_io->internal.f.split); 3081 3082 bdev_io->internal.split.outstanding++; 3083 3084 io_wait_fn = _bdev_rw_split; 3085 switch (bdev_io->type) { 3086 case SPDK_BDEV_IO_TYPE_READ: 3087 assert(bdev_io->u.bdev.accel_sequence == NULL); 3088 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 3089 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3090 iov, iovcnt, md_buf, current_offset, 3091 num_blocks, 3092 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3093 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 3094 NULL, 3095 bdev_io->u.bdev.dif_check_flags, 3096 bdev_io_split_done, bdev_io); 3097 break; 3098 case SPDK_BDEV_IO_TYPE_WRITE: 3099 assert(bdev_io->u.bdev.accel_sequence == NULL); 3100 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3101 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3102 iov, iovcnt, md_buf, current_offset, 3103 num_blocks, 3104 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3105 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 3106 NULL, 3107 bdev_io->u.bdev.dif_check_flags, 3108 bdev_io->u.bdev.nvme_cdw12.raw, 3109 bdev_io->u.bdev.nvme_cdw13.raw, 3110 bdev_io_split_done, bdev_io); 3111 break; 3112 case SPDK_BDEV_IO_TYPE_UNMAP: 3113 io_wait_fn = _bdev_unmap_split; 3114 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3115 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3116 current_offset, num_blocks, 3117 bdev_io_split_done, bdev_io); 3118 break; 3119 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3120 io_wait_fn = _bdev_write_zeroes_split; 3121 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3122 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3123 current_offset, num_blocks, 3124 bdev_io_split_done, bdev_io); 3125 break; 3126 case SPDK_BDEV_IO_TYPE_COPY: 3127 io_wait_fn = _bdev_copy_split; 3128 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3129 (current_offset - bdev_io->u.bdev.offset_blocks); 3130 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3131 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3132 current_offset, current_src_offset, num_blocks, 3133 bdev_io_split_done, bdev_io); 3134 break; 3135 default: 3136 assert(false); 3137 rc = -EINVAL; 3138 break; 3139 } 3140 3141 if (rc == 0) { 3142 current_offset += num_blocks; 3143 current_remaining -= num_blocks; 3144 bdev_io->internal.split.current_offset_blocks = current_offset; 3145 bdev_io->internal.split.remaining_num_blocks = current_remaining; 3146 *offset = current_offset; 3147 *remaining = current_remaining; 3148 } else { 3149 bdev_io->internal.split.outstanding--; 3150 if (rc == -ENOMEM) { 3151 if (bdev_io->internal.split.outstanding == 0) { 3152 /* No I/O is outstanding. Hence we should wait here. */ 3153 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3154 } 3155 } else { 3156 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3157 if (bdev_io->internal.split.outstanding == 0) { 3158 bdev_ch_remove_from_io_submitted(bdev_io); 3159 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3160 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3161 bdev_io->internal.ch->queue_depth); 3162 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3163 } 3164 } 3165 } 3166 3167 return rc; 3168 } 3169 3170 static void 3171 _bdev_rw_split(void *_bdev_io) 3172 { 3173 struct iovec *parent_iov, *iov; 3174 struct spdk_bdev_io *bdev_io = _bdev_io; 3175 struct spdk_bdev *bdev = bdev_io->bdev; 3176 uint64_t parent_offset, current_offset, remaining; 3177 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3178 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3179 uint32_t iovcnt, iov_len, child_iovsize; 3180 uint32_t blocklen = bdev->blocklen; 3181 uint32_t io_boundary; 3182 uint32_t max_segment_size = bdev->max_segment_size; 3183 uint32_t max_child_iovcnt = bdev->max_num_segments; 3184 uint32_t max_size = bdev->max_rw_size; 3185 void *md_buf = NULL; 3186 int rc; 3187 3188 max_size = max_size ? max_size : UINT32_MAX; 3189 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3190 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3191 SPDK_BDEV_IO_NUM_CHILD_IOV; 3192 3193 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3194 io_boundary = bdev->write_unit_size; 3195 } else if (bdev->split_on_optimal_io_boundary) { 3196 io_boundary = bdev->optimal_io_boundary; 3197 } else { 3198 io_boundary = UINT32_MAX; 3199 } 3200 3201 assert(bdev_io->internal.f.split); 3202 3203 remaining = bdev_io->internal.split.remaining_num_blocks; 3204 current_offset = bdev_io->internal.split.current_offset_blocks; 3205 parent_offset = bdev_io->u.bdev.offset_blocks; 3206 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3207 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3208 3209 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3210 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3211 if (parent_iov_offset < parent_iov->iov_len) { 3212 break; 3213 } 3214 parent_iov_offset -= parent_iov->iov_len; 3215 } 3216 3217 child_iovcnt = 0; 3218 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3219 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3220 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3221 to_next_boundary = spdk_min(remaining, to_next_boundary); 3222 to_next_boundary = spdk_min(max_size, to_next_boundary); 3223 to_next_boundary_bytes = to_next_boundary * blocklen; 3224 3225 iov = &bdev_io->child_iov[child_iovcnt]; 3226 iovcnt = 0; 3227 3228 if (bdev_io->u.bdev.md_buf) { 3229 md_buf = (char *)bdev_io->u.bdev.md_buf + 3230 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3231 } 3232 3233 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3234 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3235 iovcnt < child_iovsize) { 3236 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3237 iov_len = parent_iov->iov_len - parent_iov_offset; 3238 3239 iov_len = spdk_min(iov_len, max_segment_size); 3240 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3241 to_next_boundary_bytes -= iov_len; 3242 3243 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3244 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3245 3246 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3247 parent_iov_offset += iov_len; 3248 } else { 3249 parent_iovpos++; 3250 parent_iov_offset = 0; 3251 } 3252 child_iovcnt++; 3253 iovcnt++; 3254 } 3255 3256 if (to_next_boundary_bytes > 0) { 3257 /* We had to stop this child I/O early because we ran out of 3258 * child_iov space or were limited by max_num_segments. 3259 * Ensure the iovs to be aligned with block size and 3260 * then adjust to_next_boundary before starting the 3261 * child I/O. 
3262 */ 3263 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 3264 iovcnt == child_iovsize); 3265 to_last_block_bytes = to_next_boundary_bytes % blocklen; 3266 if (to_last_block_bytes != 0) { 3267 uint32_t child_iovpos = child_iovcnt - 1; 3268 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV, 3269 * so the loop will naturally end 3270 */ 3271 3272 to_last_block_bytes = blocklen - to_last_block_bytes; 3273 to_next_boundary_bytes += to_last_block_bytes; 3274 while (to_last_block_bytes > 0 && iovcnt > 0) { 3275 iov_len = spdk_min(to_last_block_bytes, 3276 bdev_io->child_iov[child_iovpos].iov_len); 3277 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 3278 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 3279 child_iovpos--; 3280 if (--iovcnt == 0) { 3281 /* If the child IO is less than a block size, just return. 3282 * If the first child IO of any split round is less than 3283 * a block size, exit with an error. 3284 */ 3285 if (bdev_io->internal.split.outstanding == 0) { 3286 SPDK_ERRLOG("The first child io was less than a block size\n"); 3287 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3288 bdev_ch_remove_from_io_submitted(bdev_io); 3289 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3290 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3291 bdev_io->internal.ch->queue_depth); 3292 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3293 } 3294 3295 return; 3296 } 3297 } 3298 3299 to_last_block_bytes -= iov_len; 3300 3301 if (parent_iov_offset == 0) { 3302 parent_iovpos--; 3303 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 3304 } 3305 parent_iov_offset -= iov_len; 3306 } 3307 3308 assert(to_last_block_bytes == 0); 3309 } 3310 to_next_boundary -= to_next_boundary_bytes / blocklen; 3311 } 3312 3313 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 3314 &current_offset, &remaining); 3315 if (spdk_unlikely(rc)) { 3316 return; 3317 } 3318 } 3319 } 3320 3321 static void 3322 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 3323 { 3324 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 3325 uint32_t num_children_reqs = 0; 3326 int rc; 3327 3328 assert(bdev_io->internal.f.split); 3329 3330 offset = bdev_io->internal.split.current_offset_blocks; 3331 remaining = bdev_io->internal.split.remaining_num_blocks; 3332 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 3333 3334 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3335 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 3336 3337 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 3338 &offset, &remaining); 3339 if (spdk_likely(rc == 0)) { 3340 num_children_reqs++; 3341 } else { 3342 return; 3343 } 3344 } 3345 } 3346 3347 static void 3348 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3349 { 3350 uint64_t offset, write_zeroes_blocks, remaining; 3351 uint32_t num_children_reqs = 0; 3352 int rc; 3353 3354 assert(bdev_io->internal.f.split); 3355 3356 offset = bdev_io->internal.split.current_offset_blocks; 3357 remaining = bdev_io->internal.split.remaining_num_blocks; 3358 3359 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3360 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3361 3362 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3363 &offset, &remaining); 3364 if (spdk_likely(rc == 0)) { 3365 num_children_reqs++; 3366 } else {
3367 return; 3368 } 3369 } 3370 } 3371 3372 static void 3373 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3374 { 3375 uint64_t offset, copy_blocks, remaining; 3376 uint32_t num_children_reqs = 0; 3377 int rc; 3378 3379 assert(bdev_io->internal.f.split); 3380 3381 offset = bdev_io->internal.split.current_offset_blocks; 3382 remaining = bdev_io->internal.split.remaining_num_blocks; 3383 3384 assert(bdev_io->bdev->max_copy != 0); 3385 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3386 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3387 3388 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3389 &offset, &remaining); 3390 if (spdk_likely(rc == 0)) { 3391 num_children_reqs++; 3392 } else { 3393 return; 3394 } 3395 } 3396 } 3397 3398 static void 3399 parent_bdev_io_complete(void *ctx, int rc) 3400 { 3401 struct spdk_bdev_io *parent_io = ctx; 3402 3403 if (rc) { 3404 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3405 } 3406 3407 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3408 parent_io->internal.caller_ctx); 3409 } 3410 3411 static void 3412 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3413 { 3414 struct spdk_bdev_io *bdev_io = ctx; 3415 3416 /* u.bdev.accel_sequence should have already been cleared at this point */ 3417 assert(bdev_io->u.bdev.accel_sequence == NULL); 3418 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3419 bdev_io->internal.f.has_accel_sequence = false; 3420 3421 if (spdk_unlikely(status != 0)) { 3422 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3423 } 3424 3425 parent_bdev_io_complete(bdev_io, status); 3426 } 3427 3428 static void 3429 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3430 { 3431 struct spdk_bdev_io *parent_io = cb_arg; 3432 3433 spdk_bdev_free_io(bdev_io); 3434 3435 assert(parent_io->internal.f.split); 3436 3437 if (!success) { 3438 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3439 /* If any child I/O failed, stop further splitting process. */ 3440 parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks; 3441 parent_io->internal.split.remaining_num_blocks = 0; 3442 } 3443 parent_io->internal.split.outstanding--; 3444 if (parent_io->internal.split.outstanding != 0) { 3445 return; 3446 } 3447 3448 /* 3449 * Parent I/O finishes when all blocks are consumed. 3450 */ 3451 if (parent_io->internal.split.remaining_num_blocks == 0) { 3452 assert(parent_io->internal.cb != bdev_io_split_done); 3453 bdev_ch_remove_from_io_submitted(parent_io); 3454 spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id, 3455 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx, 3456 parent_io->internal.ch->queue_depth); 3457 3458 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3459 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3460 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3461 return; 3462 } else if (parent_io->internal.f.has_bounce_buf && 3463 !bdev_io_use_accel_sequence(bdev_io)) { 3464 /* bdev IO will be completed in the callback */ 3465 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3466 return; 3467 } 3468 } 3469 3470 parent_bdev_io_complete(parent_io, 0); 3471 return; 3472 } 3473 3474 /* 3475 * Continue with the splitting process. This function will complete the parent I/O if the 3476 * splitting is done. 
3477 */ 3478 switch (parent_io->type) { 3479 case SPDK_BDEV_IO_TYPE_READ: 3480 case SPDK_BDEV_IO_TYPE_WRITE: 3481 _bdev_rw_split(parent_io); 3482 break; 3483 case SPDK_BDEV_IO_TYPE_UNMAP: 3484 bdev_unmap_split(parent_io); 3485 break; 3486 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3487 bdev_write_zeroes_split(parent_io); 3488 break; 3489 case SPDK_BDEV_IO_TYPE_COPY: 3490 bdev_copy_split(parent_io); 3491 break; 3492 default: 3493 assert(false); 3494 break; 3495 } 3496 } 3497 3498 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3499 bool success); 3500 3501 static void 3502 bdev_io_split(struct spdk_bdev_io *bdev_io) 3503 { 3504 assert(bdev_io_should_split(bdev_io)); 3505 assert(bdev_io->internal.f.split); 3506 3507 bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3508 bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3509 bdev_io->internal.split.outstanding = 0; 3510 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3511 3512 switch (bdev_io->type) { 3513 case SPDK_BDEV_IO_TYPE_READ: 3514 case SPDK_BDEV_IO_TYPE_WRITE: 3515 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3516 _bdev_rw_split(bdev_io); 3517 } else { 3518 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3519 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3520 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3521 } 3522 break; 3523 case SPDK_BDEV_IO_TYPE_UNMAP: 3524 bdev_unmap_split(bdev_io); 3525 break; 3526 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3527 bdev_write_zeroes_split(bdev_io); 3528 break; 3529 case SPDK_BDEV_IO_TYPE_COPY: 3530 bdev_copy_split(bdev_io); 3531 break; 3532 default: 3533 assert(false); 3534 break; 3535 } 3536 } 3537 3538 static void 3539 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3540 { 3541 if (!success) { 3542 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3543 return; 3544 } 3545 3546 _bdev_rw_split(bdev_io); 3547 } 3548 3549 static inline void 3550 _bdev_io_submit(struct spdk_bdev_io *bdev_io) 3551 { 3552 struct spdk_bdev *bdev = bdev_io->bdev; 3553 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3554 3555 if (spdk_likely(bdev_ch->flags == 0)) { 3556 bdev_io_do_submit(bdev_ch, bdev_io); 3557 return; 3558 } 3559 3560 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3561 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3562 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3563 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3564 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3565 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3566 } else { 3567 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3568 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3569 } 3570 } else { 3571 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3572 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3573 } 3574 } 3575 3576 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3577 3578 bool 3579 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3580 { 3581 if (range1->length == 0 || range2->length == 0) { 3582 return false; 3583 } 3584 3585 if (range1->offset + range1->length <= range2->offset) { 3586 return false; 3587 } 3588 3589 if (range2->offset + range2->length <= range1->offset) { 3590 return false; 3591 } 3592 3593 
return true; 3594 } 3595 3596 static bool 3597 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3598 { 3599 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3600 struct lba_range r; 3601 3602 switch (bdev_io->type) { 3603 case SPDK_BDEV_IO_TYPE_NVME_IO: 3604 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3605 /* Don't try to decode the NVMe command - just assume worst-case and that 3606 * it overlaps a locked range. 3607 */ 3608 return true; 3609 case SPDK_BDEV_IO_TYPE_READ: 3610 if (!range->quiesce) { 3611 return false; 3612 } 3613 /* fallthrough */ 3614 case SPDK_BDEV_IO_TYPE_WRITE: 3615 case SPDK_BDEV_IO_TYPE_UNMAP: 3616 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3617 case SPDK_BDEV_IO_TYPE_ZCOPY: 3618 case SPDK_BDEV_IO_TYPE_COPY: 3619 r.offset = bdev_io->u.bdev.offset_blocks; 3620 r.length = bdev_io->u.bdev.num_blocks; 3621 if (!bdev_lba_range_overlapped(range, &r)) { 3622 /* This I/O doesn't overlap the specified LBA range. */ 3623 return false; 3624 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3625 /* This I/O overlaps, but the I/O is on the same channel that locked this 3626 * range, and the caller_ctx is the same as the locked_ctx. This means 3627 * that this I/O is associated with the lock, and is allowed to execute. 3628 */ 3629 return false; 3630 } else { 3631 return true; 3632 } 3633 default: 3634 return false; 3635 } 3636 } 3637 3638 void 3639 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3640 { 3641 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3642 3643 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3644 3645 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3646 struct lba_range *range; 3647 3648 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3649 if (bdev_io_range_is_locked(bdev_io, range)) { 3650 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3651 return; 3652 } 3653 } 3654 } 3655 3656 bdev_ch_add_to_io_submitted(bdev_io); 3657 3658 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3659 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 3660 ch->trace_id, bdev_io->u.bdev.num_blocks, 3661 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3662 bdev_io->u.bdev.offset_blocks, ch->queue_depth); 3663 3664 if (bdev_io->internal.f.split) { 3665 bdev_io_split(bdev_io); 3666 return; 3667 } 3668 3669 _bdev_io_submit(bdev_io); 3670 } 3671 3672 static inline void 3673 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3674 { 3675 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3676 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3677 * For write operation we need to pull buffers from memory domain before submitting IO. 
3678 * Once read operation completes, we need to use memory_domain push functionality to 3679 * update data in original memory domain IO buffer 3680 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3681 assert(bdev_io->internal.f.has_memory_domain); 3682 bdev_io->u.bdev.memory_domain = NULL; 3683 bdev_io->u.bdev.memory_domain_ctx = NULL; 3684 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3685 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3686 } 3687 3688 static inline void 3689 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3690 { 3691 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3692 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3693 3694 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3695 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3696 bdev_io_complete_unsubmitted(bdev_io); 3697 return; 3698 } 3699 3700 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3701 * support them, but we need to execute an accel sequence and the data buffer is from accel 3702 * memory domain (to avoid doing a push/pull from that domain). 3703 */ 3704 if (bdev_io_use_memory_domain(bdev_io)) { 3705 if (!desc->memory_domains_supported || 3706 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3707 _bdev_io_ext_use_bounce_buffer(bdev_io); 3708 return; 3709 } 3710 } 3711 3712 if (needs_exec) { 3713 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3714 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3715 return; 3716 } 3717 /* For reads we'll execute the sequence after the data is read, so, for now, only 3718 * clear out accel_sequence pointer and submit the IO */ 3719 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3720 bdev_io->u.bdev.accel_sequence = NULL; 3721 } 3722 3723 bdev_io_submit(bdev_io); 3724 } 3725 3726 static void 3727 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3728 { 3729 struct spdk_bdev *bdev = bdev_io->bdev; 3730 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3731 struct spdk_io_channel *ch = bdev_ch->channel; 3732 3733 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3734 3735 bdev_io->internal.f.in_submit_request = true; 3736 bdev_submit_request(bdev, ch, bdev_io); 3737 bdev_io->internal.f.in_submit_request = false; 3738 } 3739 3740 void 3741 bdev_io_init(struct spdk_bdev_io *bdev_io, 3742 struct spdk_bdev *bdev, void *cb_arg, 3743 spdk_bdev_io_completion_cb cb) 3744 { 3745 bdev_io->bdev = bdev; 3746 bdev_io->internal.f.raw = 0; 3747 bdev_io->internal.caller_ctx = cb_arg; 3748 bdev_io->internal.cb = cb; 3749 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3750 bdev_io->internal.f.in_submit_request = false; 3751 bdev_io->internal.error.nvme.cdw0 = 0; 3752 bdev_io->num_retries = 0; 3753 bdev_io->internal.get_buf_cb = NULL; 3754 bdev_io->internal.get_aux_buf_cb = NULL; 3755 bdev_io->internal.data_transfer_cpl = NULL; 3756 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 3757 } 3758 3759 static bool 3760 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3761 { 3762 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3763 } 3764 3765 bool 3766 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3767 { 3768 bool supported; 3769 3770 supported = bdev_io_type_supported(bdev, io_type); 3771 3772 if (!supported) { 3773 switch (io_type) { 3774 
case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3775 /* The bdev layer will emulate write zeroes as long as write is supported. */ 3776 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3777 break; 3778 default: 3779 break; 3780 } 3781 } 3782 3783 return supported; 3784 } 3785 3786 static const char *g_io_type_strings[] = { 3787 [SPDK_BDEV_IO_TYPE_READ] = "read", 3788 [SPDK_BDEV_IO_TYPE_WRITE] = "write", 3789 [SPDK_BDEV_IO_TYPE_UNMAP] = "unmap", 3790 [SPDK_BDEV_IO_TYPE_FLUSH] = "flush", 3791 [SPDK_BDEV_IO_TYPE_RESET] = "reset", 3792 [SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin", 3793 [SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io", 3794 [SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md", 3795 [SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes", 3796 [SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy", 3797 [SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info", 3798 [SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management", 3799 [SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append", 3800 [SPDK_BDEV_IO_TYPE_COMPARE] = "compare", 3801 [SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write", 3802 [SPDK_BDEV_IO_TYPE_ABORT] = "abort", 3803 [SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole", 3804 [SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data", 3805 [SPDK_BDEV_IO_TYPE_COPY] = "copy", 3806 [SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md", 3807 }; 3808 3809 const char * 3810 spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type) 3811 { 3812 if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) { 3813 return NULL; 3814 } 3815 3816 return g_io_type_strings[io_type]; 3817 } 3818 3819 int 3820 spdk_bdev_get_io_type(const char *io_type_string) 3821 { 3822 int i; 3823 3824 for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 3825 if (!strcmp(io_type_string, g_io_type_strings[i])) { 3826 return i; 3827 } 3828 } 3829 3830 return -1; 3831 } 3832 3833 uint64_t 3834 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3835 { 3836 return bdev_io->internal.submit_tsc; 3837 } 3838 3839 int 3840 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3841 { 3842 if (bdev->fn_table->dump_info_json) { 3843 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3844 } 3845 3846 return 0; 3847 } 3848 3849 static void 3850 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3851 { 3852 uint32_t max_per_timeslice = 0; 3853 int i; 3854 3855 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3856 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3857 qos->rate_limits[i].max_per_timeslice = 0; 3858 continue; 3859 } 3860 3861 max_per_timeslice = qos->rate_limits[i].limit * 3862 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3863 3864 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3865 qos->rate_limits[i].min_per_timeslice); 3866 3867 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3868 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE); 3869 } 3870 3871 bdev_qos_set_ops(qos); 3872 } 3873 3874 static void 3875 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3876 struct spdk_io_channel *io_ch, void *ctx) 3877 { 3878 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3879 int status; 3880 3881 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3882 3883 /* if all IOs were sent then continue the iteration, otherwise - stop it */ 3884 /* TODO: channels round robing */ 3885 status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 
0 : 1; 3886 3887 spdk_bdev_for_each_channel_continue(i, status); 3888 } 3889 3890 3891 static void 3892 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status) 3893 { 3894 3895 } 3896 3897 static int 3898 bdev_channel_poll_qos(void *arg) 3899 { 3900 struct spdk_bdev *bdev = arg; 3901 struct spdk_bdev_qos *qos = bdev->internal.qos; 3902 uint64_t now = spdk_get_ticks(); 3903 int i; 3904 int64_t remaining_last_timeslice; 3905 3906 if (spdk_unlikely(qos->thread == NULL)) { 3907 /* Old QoS was unbound to remove and new QoS is not enabled yet. */ 3908 return SPDK_POLLER_IDLE; 3909 } 3910 3911 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3912 /* We received our callback earlier than expected - return 3913 * immediately and wait to do accounting until at least one 3914 * timeslice has actually expired. This should never happen 3915 * with a well-behaved timer implementation. 3916 */ 3917 return SPDK_POLLER_IDLE; 3918 } 3919 3920 /* Reset for next round of rate limiting */ 3921 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3922 /* We may have allowed the IOs or bytes to slightly overrun in the last 3923 * timeslice. remaining_this_timeslice is signed, so if it's negative 3924 * here, we'll account for the overrun so that the next timeslice will 3925 * be appropriately reduced. 3926 */ 3927 remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice, 3928 0, __ATOMIC_RELAXED); 3929 if (remaining_last_timeslice < 0) { 3930 /* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos() 3931 * potentially use 2 atomic ops each, so they can intertwine. 3932 * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage. 
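			 * For example, if remaining_this_timeslice ended the previous timeslice
			 * at -2, the -2 is stored back here and the refill loop below adds
			 * max_per_timeslice, so the next timeslice effectively starts with
			 * max_per_timeslice - 2.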
3933 */ 3934 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3935 remaining_last_timeslice, __ATOMIC_RELAXED); 3936 } 3937 } 3938 3939 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3940 qos->last_timeslice += qos->timeslice_size; 3941 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3942 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 3943 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 3944 } 3945 } 3946 3947 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 3948 bdev_channel_submit_qos_io_done); 3949 3950 return SPDK_POLLER_BUSY; 3951 } 3952 3953 static void 3954 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3955 { 3956 struct spdk_bdev_shared_resource *shared_resource; 3957 struct lba_range *range; 3958 3959 bdev_free_io_stat(ch->stat); 3960 #ifdef SPDK_CONFIG_VTUNE 3961 bdev_free_io_stat(ch->prev_stat); 3962 #endif 3963 3964 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3965 range = TAILQ_FIRST(&ch->locked_ranges); 3966 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3967 free(range); 3968 } 3969 3970 spdk_put_io_channel(ch->channel); 3971 spdk_put_io_channel(ch->accel_channel); 3972 3973 shared_resource = ch->shared_resource; 3974 3975 assert(TAILQ_EMPTY(&ch->io_locked)); 3976 assert(TAILQ_EMPTY(&ch->io_submitted)); 3977 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3978 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3979 assert(ch->io_outstanding == 0); 3980 assert(shared_resource->ref > 0); 3981 shared_resource->ref--; 3982 if (shared_resource->ref == 0) { 3983 assert(shared_resource->io_outstanding == 0); 3984 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3985 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3986 spdk_poller_unregister(&shared_resource->nomem_poller); 3987 free(shared_resource); 3988 } 3989 } 3990 3991 static void 3992 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3993 { 3994 struct spdk_bdev_qos *qos = bdev->internal.qos; 3995 int i; 3996 3997 assert(spdk_spin_held(&bdev->internal.spinlock)); 3998 3999 /* Rate limiting on this bdev enabled */ 4000 if (qos) { 4001 if (qos->ch == NULL) { 4002 struct spdk_io_channel *io_ch; 4003 4004 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 4005 bdev->name, spdk_get_thread()); 4006 4007 /* No qos channel has been selected, so set one up */ 4008 4009 /* Take another reference to ch */ 4010 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4011 assert(io_ch != NULL); 4012 qos->ch = ch; 4013 4014 qos->thread = spdk_io_channel_get_thread(io_ch); 4015 4016 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4017 if (bdev_qos_is_iops_rate_limit(i) == true) { 4018 qos->rate_limits[i].min_per_timeslice = 4019 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 4020 } else { 4021 qos->rate_limits[i].min_per_timeslice = 4022 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 4023 } 4024 4025 if (qos->rate_limits[i].limit == 0) { 4026 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 4027 } 4028 } 4029 bdev_qos_update_max_quota_per_timeslice(qos); 4030 qos->timeslice_size = 4031 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 4032 qos->last_timeslice = spdk_get_ticks(); 4033 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 4034 bdev, 4035 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 4036 } 4037 4038 ch->flags |= BDEV_CH_QOS_ENABLED; 4039 } 4040 } 4041 4042 struct poll_timeout_ctx { 4043 struct spdk_bdev_desc 
*desc; 4044 uint64_t timeout_in_sec; 4045 spdk_bdev_io_timeout_cb cb_fn; 4046 void *cb_arg; 4047 }; 4048 4049 static void 4050 bdev_desc_free(struct spdk_bdev_desc *desc) 4051 { 4052 spdk_spin_destroy(&desc->spinlock); 4053 free(desc->media_events_buffer); 4054 free(desc); 4055 } 4056 4057 static void 4058 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 4059 { 4060 struct poll_timeout_ctx *ctx = _ctx; 4061 struct spdk_bdev_desc *desc = ctx->desc; 4062 4063 free(ctx); 4064 4065 spdk_spin_lock(&desc->spinlock); 4066 desc->refs--; 4067 if (desc->closed == true && desc->refs == 0) { 4068 spdk_spin_unlock(&desc->spinlock); 4069 bdev_desc_free(desc); 4070 return; 4071 } 4072 spdk_spin_unlock(&desc->spinlock); 4073 } 4074 4075 static void 4076 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4077 struct spdk_io_channel *io_ch, void *_ctx) 4078 { 4079 struct poll_timeout_ctx *ctx = _ctx; 4080 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4081 struct spdk_bdev_desc *desc = ctx->desc; 4082 struct spdk_bdev_io *bdev_io; 4083 uint64_t now; 4084 4085 spdk_spin_lock(&desc->spinlock); 4086 if (desc->closed == true) { 4087 spdk_spin_unlock(&desc->spinlock); 4088 spdk_bdev_for_each_channel_continue(i, -1); 4089 return; 4090 } 4091 spdk_spin_unlock(&desc->spinlock); 4092 4093 now = spdk_get_ticks(); 4094 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 4095 /* Exclude any I/O that are generated via splitting. */ 4096 if (bdev_io->internal.cb == bdev_io_split_done) { 4097 continue; 4098 } 4099 4100 /* Once we find an I/O that has not timed out, we can immediately 4101 * exit the loop. 4102 */ 4103 if (now < (bdev_io->internal.submit_tsc + 4104 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 4105 goto end; 4106 } 4107 4108 if (bdev_io->internal.desc == desc) { 4109 ctx->cb_fn(ctx->cb_arg, bdev_io); 4110 } 4111 } 4112 4113 end: 4114 spdk_bdev_for_each_channel_continue(i, 0); 4115 } 4116 4117 static int 4118 bdev_poll_timeout_io(void *arg) 4119 { 4120 struct spdk_bdev_desc *desc = arg; 4121 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4122 struct poll_timeout_ctx *ctx; 4123 4124 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 4125 if (!ctx) { 4126 SPDK_ERRLOG("failed to allocate memory\n"); 4127 return SPDK_POLLER_BUSY; 4128 } 4129 ctx->desc = desc; 4130 ctx->cb_arg = desc->cb_arg; 4131 ctx->cb_fn = desc->cb_fn; 4132 ctx->timeout_in_sec = desc->timeout_in_sec; 4133 4134 /* Take a ref on the descriptor in case it gets closed while we are checking 4135 * all of the channels. 
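	 * The reference is dropped in bdev_channel_poll_timeout_io_done(); if the
	 * descriptor is closed while the per-channel walk is still in flight, that
	 * completion callback is also the one that frees it via bdev_desc_free().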
4136 */ 4137 spdk_spin_lock(&desc->spinlock); 4138 desc->refs++; 4139 spdk_spin_unlock(&desc->spinlock); 4140 4141 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 4142 bdev_channel_poll_timeout_io_done); 4143 4144 return SPDK_POLLER_BUSY; 4145 } 4146 4147 int 4148 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 4149 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 4150 { 4151 assert(desc->thread == spdk_get_thread()); 4152 4153 spdk_poller_unregister(&desc->io_timeout_poller); 4154 4155 if (timeout_in_sec) { 4156 assert(cb_fn != NULL); 4157 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4158 desc, 4159 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4160 1000); 4161 if (desc->io_timeout_poller == NULL) { 4162 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4163 return -1; 4164 } 4165 } 4166 4167 desc->cb_fn = cb_fn; 4168 desc->cb_arg = cb_arg; 4169 desc->timeout_in_sec = timeout_in_sec; 4170 4171 return 0; 4172 } 4173 4174 static int 4175 bdev_channel_create(void *io_device, void *ctx_buf) 4176 { 4177 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4178 struct spdk_bdev_channel *ch = ctx_buf; 4179 struct spdk_io_channel *mgmt_io_ch; 4180 struct spdk_bdev_mgmt_channel *mgmt_ch; 4181 struct spdk_bdev_shared_resource *shared_resource; 4182 struct lba_range *range; 4183 4184 ch->bdev = bdev; 4185 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4186 if (!ch->channel) { 4187 return -1; 4188 } 4189 4190 ch->accel_channel = spdk_accel_get_io_channel(); 4191 if (!ch->accel_channel) { 4192 spdk_put_io_channel(ch->channel); 4193 return -1; 4194 } 4195 4196 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0, 4197 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4198 4199 assert(ch->histogram == NULL); 4200 if (bdev->internal.histogram_enabled) { 4201 ch->histogram = spdk_histogram_data_alloc(); 4202 if (ch->histogram == NULL) { 4203 SPDK_ERRLOG("Could not allocate histogram\n"); 4204 } 4205 } 4206 4207 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4208 if (!mgmt_io_ch) { 4209 spdk_put_io_channel(ch->channel); 4210 spdk_put_io_channel(ch->accel_channel); 4211 return -1; 4212 } 4213 4214 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4215 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4216 if (shared_resource->shared_ch == ch->channel) { 4217 spdk_put_io_channel(mgmt_io_ch); 4218 shared_resource->ref++; 4219 break; 4220 } 4221 } 4222 4223 if (shared_resource == NULL) { 4224 shared_resource = calloc(1, sizeof(*shared_resource)); 4225 if (shared_resource == NULL) { 4226 spdk_put_io_channel(ch->channel); 4227 spdk_put_io_channel(ch->accel_channel); 4228 spdk_put_io_channel(mgmt_io_ch); 4229 return -1; 4230 } 4231 4232 shared_resource->mgmt_ch = mgmt_ch; 4233 shared_resource->io_outstanding = 0; 4234 TAILQ_INIT(&shared_resource->nomem_io); 4235 shared_resource->nomem_threshold = 0; 4236 shared_resource->shared_ch = ch->channel; 4237 shared_resource->ref = 1; 4238 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4239 } 4240 4241 ch->io_outstanding = 0; 4242 TAILQ_INIT(&ch->locked_ranges); 4243 TAILQ_INIT(&ch->qos_queued_io); 4244 ch->flags = 0; 4245 ch->trace_id = bdev->internal.trace_id; 4246 ch->shared_resource = shared_resource; 4247 4248 TAILQ_INIT(&ch->io_submitted); 4249 TAILQ_INIT(&ch->io_locked); 4250 TAILQ_INIT(&ch->io_accel_exec); 4251 TAILQ_INIT(&ch->io_memory_domain); 4252 4253 ch->stat = bdev_alloc_io_stat(false); 
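	/* Per-channel I/O statistics; bdev_channel_destroy() folds these into
	 * bdev->internal.stat when the channel goes away so the counts are not lost.
	 */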
4254 if (ch->stat == NULL) { 4255 bdev_channel_destroy_resource(ch); 4256 return -1; 4257 } 4258 4259 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4260 4261 #ifdef SPDK_CONFIG_VTUNE 4262 { 4263 char *name; 4264 __itt_init_ittlib(NULL, 0); 4265 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4266 if (!name) { 4267 bdev_channel_destroy_resource(ch); 4268 return -1; 4269 } 4270 ch->handle = __itt_string_handle_create(name); 4271 free(name); 4272 ch->start_tsc = spdk_get_ticks(); 4273 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4274 ch->prev_stat = bdev_alloc_io_stat(false); 4275 if (ch->prev_stat == NULL) { 4276 bdev_channel_destroy_resource(ch); 4277 return -1; 4278 } 4279 } 4280 #endif 4281 4282 spdk_spin_lock(&bdev->internal.spinlock); 4283 bdev_enable_qos(bdev, ch); 4284 4285 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4286 struct lba_range *new_range; 4287 4288 new_range = calloc(1, sizeof(*new_range)); 4289 if (new_range == NULL) { 4290 spdk_spin_unlock(&bdev->internal.spinlock); 4291 bdev_channel_destroy_resource(ch); 4292 return -1; 4293 } 4294 new_range->length = range->length; 4295 new_range->offset = range->offset; 4296 new_range->locked_ctx = range->locked_ctx; 4297 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4298 } 4299 4300 spdk_spin_unlock(&bdev->internal.spinlock); 4301 4302 return 0; 4303 } 4304 4305 static int 4306 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4307 void *cb_ctx) 4308 { 4309 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4310 struct spdk_bdev_io *bdev_io; 4311 uint64_t buf_len; 4312 4313 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4314 if (bdev_io->internal.ch == bdev_ch) { 4315 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4316 spdk_iobuf_entry_abort(ch, entry, buf_len); 4317 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4318 } 4319 4320 return 0; 4321 } 4322 4323 /* 4324 * Abort I/O that are waiting on a data buffer. 4325 */ 4326 static void 4327 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4328 { 4329 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_all_buf_io_cb, ch); 4330 } 4331 4332 /* 4333 * Abort I/O that are queued waiting for submission. These types of I/O are 4334 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4335 */ 4336 static void 4337 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4338 { 4339 struct spdk_bdev_io *bdev_io, *tmp; 4340 4341 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4342 if (bdev_io->internal.ch == ch) { 4343 TAILQ_REMOVE(queue, bdev_io, internal.link); 4344 /* 4345 * spdk_bdev_io_complete() assumes that the completed I/O had 4346 * been submitted to the bdev module. Since in this case it 4347 * hadn't, bump io_outstanding to account for the decrement 4348 * that spdk_bdev_io_complete() will do. 
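			 * The increment is balanced by that decrement, so io_outstanding is
			 * left unchanged once the aborted I/O has been completed.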
4349 */ 4350 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4351 bdev_io_increment_outstanding(ch, ch->shared_resource); 4352 } 4353 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4354 } 4355 } 4356 } 4357 4358 static bool 4359 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4360 { 4361 struct spdk_bdev_io *bdev_io; 4362 4363 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4364 if (bdev_io == bio_to_abort) { 4365 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4366 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4367 return true; 4368 } 4369 } 4370 4371 return false; 4372 } 4373 4374 static int 4375 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4376 { 4377 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4378 uint64_t buf_len; 4379 4380 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4381 if (bdev_io == bio_to_abort) { 4382 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4383 spdk_iobuf_entry_abort(ch, entry, buf_len); 4384 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4385 return 1; 4386 } 4387 4388 return 0; 4389 } 4390 4391 static bool 4392 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4393 { 4394 int rc; 4395 4396 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_buf_io_cb, bio_to_abort); 4397 return rc == 1; 4398 } 4399 4400 static void 4401 bdev_qos_channel_destroy(void *cb_arg) 4402 { 4403 struct spdk_bdev_qos *qos = cb_arg; 4404 4405 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4406 spdk_poller_unregister(&qos->poller); 4407 4408 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4409 4410 free(qos); 4411 } 4412 4413 static int 4414 bdev_qos_destroy(struct spdk_bdev *bdev) 4415 { 4416 int i; 4417 4418 /* 4419 * Cleanly shutting down the QoS poller is tricky, because 4420 * during the asynchronous operation the user could open 4421 * a new descriptor and create a new channel, spawning 4422 * a new QoS poller. 4423 * 4424 * The strategy is to create a new QoS structure here and swap it 4425 * in. The shutdown path then continues to refer to the old one 4426 * until it completes and then releases it. 4427 */ 4428 struct spdk_bdev_qos *new_qos, *old_qos; 4429 4430 old_qos = bdev->internal.qos; 4431 4432 new_qos = calloc(1, sizeof(*new_qos)); 4433 if (!new_qos) { 4434 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4435 return -ENOMEM; 4436 } 4437 4438 /* Copy the old QoS data into the newly allocated structure */ 4439 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4440 4441 /* Zero out the key parts of the QoS structure */ 4442 new_qos->ch = NULL; 4443 new_qos->thread = NULL; 4444 new_qos->poller = NULL; 4445 /* 4446 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4447 * It will be used later for the new QoS structure. 4448 */ 4449 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4450 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4451 new_qos->rate_limits[i].min_per_timeslice = 0; 4452 new_qos->rate_limits[i].max_per_timeslice = 0; 4453 } 4454 4455 bdev->internal.qos = new_qos; 4456 4457 if (old_qos->thread == NULL) { 4458 free(old_qos); 4459 } else { 4460 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4461 } 4462 4463 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4464 * been destroyed yet. 
The destruction path will end up waiting for the final 4465 * channel to be put before it releases resources. */ 4466 4467 return 0; 4468 } 4469 4470 void 4471 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4472 { 4473 total->bytes_read += add->bytes_read; 4474 total->num_read_ops += add->num_read_ops; 4475 total->bytes_written += add->bytes_written; 4476 total->num_write_ops += add->num_write_ops; 4477 total->bytes_unmapped += add->bytes_unmapped; 4478 total->num_unmap_ops += add->num_unmap_ops; 4479 total->bytes_copied += add->bytes_copied; 4480 total->num_copy_ops += add->num_copy_ops; 4481 total->read_latency_ticks += add->read_latency_ticks; 4482 total->write_latency_ticks += add->write_latency_ticks; 4483 total->unmap_latency_ticks += add->unmap_latency_ticks; 4484 total->copy_latency_ticks += add->copy_latency_ticks; 4485 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4486 total->max_read_latency_ticks = add->max_read_latency_ticks; 4487 } 4488 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4489 total->min_read_latency_ticks = add->min_read_latency_ticks; 4490 } 4491 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4492 total->max_write_latency_ticks = add->max_write_latency_ticks; 4493 } 4494 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4495 total->min_write_latency_ticks = add->min_write_latency_ticks; 4496 } 4497 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4498 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4499 } 4500 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4501 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4502 } 4503 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4504 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4505 } 4506 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4507 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4508 } 4509 } 4510 4511 static void 4512 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4513 { 4514 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4515 4516 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4517 memcpy(to_stat->io_error, from_stat->io_error, 4518 sizeof(struct spdk_bdev_io_error_stat)); 4519 } 4520 } 4521 4522 void 4523 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4524 { 4525 if (mode == SPDK_BDEV_RESET_STAT_NONE) { 4526 return; 4527 } 4528 4529 stat->max_read_latency_ticks = 0; 4530 stat->min_read_latency_ticks = UINT64_MAX; 4531 stat->max_write_latency_ticks = 0; 4532 stat->min_write_latency_ticks = UINT64_MAX; 4533 stat->max_unmap_latency_ticks = 0; 4534 stat->min_unmap_latency_ticks = UINT64_MAX; 4535 stat->max_copy_latency_ticks = 0; 4536 stat->min_copy_latency_ticks = UINT64_MAX; 4537 4538 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4539 return; 4540 } 4541 4542 stat->bytes_read = 0; 4543 stat->num_read_ops = 0; 4544 stat->bytes_written = 0; 4545 stat->num_write_ops = 0; 4546 stat->bytes_unmapped = 0; 4547 stat->num_unmap_ops = 0; 4548 stat->bytes_copied = 0; 4549 stat->num_copy_ops = 0; 4550 stat->read_latency_ticks = 0; 4551 stat->write_latency_ticks = 0; 4552 stat->unmap_latency_ticks = 0; 4553 stat->copy_latency_ticks = 0; 4554 4555 if (stat->io_error != NULL) { 4556 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 
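		/* Per-status error counters only exist when the stat was allocated with
		 * io_error_stat set (see bdev_alloc_io_stat()) and are only cleared for
		 * SPDK_BDEV_RESET_STAT_ALL, since the other modes return above.
		 */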
4557 } 4558 } 4559 4560 struct spdk_bdev_io_stat * 4561 bdev_alloc_io_stat(bool io_error_stat) 4562 { 4563 struct spdk_bdev_io_stat *stat; 4564 4565 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4566 if (stat == NULL) { 4567 return NULL; 4568 } 4569 4570 if (io_error_stat) { 4571 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4572 if (stat->io_error == NULL) { 4573 free(stat); 4574 return NULL; 4575 } 4576 } else { 4577 stat->io_error = NULL; 4578 } 4579 4580 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4581 4582 return stat; 4583 } 4584 4585 void 4586 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4587 { 4588 if (stat != NULL) { 4589 free(stat->io_error); 4590 free(stat); 4591 } 4592 } 4593 4594 void 4595 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4596 { 4597 int i; 4598 4599 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4600 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4601 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4602 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4603 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4604 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4605 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4606 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4607 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4608 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4609 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4610 stat->min_read_latency_ticks != UINT64_MAX ? 4611 stat->min_read_latency_ticks : 0); 4612 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4613 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4614 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4615 stat->min_write_latency_ticks != UINT64_MAX ? 4616 stat->min_write_latency_ticks : 0); 4617 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4618 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4619 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4620 stat->min_unmap_latency_ticks != UINT64_MAX ? 4621 stat->min_unmap_latency_ticks : 0); 4622 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4623 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4624 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4625 stat->min_copy_latency_ticks != UINT64_MAX ? 
4626 stat->min_copy_latency_ticks : 0); 4627 4628 if (stat->io_error != NULL) { 4629 spdk_json_write_named_object_begin(w, "io_error"); 4630 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4631 if (stat->io_error->error_status[i] != 0) { 4632 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4633 stat->io_error->error_status[i]); 4634 } 4635 } 4636 spdk_json_write_object_end(w); 4637 } 4638 } 4639 4640 static void 4641 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4642 { 4643 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4644 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4645 4646 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4647 bdev_abort_all_buf_io(mgmt_ch, ch); 4648 } 4649 4650 static void 4651 bdev_channel_destroy(void *io_device, void *ctx_buf) 4652 { 4653 struct spdk_bdev_channel *ch = ctx_buf; 4654 4655 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4656 spdk_get_thread()); 4657 4658 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0, 4659 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4660 4661 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4662 spdk_spin_lock(&ch->bdev->internal.spinlock); 4663 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4664 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4665 4666 bdev_channel_abort_queued_ios(ch); 4667 4668 if (ch->histogram) { 4669 spdk_histogram_data_free(ch->histogram); 4670 } 4671 4672 bdev_channel_destroy_resource(ch); 4673 } 4674 4675 /* 4676 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4677 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
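 * Aliases are inserted through this function as well (see spdk_bdev_alias_add()), so
 * adding an alias that collides with any name already in the tree fails with -EEXIST.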
4678 */ 4679 static int 4680 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4681 { 4682 struct spdk_bdev_name *tmp; 4683 4684 bdev_name->name = strdup(name); 4685 if (bdev_name->name == NULL) { 4686 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4687 return -ENOMEM; 4688 } 4689 4690 bdev_name->bdev = bdev; 4691 4692 spdk_spin_lock(&g_bdev_mgr.spinlock); 4693 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4694 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4695 4696 if (tmp != NULL) { 4697 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4698 free(bdev_name->name); 4699 return -EEXIST; 4700 } 4701 4702 return 0; 4703 } 4704 4705 static void 4706 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4707 { 4708 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4709 free(bdev_name->name); 4710 } 4711 4712 static void 4713 bdev_name_del(struct spdk_bdev_name *bdev_name) 4714 { 4715 spdk_spin_lock(&g_bdev_mgr.spinlock); 4716 bdev_name_del_unsafe(bdev_name); 4717 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4718 } 4719 4720 int 4721 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4722 { 4723 struct spdk_bdev_alias *tmp; 4724 int ret; 4725 4726 if (alias == NULL) { 4727 SPDK_ERRLOG("Empty alias passed\n"); 4728 return -EINVAL; 4729 } 4730 4731 tmp = calloc(1, sizeof(*tmp)); 4732 if (tmp == NULL) { 4733 SPDK_ERRLOG("Unable to allocate alias\n"); 4734 return -ENOMEM; 4735 } 4736 4737 ret = bdev_name_add(&tmp->alias, bdev, alias); 4738 if (ret != 0) { 4739 free(tmp); 4740 return ret; 4741 } 4742 4743 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4744 4745 return 0; 4746 } 4747 4748 static int 4749 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4750 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4751 { 4752 struct spdk_bdev_alias *tmp; 4753 4754 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4755 if (strcmp(alias, tmp->alias.name) == 0) { 4756 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4757 alias_del_fn(&tmp->alias); 4758 free(tmp); 4759 return 0; 4760 } 4761 } 4762 4763 return -ENOENT; 4764 } 4765 4766 int 4767 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4768 { 4769 int rc; 4770 4771 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4772 if (rc == -ENOENT) { 4773 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4774 } 4775 4776 return rc; 4777 } 4778 4779 void 4780 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4781 { 4782 struct spdk_bdev_alias *p, *tmp; 4783 4784 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4785 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4786 bdev_name_del(&p->alias); 4787 free(p); 4788 } 4789 } 4790 4791 struct spdk_io_channel * 4792 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4793 { 4794 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4795 } 4796 4797 void * 4798 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4799 { 4800 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4801 void *ctx = NULL; 4802 4803 if (bdev->fn_table->get_module_ctx) { 4804 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4805 } 4806 4807 return ctx; 4808 } 4809 4810 const char * 4811 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4812 { 4813 return bdev->module->name; 4814 } 4815 4816 const char * 4817 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4818 { 4819 return bdev->name; 4820 } 4821 4822 const char * 4823 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4824 { 4825 return bdev->product_name; 4826 } 4827 4828 
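/*
 * Illustrative usage sketch (not part of the SPDK API): callers commonly combine the
 * accessors below to derive geometry. The helper name here is made up for the example;
 * only the spdk_bdev_get_*() calls are real.
 *
 *	static uint64_t
 *	bdev_data_capacity_bytes(const struct spdk_bdev *bdev)
 *	{
 *		return spdk_bdev_get_num_blocks(bdev) * spdk_bdev_get_data_block_size(bdev);
 *	}
 *
 * Note that spdk_bdev_get_data_block_size() excludes interleaved metadata, unlike
 * spdk_bdev_get_block_size(), so the two can differ on bdevs with interleaved metadata.
 */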
const struct spdk_bdev_aliases_list * 4829 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4830 { 4831 return &bdev->aliases; 4832 } 4833 4834 uint32_t 4835 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4836 { 4837 return bdev->blocklen; 4838 } 4839 4840 uint32_t 4841 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4842 { 4843 return bdev->write_unit_size; 4844 } 4845 4846 uint64_t 4847 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4848 { 4849 return bdev->blockcnt; 4850 } 4851 4852 const char * 4853 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4854 { 4855 return qos_rpc_type[type]; 4856 } 4857 4858 void 4859 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4860 { 4861 int i; 4862 4863 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4864 4865 spdk_spin_lock(&bdev->internal.spinlock); 4866 if (bdev->internal.qos) { 4867 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4868 if (bdev->internal.qos->rate_limits[i].limit != 4869 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4870 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4871 if (bdev_qos_is_iops_rate_limit(i) == false) { 4872 /* Change from Byte to Megabyte which is user visible. */ 4873 limits[i] = limits[i] / 1024 / 1024; 4874 } 4875 } 4876 } 4877 } 4878 spdk_spin_unlock(&bdev->internal.spinlock); 4879 } 4880 4881 size_t 4882 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4883 { 4884 return 1 << bdev->required_alignment; 4885 } 4886 4887 uint32_t 4888 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4889 { 4890 return bdev->optimal_io_boundary; 4891 } 4892 4893 bool 4894 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4895 { 4896 return bdev->write_cache; 4897 } 4898 4899 const struct spdk_uuid * 4900 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4901 { 4902 return &bdev->uuid; 4903 } 4904 4905 uint16_t 4906 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4907 { 4908 return bdev->acwu; 4909 } 4910 4911 uint32_t 4912 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4913 { 4914 return bdev->md_len; 4915 } 4916 4917 bool 4918 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4919 { 4920 return (bdev->md_len != 0) && bdev->md_interleave; 4921 } 4922 4923 bool 4924 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4925 { 4926 return (bdev->md_len != 0) && !bdev->md_interleave; 4927 } 4928 4929 bool 4930 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4931 { 4932 return bdev->zoned; 4933 } 4934 4935 uint32_t 4936 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4937 { 4938 if (spdk_bdev_is_md_interleaved(bdev)) { 4939 return bdev->blocklen - bdev->md_len; 4940 } else { 4941 return bdev->blocklen; 4942 } 4943 } 4944 4945 uint32_t 4946 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4947 { 4948 return bdev->phys_blocklen; 4949 } 4950 4951 static uint32_t 4952 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4953 { 4954 if (!spdk_bdev_is_md_interleaved(bdev)) { 4955 return bdev->blocklen + bdev->md_len; 4956 } else { 4957 return bdev->blocklen; 4958 } 4959 } 4960 4961 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4962 typedef enum spdk_dif_type spdk_dif_type_t; 4963 typedef enum spdk_dif_pi_format spdk_dif_pi_format_t; 4964 4965 spdk_dif_type_t 4966 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4967 { 4968 if (bdev->md_len != 0) { 4969 return bdev->dif_type; 4970 } else { 4971 return SPDK_DIF_DISABLE; 4972 } 4973 } 4974 4975 spdk_dif_pi_format_t 4976 spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev) 4977 { 4978 return bdev->dif_pi_format; 4979 } 4980 4981 bool 4982 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4983 { 4984 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4985 return bdev->dif_is_head_of_md; 4986 } else { 4987 return false; 4988 } 4989 } 4990 4991 bool 4992 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4993 enum spdk_dif_check_type check_type) 4994 { 4995 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4996 return false; 4997 } 4998 4999 switch (check_type) { 5000 case SPDK_DIF_CHECK_TYPE_REFTAG: 5001 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 5002 case SPDK_DIF_CHECK_TYPE_APPTAG: 5003 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 5004 case SPDK_DIF_CHECK_TYPE_GUARD: 5005 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 5006 default: 5007 return false; 5008 } 5009 } 5010 5011 static uint32_t 5012 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 5013 { 5014 uint64_t aligned_length, max_write_blocks; 5015 5016 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 5017 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 5018 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 5019 5020 return max_write_blocks; 5021 } 5022 5023 uint32_t 5024 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 5025 { 5026 return bdev->max_copy; 5027 } 5028 5029 uint64_t 5030 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 5031 { 5032 return bdev->internal.measured_queue_depth; 5033 } 5034 5035 uint64_t 5036 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 5037 { 5038 return bdev->internal.period; 5039 } 5040 5041 uint64_t 5042 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 5043 { 5044 return bdev->internal.weighted_io_time; 5045 } 5046 5047 uint64_t 5048 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 5049 { 5050 return bdev->internal.io_time; 5051 } 5052 5053 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev) 5054 { 5055 return bdev->ctratt; 5056 } 5057 5058 uint32_t 5059 spdk_bdev_get_nvme_nsid(struct spdk_bdev *bdev) 5060 { 5061 return bdev->nsid; 5062 } 5063 5064 uint32_t 5065 spdk_bdev_desc_get_block_size(struct spdk_bdev_desc *desc) 5066 { 5067 struct spdk_bdev *bdev = desc->bdev; 5068 5069 return desc->opts.hide_metadata ? bdev->blocklen - bdev->md_len : bdev->blocklen; 5070 } 5071 5072 uint32_t 5073 spdk_bdev_desc_get_md_size(struct spdk_bdev_desc *desc) 5074 { 5075 struct spdk_bdev *bdev = desc->bdev; 5076 5077 return desc->opts.hide_metadata ? 0 : bdev->md_len; 5078 } 5079 5080 bool 5081 spdk_bdev_desc_is_md_interleaved(struct spdk_bdev_desc *desc) 5082 { 5083 struct spdk_bdev *bdev = desc->bdev; 5084 5085 return desc->opts.hide_metadata ? false : spdk_bdev_is_md_interleaved(bdev); 5086 } 5087 5088 bool 5089 spdk_bdev_desc_is_md_separate(struct spdk_bdev_desc *desc) 5090 { 5091 struct spdk_bdev *bdev = desc->bdev; 5092 5093 return desc->opts.hide_metadata ? 
false : spdk_bdev_is_md_separate(bdev); 5094 } 5095 5096 spdk_dif_type_t 5097 spdk_bdev_desc_get_dif_type(struct spdk_bdev_desc *desc) 5098 { 5099 struct spdk_bdev *bdev = desc->bdev; 5100 5101 return desc->opts.hide_metadata ? SPDK_DIF_DISABLE : spdk_bdev_get_dif_type(bdev); 5102 } 5103 5104 spdk_dif_pi_format_t 5105 spdk_bdev_desc_get_dif_pi_format(struct spdk_bdev_desc *desc) 5106 { 5107 struct spdk_bdev *bdev = desc->bdev; 5108 5109 return desc->opts.hide_metadata ? SPDK_DIF_PI_FORMAT_16 : spdk_bdev_get_dif_pi_format(bdev); 5110 } 5111 5112 bool 5113 spdk_bdev_desc_is_dif_head_of_md(struct spdk_bdev_desc *desc) 5114 { 5115 struct spdk_bdev *bdev = desc->bdev; 5116 5117 return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_head_of_md(bdev); 5118 } 5119 5120 bool 5121 spdk_bdev_desc_is_dif_check_enabled(struct spdk_bdev_desc *desc, 5122 enum spdk_dif_check_type check_type) 5123 { 5124 struct spdk_bdev *bdev = desc->bdev; 5125 5126 return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_check_enabled(bdev, check_type); 5127 } 5128 5129 static void bdev_update_qd_sampling_period(void *ctx); 5130 5131 static void 5132 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 5133 { 5134 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 5135 5136 if (bdev->internal.measured_queue_depth) { 5137 bdev->internal.io_time += bdev->internal.period; 5138 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 5139 } 5140 5141 bdev->internal.qd_poll_in_progress = false; 5142 5143 bdev_update_qd_sampling_period(bdev); 5144 } 5145 5146 static void 5147 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5148 struct spdk_io_channel *io_ch, void *_ctx) 5149 { 5150 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 5151 5152 bdev->internal.temporary_queue_depth += ch->io_outstanding; 5153 spdk_bdev_for_each_channel_continue(i, 0); 5154 } 5155 5156 static int 5157 bdev_calculate_measured_queue_depth(void *ctx) 5158 { 5159 struct spdk_bdev *bdev = ctx; 5160 5161 bdev->internal.qd_poll_in_progress = true; 5162 bdev->internal.temporary_queue_depth = 0; 5163 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 5164 return SPDK_POLLER_BUSY; 5165 } 5166 5167 static void 5168 bdev_update_qd_sampling_period(void *ctx) 5169 { 5170 struct spdk_bdev *bdev = ctx; 5171 5172 if (bdev->internal.period == bdev->internal.new_period) { 5173 return; 5174 } 5175 5176 if (bdev->internal.qd_poll_in_progress) { 5177 return; 5178 } 5179 5180 bdev->internal.period = bdev->internal.new_period; 5181 5182 spdk_poller_unregister(&bdev->internal.qd_poller); 5183 if (bdev->internal.period != 0) { 5184 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5185 bdev, bdev->internal.period); 5186 } else { 5187 spdk_bdev_close(bdev->internal.qd_desc); 5188 bdev->internal.qd_desc = NULL; 5189 } 5190 } 5191 5192 static void 5193 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5194 { 5195 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 5196 } 5197 5198 void 5199 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 5200 { 5201 int rc; 5202 5203 if (bdev->internal.new_period == period) { 5204 return; 5205 } 5206 5207 bdev->internal.new_period = period; 5208 5209 if (bdev->internal.qd_desc != NULL) { 5210 assert(bdev->internal.period != 0); 5211 5212 
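		/* The QD sampling poller is owned by the thread that opened qd_desc, so hand
		 * the period update over to that thread instead of touching the poller here.
		 */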
spdk_thread_send_msg(bdev->internal.qd_desc->thread, 5213 bdev_update_qd_sampling_period, bdev); 5214 return; 5215 } 5216 5217 assert(bdev->internal.period == 0); 5218 5219 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 5220 NULL, &bdev->internal.qd_desc); 5221 if (rc != 0) { 5222 return; 5223 } 5224 5225 bdev->internal.period = period; 5226 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5227 bdev, period); 5228 } 5229 5230 struct bdev_get_current_qd_ctx { 5231 uint64_t current_qd; 5232 spdk_bdev_get_current_qd_cb cb_fn; 5233 void *cb_arg; 5234 }; 5235 5236 static void 5237 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5238 { 5239 struct bdev_get_current_qd_ctx *ctx = _ctx; 5240 5241 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5242 5243 free(ctx); 5244 } 5245 5246 static void 5247 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5248 struct spdk_io_channel *io_ch, void *_ctx) 5249 { 5250 struct bdev_get_current_qd_ctx *ctx = _ctx; 5251 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5252 5253 ctx->current_qd += bdev_ch->io_outstanding; 5254 5255 spdk_bdev_for_each_channel_continue(i, 0); 5256 } 5257 5258 void 5259 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5260 void *cb_arg) 5261 { 5262 struct bdev_get_current_qd_ctx *ctx; 5263 5264 assert(cb_fn != NULL); 5265 5266 ctx = calloc(1, sizeof(*ctx)); 5267 if (ctx == NULL) { 5268 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5269 return; 5270 } 5271 5272 ctx->cb_fn = cb_fn; 5273 ctx->cb_arg = cb_arg; 5274 5275 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5276 } 5277 5278 static void 5279 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5280 { 5281 assert(desc->thread == spdk_get_thread()); 5282 5283 spdk_spin_lock(&desc->spinlock); 5284 desc->refs--; 5285 if (!desc->closed) { 5286 spdk_spin_unlock(&desc->spinlock); 5287 desc->callback.event_fn(type, 5288 desc->bdev, 5289 desc->callback.ctx); 5290 return; 5291 } else if (desc->refs == 0) { 5292 /* This descriptor was closed after this event_notify message was sent. 5293 * spdk_bdev_close() could not free the descriptor since this message was 5294 * in flight, so we free it now using bdev_desc_free(). 
5295 */ 5296 spdk_spin_unlock(&desc->spinlock); 5297 bdev_desc_free(desc); 5298 return; 5299 } 5300 spdk_spin_unlock(&desc->spinlock); 5301 } 5302 5303 static void 5304 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5305 { 5306 spdk_spin_lock(&desc->spinlock); 5307 desc->refs++; 5308 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5309 spdk_spin_unlock(&desc->spinlock); 5310 } 5311 5312 static void 5313 _resize_notify(void *ctx) 5314 { 5315 struct spdk_bdev_desc *desc = ctx; 5316 5317 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5318 } 5319 5320 int 5321 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5322 { 5323 struct spdk_bdev_desc *desc; 5324 int ret; 5325 5326 if (size == bdev->blockcnt) { 5327 return 0; 5328 } 5329 5330 spdk_spin_lock(&bdev->internal.spinlock); 5331 5332 /* bdev has open descriptors */ 5333 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5334 bdev->blockcnt > size) { 5335 ret = -EBUSY; 5336 } else { 5337 bdev->blockcnt = size; 5338 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5339 event_notify(desc, _resize_notify); 5340 } 5341 ret = 0; 5342 } 5343 5344 spdk_spin_unlock(&bdev->internal.spinlock); 5345 5346 return ret; 5347 } 5348 5349 /* 5350 * Convert I/O offset and length from bytes to blocks. 5351 * 5352 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5353 */ 5354 static uint64_t 5355 bdev_bytes_to_blocks(struct spdk_bdev_desc *desc, uint64_t offset_bytes, 5356 uint64_t *offset_blocks, uint64_t num_bytes, uint64_t *num_blocks) 5357 { 5358 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5359 uint32_t block_size = bdev->blocklen; 5360 uint8_t shift_cnt; 5361 5362 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
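	 * For example, with a 512-byte block size the shift count is 9, so an offset of
	 * 4096 bytes maps to block 8, and any byte value that is not a multiple of 512
	 * leaves a non-zero remainder, which the caller treats as an error.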
*/ 5363 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5364 shift_cnt = spdk_u32log2(block_size); 5365 *offset_blocks = offset_bytes >> shift_cnt; 5366 *num_blocks = num_bytes >> shift_cnt; 5367 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5368 (num_bytes - (*num_blocks << shift_cnt)); 5369 } else { 5370 *offset_blocks = offset_bytes / block_size; 5371 *num_blocks = num_bytes / block_size; 5372 return (offset_bytes % block_size) | (num_bytes % block_size); 5373 } 5374 } 5375 5376 static bool 5377 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5378 { 5379 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5380 * has been an overflow and hence the offset has been wrapped around */ 5381 if (offset_blocks + num_blocks < offset_blocks) { 5382 return false; 5383 } 5384 5385 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5386 if (offset_blocks + num_blocks > bdev->blockcnt) { 5387 return false; 5388 } 5389 5390 return true; 5391 } 5392 5393 static void 5394 bdev_seek_complete_cb(void *ctx) 5395 { 5396 struct spdk_bdev_io *bdev_io = ctx; 5397 5398 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5399 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5400 } 5401 5402 static int 5403 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5404 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5405 spdk_bdev_io_completion_cb cb, void *cb_arg) 5406 { 5407 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5408 struct spdk_bdev_io *bdev_io; 5409 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5410 5411 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5412 5413 /* Check if offset_blocks is valid looking at the validity of one block */ 5414 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5415 return -EINVAL; 5416 } 5417 5418 bdev_io = bdev_channel_get_io(channel); 5419 if (!bdev_io) { 5420 return -ENOMEM; 5421 } 5422 5423 bdev_io->internal.ch = channel; 5424 bdev_io->internal.desc = desc; 5425 bdev_io->type = io_type; 5426 bdev_io->u.bdev.offset_blocks = offset_blocks; 5427 bdev_io->u.bdev.memory_domain = NULL; 5428 bdev_io->u.bdev.memory_domain_ctx = NULL; 5429 bdev_io->u.bdev.accel_sequence = NULL; 5430 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5431 5432 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5433 /* In case bdev doesn't support seek to next data/hole offset, 5434 * it is assumed that only data and no holes are present */ 5435 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5436 bdev_io->u.bdev.seek.offset = offset_blocks; 5437 } else { 5438 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5439 } 5440 5441 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5442 return 0; 5443 } 5444 5445 bdev_io_submit(bdev_io); 5446 return 0; 5447 } 5448 5449 int 5450 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5451 uint64_t offset_blocks, 5452 spdk_bdev_io_completion_cb cb, void *cb_arg) 5453 { 5454 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5455 } 5456 5457 int 5458 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5459 uint64_t offset_blocks, 5460 spdk_bdev_io_completion_cb cb, void *cb_arg) 5461 { 5462 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5463 } 5464 5465 uint64_t 5466 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 5467 { 5468 return bdev_io->u.bdev.seek.offset; 5469 } 5470 5471 static int 5472 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5473 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5474 spdk_bdev_io_completion_cb cb, void *cb_arg) 5475 { 5476 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5477 struct spdk_bdev_io *bdev_io; 5478 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5479 5480 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5481 return -EINVAL; 5482 } 5483 5484 bdev_io = bdev_channel_get_io(channel); 5485 if (!bdev_io) { 5486 return -ENOMEM; 5487 } 5488 5489 bdev_io->internal.ch = channel; 5490 bdev_io->internal.desc = desc; 5491 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5492 bdev_io->u.bdev.iovs = &bdev_io->iov; 5493 bdev_io->u.bdev.iovs[0].iov_base = buf; 5494 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5495 bdev_io->u.bdev.iovcnt = 1; 5496 bdev_io->u.bdev.md_buf = md_buf; 5497 bdev_io->u.bdev.num_blocks = num_blocks; 5498 bdev_io->u.bdev.offset_blocks = offset_blocks; 5499 bdev_io->u.bdev.memory_domain = NULL; 5500 bdev_io->u.bdev.memory_domain_ctx = NULL; 5501 bdev_io->u.bdev.accel_sequence = NULL; 5502 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5503 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5504 5505 bdev_io_submit(bdev_io); 5506 return 0; 5507 } 5508 5509 int 5510 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5511 void *buf, uint64_t offset, uint64_t nbytes, 5512 spdk_bdev_io_completion_cb cb, void *cb_arg) 5513 { 5514 uint64_t offset_blocks, num_blocks; 5515 5516 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5517 return -EINVAL; 5518 } 5519 5520 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5521 } 5522 5523 int 5524 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5525 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5526 spdk_bdev_io_completion_cb cb, void *cb_arg) 5527 { 5528 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5529 } 5530 5531 int 5532 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5533 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5534 spdk_bdev_io_completion_cb cb, void *cb_arg) 5535 { 5536 struct iovec iov = { 5537 .iov_base = buf, 5538 }; 5539 5540 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5541 return -EINVAL; 5542 } 5543 5544 if (md_buf && !_is_buf_allocated(&iov)) { 5545 return -EINVAL; 5546 } 5547 5548 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5549 cb, cb_arg); 5550 } 5551 5552 int 5553 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5554 struct iovec *iov, int iovcnt, 5555 uint64_t offset, uint64_t nbytes, 5556 spdk_bdev_io_completion_cb cb, void *cb_arg) 5557 { 5558 uint64_t offset_blocks, num_blocks; 5559 5560 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5561 return -EINVAL; 5562 } 5563 5564 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5565 } 5566 5567 static int 5568 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5569 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5570 uint64_t num_blocks, struct spdk_memory_domain *domain, 
void *domain_ctx, 5571 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5572 spdk_bdev_io_completion_cb cb, void *cb_arg) 5573 { 5574 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5575 struct spdk_bdev_io *bdev_io; 5576 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5577 5578 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5579 return -EINVAL; 5580 } 5581 5582 bdev_io = bdev_channel_get_io(channel); 5583 if (spdk_unlikely(!bdev_io)) { 5584 return -ENOMEM; 5585 } 5586 5587 bdev_io->internal.ch = channel; 5588 bdev_io->internal.desc = desc; 5589 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5590 bdev_io->u.bdev.iovs = iov; 5591 bdev_io->u.bdev.iovcnt = iovcnt; 5592 bdev_io->u.bdev.md_buf = md_buf; 5593 bdev_io->u.bdev.num_blocks = num_blocks; 5594 bdev_io->u.bdev.offset_blocks = offset_blocks; 5595 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5596 5597 if (seq != NULL) { 5598 bdev_io->internal.f.has_accel_sequence = true; 5599 bdev_io->internal.accel_sequence = seq; 5600 } 5601 5602 if (domain != NULL) { 5603 bdev_io->internal.f.has_memory_domain = true; 5604 bdev_io->internal.memory_domain = domain; 5605 bdev_io->internal.memory_domain_ctx = domain_ctx; 5606 } 5607 5608 bdev_io->u.bdev.memory_domain = domain; 5609 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5610 bdev_io->u.bdev.accel_sequence = seq; 5611 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5612 5613 _bdev_io_submit_ext(desc, bdev_io); 5614 5615 return 0; 5616 } 5617 5618 int 5619 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5620 struct iovec *iov, int iovcnt, 5621 uint64_t offset_blocks, uint64_t num_blocks, 5622 spdk_bdev_io_completion_cb cb, void *cb_arg) 5623 { 5624 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5625 5626 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5627 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5628 } 5629 5630 int 5631 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5632 struct iovec *iov, int iovcnt, void *md_buf, 5633 uint64_t offset_blocks, uint64_t num_blocks, 5634 spdk_bdev_io_completion_cb cb, void *cb_arg) 5635 { 5636 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5637 5638 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5639 return -EINVAL; 5640 } 5641 5642 if (md_buf && !_is_buf_allocated(iov)) { 5643 return -EINVAL; 5644 } 5645 5646 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5647 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5648 } 5649 5650 static inline bool 5651 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5652 { 5653 /* 5654 * We check if opts size is at least of size when we first introduced 5655 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5656 * are not checked internal. 
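	 *
	 * A minimal caller-side sketch of how the ext opts are expected to be filled
	 * before e.g. spdk_bdev_readv_blocks_ext() (illustrative only; read_done_cb and
	 * my_ctx are hypothetical names):
	 *
	 *	struct spdk_bdev_ext_io_opts opts = {};
	 *	int rc;
	 *
	 *	opts.size = sizeof(opts);
	 *	opts.memory_domain = NULL;	// plain host memory in this sketch
	 *	opts.metadata = NULL;		// only valid on separate-metadata bdevs
	 *	rc = spdk_bdev_readv_blocks_ext(desc, ch, iov, iovcnt, offset_blocks,
	 *					num_blocks, read_done_cb, my_ctx, &opts);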
5657 */ 5658 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5659 sizeof(opts->metadata) && 5660 opts->size <= sizeof(*opts) && 5661 /* When memory domain is used, the user must provide data buffers */ 5662 (!opts->memory_domain || (iov && iov[0].iov_base)); 5663 } 5664 5665 int 5666 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5667 struct iovec *iov, int iovcnt, 5668 uint64_t offset_blocks, uint64_t num_blocks, 5669 spdk_bdev_io_completion_cb cb, void *cb_arg, 5670 struct spdk_bdev_ext_io_opts *opts) 5671 { 5672 struct spdk_memory_domain *domain = NULL; 5673 struct spdk_accel_sequence *seq = NULL; 5674 void *domain_ctx = NULL, *md = NULL; 5675 uint32_t dif_check_flags = 0; 5676 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5677 5678 if (opts) { 5679 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5680 return -EINVAL; 5681 } 5682 5683 md = opts->metadata; 5684 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5685 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5686 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5687 if (md) { 5688 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5689 return -EINVAL; 5690 } 5691 5692 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5693 return -EINVAL; 5694 } 5695 5696 if (spdk_unlikely(seq != NULL)) { 5697 return -EINVAL; 5698 } 5699 } 5700 } 5701 5702 dif_check_flags = bdev->dif_check_flags & 5703 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5704 5705 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5706 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5707 } 5708 5709 static int 5710 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5711 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5712 spdk_bdev_io_completion_cb cb, void *cb_arg) 5713 { 5714 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5715 struct spdk_bdev_io *bdev_io; 5716 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5717 5718 if (!desc->write) { 5719 return -EBADF; 5720 } 5721 5722 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5723 return -EINVAL; 5724 } 5725 5726 bdev_io = bdev_channel_get_io(channel); 5727 if (!bdev_io) { 5728 return -ENOMEM; 5729 } 5730 5731 bdev_io->internal.ch = channel; 5732 bdev_io->internal.desc = desc; 5733 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5734 bdev_io->u.bdev.iovs = &bdev_io->iov; 5735 bdev_io->u.bdev.iovs[0].iov_base = buf; 5736 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5737 bdev_io->u.bdev.iovcnt = 1; 5738 bdev_io->u.bdev.md_buf = md_buf; 5739 bdev_io->u.bdev.num_blocks = num_blocks; 5740 bdev_io->u.bdev.offset_blocks = offset_blocks; 5741 bdev_io->u.bdev.memory_domain = NULL; 5742 bdev_io->u.bdev.memory_domain_ctx = NULL; 5743 bdev_io->u.bdev.accel_sequence = NULL; 5744 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5745 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5746 5747 bdev_io_submit(bdev_io); 5748 return 0; 5749 } 5750 5751 int 5752 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5753 void *buf, uint64_t offset, uint64_t nbytes, 5754 spdk_bdev_io_completion_cb cb, void *cb_arg) 5755 { 5756 uint64_t offset_blocks, num_blocks; 5757 5758 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5759 return -EINVAL; 5760 } 5761 5762 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, 
cb_arg); 5763 } 5764 5765 int 5766 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5767 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5768 spdk_bdev_io_completion_cb cb, void *cb_arg) 5769 { 5770 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5771 cb, cb_arg); 5772 } 5773 5774 int 5775 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5776 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5777 spdk_bdev_io_completion_cb cb, void *cb_arg) 5778 { 5779 struct iovec iov = { 5780 .iov_base = buf, 5781 }; 5782 5783 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5784 return -EINVAL; 5785 } 5786 5787 if (md_buf && !_is_buf_allocated(&iov)) { 5788 return -EINVAL; 5789 } 5790 5791 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5792 cb, cb_arg); 5793 } 5794 5795 static int 5796 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5797 struct iovec *iov, int iovcnt, void *md_buf, 5798 uint64_t offset_blocks, uint64_t num_blocks, 5799 struct spdk_memory_domain *domain, void *domain_ctx, 5800 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5801 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 5802 spdk_bdev_io_completion_cb cb, void *cb_arg) 5803 { 5804 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5805 struct spdk_bdev_io *bdev_io; 5806 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5807 5808 if (spdk_unlikely(!desc->write)) { 5809 return -EBADF; 5810 } 5811 5812 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5813 return -EINVAL; 5814 } 5815 5816 bdev_io = bdev_channel_get_io(channel); 5817 if (spdk_unlikely(!bdev_io)) { 5818 return -ENOMEM; 5819 } 5820 5821 bdev_io->internal.ch = channel; 5822 bdev_io->internal.desc = desc; 5823 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5824 bdev_io->u.bdev.iovs = iov; 5825 bdev_io->u.bdev.iovcnt = iovcnt; 5826 bdev_io->u.bdev.md_buf = md_buf; 5827 bdev_io->u.bdev.num_blocks = num_blocks; 5828 bdev_io->u.bdev.offset_blocks = offset_blocks; 5829 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5830 if (seq != NULL) { 5831 bdev_io->internal.f.has_accel_sequence = true; 5832 bdev_io->internal.accel_sequence = seq; 5833 } 5834 5835 if (domain != NULL) { 5836 bdev_io->internal.f.has_memory_domain = true; 5837 bdev_io->internal.memory_domain = domain; 5838 bdev_io->internal.memory_domain_ctx = domain_ctx; 5839 } 5840 5841 bdev_io->u.bdev.memory_domain = domain; 5842 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5843 bdev_io->u.bdev.accel_sequence = seq; 5844 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5845 bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw; 5846 bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw; 5847 5848 _bdev_io_submit_ext(desc, bdev_io); 5849 5850 return 0; 5851 } 5852 5853 int 5854 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5855 struct iovec *iov, int iovcnt, 5856 uint64_t offset, uint64_t len, 5857 spdk_bdev_io_completion_cb cb, void *cb_arg) 5858 { 5859 uint64_t offset_blocks, num_blocks; 5860 5861 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) { 5862 return -EINVAL; 5863 } 5864 5865 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5866 } 5867 5868 int 5869 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5870 struct iovec *iov, 
int iovcnt, 5871 uint64_t offset_blocks, uint64_t num_blocks, 5872 spdk_bdev_io_completion_cb cb, void *cb_arg) 5873 { 5874 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5875 5876 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5877 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5878 cb, cb_arg); 5879 } 5880 5881 int 5882 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5883 struct iovec *iov, int iovcnt, void *md_buf, 5884 uint64_t offset_blocks, uint64_t num_blocks, 5885 spdk_bdev_io_completion_cb cb, void *cb_arg) 5886 { 5887 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5888 5889 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5890 return -EINVAL; 5891 } 5892 5893 if (md_buf && !_is_buf_allocated(iov)) { 5894 return -EINVAL; 5895 } 5896 5897 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5898 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5899 cb, cb_arg); 5900 } 5901 5902 int 5903 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5904 struct iovec *iov, int iovcnt, 5905 uint64_t offset_blocks, uint64_t num_blocks, 5906 spdk_bdev_io_completion_cb cb, void *cb_arg, 5907 struct spdk_bdev_ext_io_opts *opts) 5908 { 5909 struct spdk_memory_domain *domain = NULL; 5910 struct spdk_accel_sequence *seq = NULL; 5911 void *domain_ctx = NULL, *md = NULL; 5912 uint32_t dif_check_flags = 0; 5913 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5914 uint32_t nvme_cdw12_raw = 0; 5915 uint32_t nvme_cdw13_raw = 0; 5916 5917 if (opts) { 5918 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5919 return -EINVAL; 5920 } 5921 md = opts->metadata; 5922 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5923 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5924 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5925 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 5926 nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0); 5927 if (md) { 5928 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5929 return -EINVAL; 5930 } 5931 5932 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5933 return -EINVAL; 5934 } 5935 5936 if (spdk_unlikely(seq != NULL)) { 5937 return -EINVAL; 5938 } 5939 } 5940 } 5941 5942 dif_check_flags = bdev->dif_check_flags & 5943 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5944 5945 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5946 domain, domain_ctx, seq, dif_check_flags, 5947 nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg); 5948 } 5949 5950 static void 5951 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5952 { 5953 struct spdk_bdev_io *parent_io = cb_arg; 5954 struct spdk_bdev *bdev = parent_io->bdev; 5955 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5956 int i, rc = 0; 5957 5958 if (!success) { 5959 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5960 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5961 spdk_bdev_free_io(bdev_io); 5962 return; 5963 } 5964 5965 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5966 rc = memcmp(read_buf, 5967 parent_io->u.bdev.iovs[i].iov_base, 5968 parent_io->u.bdev.iovs[i].iov_len); 5969 if (rc) { 5970 break; 5971 } 5972 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5973 } 5974 5975 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5976 rc = 
memcmp(bdev_io->u.bdev.md_buf, 5977 parent_io->u.bdev.md_buf, 5978 spdk_bdev_get_md_size(bdev)); 5979 } 5980 5981 spdk_bdev_free_io(bdev_io); 5982 5983 if (rc == 0) { 5984 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5985 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5986 } else { 5987 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5988 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5989 } 5990 } 5991 5992 static void 5993 bdev_compare_do_read(void *_bdev_io) 5994 { 5995 struct spdk_bdev_io *bdev_io = _bdev_io; 5996 int rc; 5997 5998 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5999 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 6000 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6001 bdev_compare_do_read_done, bdev_io); 6002 6003 if (rc == -ENOMEM) { 6004 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 6005 } else if (rc != 0) { 6006 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6007 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6008 } 6009 } 6010 6011 static int 6012 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6013 struct iovec *iov, int iovcnt, void *md_buf, 6014 uint64_t offset_blocks, uint64_t num_blocks, 6015 spdk_bdev_io_completion_cb cb, void *cb_arg) 6016 { 6017 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6018 struct spdk_bdev_io *bdev_io; 6019 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6020 6021 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6022 return -EINVAL; 6023 } 6024 6025 bdev_io = bdev_channel_get_io(channel); 6026 if (!bdev_io) { 6027 return -ENOMEM; 6028 } 6029 6030 bdev_io->internal.ch = channel; 6031 bdev_io->internal.desc = desc; 6032 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6033 bdev_io->u.bdev.iovs = iov; 6034 bdev_io->u.bdev.iovcnt = iovcnt; 6035 bdev_io->u.bdev.md_buf = md_buf; 6036 bdev_io->u.bdev.num_blocks = num_blocks; 6037 bdev_io->u.bdev.offset_blocks = offset_blocks; 6038 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6039 bdev_io->u.bdev.memory_domain = NULL; 6040 bdev_io->u.bdev.memory_domain_ctx = NULL; 6041 bdev_io->u.bdev.accel_sequence = NULL; 6042 6043 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6044 bdev_io_submit(bdev_io); 6045 return 0; 6046 } 6047 6048 bdev_compare_do_read(bdev_io); 6049 6050 return 0; 6051 } 6052 6053 int 6054 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6055 struct iovec *iov, int iovcnt, 6056 uint64_t offset_blocks, uint64_t num_blocks, 6057 spdk_bdev_io_completion_cb cb, void *cb_arg) 6058 { 6059 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 6060 num_blocks, cb, cb_arg); 6061 } 6062 6063 int 6064 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6065 struct iovec *iov, int iovcnt, void *md_buf, 6066 uint64_t offset_blocks, uint64_t num_blocks, 6067 spdk_bdev_io_completion_cb cb, void *cb_arg) 6068 { 6069 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6070 return -EINVAL; 6071 } 6072 6073 if (md_buf && !_is_buf_allocated(iov)) { 6074 return -EINVAL; 6075 } 6076 6077 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 6078 num_blocks, cb, cb_arg); 6079 } 6080 6081 static int 6082 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6083 void *buf, void *md_buf, uint64_t offset_blocks, 
uint64_t num_blocks, 6084 spdk_bdev_io_completion_cb cb, void *cb_arg) 6085 { 6086 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6087 struct spdk_bdev_io *bdev_io; 6088 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6089 6090 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6091 return -EINVAL; 6092 } 6093 6094 bdev_io = bdev_channel_get_io(channel); 6095 if (!bdev_io) { 6096 return -ENOMEM; 6097 } 6098 6099 bdev_io->internal.ch = channel; 6100 bdev_io->internal.desc = desc; 6101 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6102 bdev_io->u.bdev.iovs = &bdev_io->iov; 6103 bdev_io->u.bdev.iovs[0].iov_base = buf; 6104 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 6105 bdev_io->u.bdev.iovcnt = 1; 6106 bdev_io->u.bdev.md_buf = md_buf; 6107 bdev_io->u.bdev.num_blocks = num_blocks; 6108 bdev_io->u.bdev.offset_blocks = offset_blocks; 6109 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6110 bdev_io->u.bdev.memory_domain = NULL; 6111 bdev_io->u.bdev.memory_domain_ctx = NULL; 6112 bdev_io->u.bdev.accel_sequence = NULL; 6113 6114 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6115 bdev_io_submit(bdev_io); 6116 return 0; 6117 } 6118 6119 bdev_compare_do_read(bdev_io); 6120 6121 return 0; 6122 } 6123 6124 int 6125 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6126 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6127 spdk_bdev_io_completion_cb cb, void *cb_arg) 6128 { 6129 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6130 cb, cb_arg); 6131 } 6132 6133 int 6134 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6135 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6136 spdk_bdev_io_completion_cb cb, void *cb_arg) 6137 { 6138 struct iovec iov = { 6139 .iov_base = buf, 6140 }; 6141 6142 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6143 return -EINVAL; 6144 } 6145 6146 if (md_buf && !_is_buf_allocated(&iov)) { 6147 return -EINVAL; 6148 } 6149 6150 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6151 cb, cb_arg); 6152 } 6153 6154 static void 6155 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 6156 { 6157 struct spdk_bdev_io *bdev_io = ctx; 6158 6159 if (unlock_status) { 6160 SPDK_ERRLOG("LBA range unlock failed\n"); 6161 } 6162 6163 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 6164 false, bdev_io->internal.caller_ctx); 6165 } 6166 6167 static void 6168 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 6169 { 6170 bdev_io->internal.status = status; 6171 6172 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 6173 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6174 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 6175 } 6176 6177 static void 6178 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6179 { 6180 struct spdk_bdev_io *parent_io = cb_arg; 6181 6182 if (!success) { 6183 SPDK_ERRLOG("Compare and write operation failed\n"); 6184 } 6185 6186 spdk_bdev_free_io(bdev_io); 6187 6188 bdev_comparev_and_writev_blocks_unlock(parent_io, 6189 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 6190 } 6191 6192 static void 6193 bdev_compare_and_write_do_write(void *_bdev_io) 6194 { 6195 struct spdk_bdev_io *bdev_io = _bdev_io; 6196 int rc; 6197 6198 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 6199 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6200 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 6201 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6202 bdev_compare_and_write_do_write_done, bdev_io); 6203 6204 6205 if (rc == -ENOMEM) { 6206 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 6207 } else if (rc != 0) { 6208 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6209 } 6210 } 6211 6212 static void 6213 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6214 { 6215 struct spdk_bdev_io *parent_io = cb_arg; 6216 6217 spdk_bdev_free_io(bdev_io); 6218 6219 if (!success) { 6220 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 6221 return; 6222 } 6223 6224 bdev_compare_and_write_do_write(parent_io); 6225 } 6226 6227 static void 6228 bdev_compare_and_write_do_compare(void *_bdev_io) 6229 { 6230 struct spdk_bdev_io *bdev_io = _bdev_io; 6231 int rc; 6232 6233 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 6234 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 6235 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6236 bdev_compare_and_write_do_compare_done, bdev_io); 6237 6238 if (rc == -ENOMEM) { 6239 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 6240 } else if (rc != 0) { 6241 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 6242 } 6243 } 6244 6245 static void 6246 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 6247 { 6248 struct spdk_bdev_io *bdev_io = ctx; 6249 6250 if (status) { 6251 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 6252 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6253 return; 6254 } 6255 6256 bdev_compare_and_write_do_compare(bdev_io); 6257 } 6258 6259 int 6260 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6261 struct iovec *compare_iov, int compare_iovcnt, 6262 struct iovec *write_iov, int write_iovcnt, 6263 uint64_t offset_blocks, uint64_t num_blocks, 6264 spdk_bdev_io_completion_cb cb, void *cb_arg) 6265 { 6266 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6267 struct spdk_bdev_io *bdev_io; 6268 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6269 6270 if (!desc->write) { 6271 return -EBADF; 6272 } 6273 6274 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6275 return -EINVAL; 6276 } 6277 6278 if (num_blocks > bdev->acwu) { 6279 return -EINVAL; 6280 } 6281 6282 bdev_io = bdev_channel_get_io(channel); 6283 if (!bdev_io) { 6284 return -ENOMEM; 6285 } 6286 6287 bdev_io->internal.ch = channel; 6288 bdev_io->internal.desc = desc; 6289 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6290 bdev_io->u.bdev.iovs = compare_iov; 6291 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6292 bdev_io->u.bdev.fused_iovs = write_iov; 6293 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6294 bdev_io->u.bdev.md_buf = NULL; 6295 bdev_io->u.bdev.num_blocks = num_blocks; 6296 bdev_io->u.bdev.offset_blocks = offset_blocks; 6297 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6298 
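	/* From here there are two paths: if the backing module natively supports
	 * COMPARE_AND_WRITE, the fused request is submitted below as a single bdev_io;
	 * otherwise it is emulated by locking the LBA range, issuing the compare, then
	 * the write, and finally unlocking the range (see
	 * bdev_comparev_and_writev_blocks_locked() and its completion chain above). */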
bdev_io->u.bdev.memory_domain = NULL; 6299 bdev_io->u.bdev.memory_domain_ctx = NULL; 6300 bdev_io->u.bdev.accel_sequence = NULL; 6301 6302 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6303 bdev_io_submit(bdev_io); 6304 return 0; 6305 } 6306 6307 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6308 bdev_comparev_and_writev_blocks_locked, bdev_io); 6309 } 6310 6311 int 6312 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6313 struct iovec *iov, int iovcnt, 6314 uint64_t offset_blocks, uint64_t num_blocks, 6315 bool populate, 6316 spdk_bdev_io_completion_cb cb, void *cb_arg) 6317 { 6318 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6319 struct spdk_bdev_io *bdev_io; 6320 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6321 6322 if (!desc->write) { 6323 return -EBADF; 6324 } 6325 6326 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6327 return -EINVAL; 6328 } 6329 6330 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6331 return -ENOTSUP; 6332 } 6333 6334 bdev_io = bdev_channel_get_io(channel); 6335 if (!bdev_io) { 6336 return -ENOMEM; 6337 } 6338 6339 bdev_io->internal.ch = channel; 6340 bdev_io->internal.desc = desc; 6341 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6342 bdev_io->u.bdev.num_blocks = num_blocks; 6343 bdev_io->u.bdev.offset_blocks = offset_blocks; 6344 bdev_io->u.bdev.iovs = iov; 6345 bdev_io->u.bdev.iovcnt = iovcnt; 6346 bdev_io->u.bdev.md_buf = NULL; 6347 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6348 bdev_io->u.bdev.zcopy.commit = 0; 6349 bdev_io->u.bdev.zcopy.start = 1; 6350 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6351 bdev_io->u.bdev.memory_domain = NULL; 6352 bdev_io->u.bdev.memory_domain_ctx = NULL; 6353 bdev_io->u.bdev.accel_sequence = NULL; 6354 6355 bdev_io_submit(bdev_io); 6356 6357 return 0; 6358 } 6359 6360 int 6361 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6362 spdk_bdev_io_completion_cb cb, void *cb_arg) 6363 { 6364 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6365 return -EINVAL; 6366 } 6367 6368 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0;
	bdev_io->u.bdev.zcopy.start = 0;
	bdev_io->internal.caller_ctx = cb_arg;
	bdev_io->internal.cb = cb;
	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;

	bdev_io_submit(bdev_io);

	return 0;
}

int
spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset, uint64_t len,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      uint64_t offset_blocks, uint64_t num_blocks,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
	    !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
		return -ENOTSUP;
	}

	bdev_io = bdev_channel_get_io(channel);

	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);
	bdev_io->u.bdev.memory_domain = NULL;
	bdev_io->u.bdev.memory_domain_ctx = NULL;
	bdev_io->u.bdev.accel_sequence = NULL;

	/* If the write_zeroes size is large and should be split, use the generic split
	 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not.
	 *
	 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
	 * or emulate it using a regular write request otherwise.
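	 *
	 * Caller-side usage is the same as for any other block I/O; a minimal sketch
	 * (zeroes_done and my_ctx are hypothetical names):
	 *
	 *	static void
	 *	zeroes_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
	 *	{
	 *		spdk_bdev_free_io(bdev_io);
	 *	}
	 *
	 *	rc = spdk_bdev_write_zeroes_blocks(desc, ch, 0, 1024, zeroes_done, my_ctx);
	 *
	 * A return value of -ENOTSUP means the bdev supports neither WRITE_ZEROES nor
	 * plain WRITE, as checked earlier in this function.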
6436 */ 6437 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6438 bdev_io->internal.f.split) { 6439 bdev_io_submit(bdev_io); 6440 return 0; 6441 } 6442 6443 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6444 6445 return bdev_write_zero_buffer(bdev_io); 6446 } 6447 6448 int 6449 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6450 uint64_t offset, uint64_t nbytes, 6451 spdk_bdev_io_completion_cb cb, void *cb_arg) 6452 { 6453 uint64_t offset_blocks, num_blocks; 6454 6455 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 6456 return -EINVAL; 6457 } 6458 6459 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6460 } 6461 6462 static void 6463 bdev_io_complete_cb(void *ctx) 6464 { 6465 struct spdk_bdev_io *bdev_io = ctx; 6466 6467 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6468 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6469 } 6470 6471 int 6472 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6473 uint64_t offset_blocks, uint64_t num_blocks, 6474 spdk_bdev_io_completion_cb cb, void *cb_arg) 6475 { 6476 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6477 struct spdk_bdev_io *bdev_io; 6478 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6479 6480 if (!desc->write) { 6481 return -EBADF; 6482 } 6483 6484 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6485 return -EINVAL; 6486 } 6487 6488 bdev_io = bdev_channel_get_io(channel); 6489 if (!bdev_io) { 6490 return -ENOMEM; 6491 } 6492 6493 bdev_io->internal.ch = channel; 6494 bdev_io->internal.desc = desc; 6495 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6496 6497 bdev_io->u.bdev.iovs = &bdev_io->iov; 6498 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6499 bdev_io->u.bdev.iovs[0].iov_len = 0; 6500 bdev_io->u.bdev.iovcnt = 1; 6501 6502 bdev_io->u.bdev.offset_blocks = offset_blocks; 6503 bdev_io->u.bdev.num_blocks = num_blocks; 6504 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6505 bdev_io->u.bdev.memory_domain = NULL; 6506 bdev_io->u.bdev.memory_domain_ctx = NULL; 6507 bdev_io->u.bdev.accel_sequence = NULL; 6508 6509 if (num_blocks == 0) { 6510 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6511 return 0; 6512 } 6513 6514 bdev_io_submit(bdev_io); 6515 return 0; 6516 } 6517 6518 int 6519 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6520 uint64_t offset, uint64_t length, 6521 spdk_bdev_io_completion_cb cb, void *cb_arg) 6522 { 6523 uint64_t offset_blocks, num_blocks; 6524 6525 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, length, &num_blocks) != 0) { 6526 return -EINVAL; 6527 } 6528 6529 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6530 } 6531 6532 int 6533 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6534 uint64_t offset_blocks, uint64_t num_blocks, 6535 spdk_bdev_io_completion_cb cb, void *cb_arg) 6536 { 6537 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6538 struct spdk_bdev_io *bdev_io; 6539 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6540 6541 if (!desc->write) { 6542 return -EBADF; 6543 } 6544 6545 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH))) { 6546 return -ENOTSUP; 6547 } 6548 6549 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6550 return -EINVAL; 6551 } 6552 6553 bdev_io = bdev_channel_get_io(channel); 6554 if (!bdev_io) { 6555 return 
-ENOMEM; 6556 } 6557 6558 bdev_io->internal.ch = channel; 6559 bdev_io->internal.desc = desc; 6560 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6561 bdev_io->u.bdev.iovs = NULL; 6562 bdev_io->u.bdev.iovcnt = 0; 6563 bdev_io->u.bdev.offset_blocks = offset_blocks; 6564 bdev_io->u.bdev.num_blocks = num_blocks; 6565 bdev_io->u.bdev.memory_domain = NULL; 6566 bdev_io->u.bdev.memory_domain_ctx = NULL; 6567 bdev_io->u.bdev.accel_sequence = NULL; 6568 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6569 6570 bdev_io_submit(bdev_io); 6571 return 0; 6572 } 6573 6574 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6575 6576 static void 6577 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6578 { 6579 struct spdk_bdev_io *bdev_io = _ctx; 6580 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 6581 6582 if (status == -EBUSY) { 6583 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6584 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6585 bdev_io, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6586 } else { 6587 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6588 /* If outstanding IOs are still present and reset_io_drain_timeout 6589 * seconds passed, start the reset. */ 6590 bdev_io_submit_reset(bdev_io); 6591 } else { 6592 /* We still have in progress memory domain pull/push or we're 6593 * executing accel sequence. Since we cannot abort either of those 6594 * operations, fail the reset request. */ 6595 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6596 } 6597 } 6598 } else { 6599 SPDK_DEBUGLOG(bdev, 6600 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6601 ch->bdev->name); 6602 /* Mark the completion status as a SUCCESS and complete the reset. */ 6603 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6604 } 6605 } 6606 6607 static void 6608 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6609 struct spdk_io_channel *io_ch, void *_ctx) 6610 { 6611 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6612 int status = 0; 6613 6614 if (cur_ch->io_outstanding > 0 || 6615 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6616 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6617 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 6618 * further iteration over the rest of the channels and pass non-zero status 6619 * to the callback function. 
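		 *
		 * This follows the general spdk_bdev_for_each_channel() contract used
		 * throughout this file: every per-channel callback reports its result
		 * through spdk_bdev_for_each_channel_continue(i, status), a non-zero
		 * status stops the iteration early, and the final status is delivered
		 * to the "done" callback, here bdev_reset_check_outstanding_io_done().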
*/ 6620 status = -EBUSY; 6621 } 6622 spdk_bdev_for_each_channel_continue(i, status); 6623 } 6624 6625 static int 6626 bdev_reset_poll_for_outstanding_io(void *ctx) 6627 { 6628 struct spdk_bdev_io *bdev_io = ctx; 6629 6630 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6631 spdk_bdev_for_each_channel(bdev_io->bdev, bdev_reset_check_outstanding_io, bdev_io, 6632 bdev_reset_check_outstanding_io_done); 6633 6634 return SPDK_POLLER_BUSY; 6635 } 6636 6637 static void 6638 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6639 { 6640 struct spdk_bdev_io *bdev_io = _ctx; 6641 6642 if (bdev->reset_io_drain_timeout == 0) { 6643 bdev_io_submit_reset(bdev_io); 6644 return; 6645 } 6646 6647 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6648 (bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6649 6650 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6651 * submit the reset to the underlying module only if outstanding I/O 6652 * remain after reset_io_drain_timeout seconds have passed. */ 6653 spdk_bdev_for_each_channel(bdev, bdev_reset_check_outstanding_io, bdev_io, 6654 bdev_reset_check_outstanding_io_done); 6655 } 6656 6657 static void 6658 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6659 struct spdk_io_channel *ch, void *_ctx) 6660 { 6661 struct spdk_bdev_channel *channel; 6662 struct spdk_bdev_mgmt_channel *mgmt_channel; 6663 struct spdk_bdev_shared_resource *shared_resource; 6664 bdev_io_tailq_t tmp_queued; 6665 6666 TAILQ_INIT(&tmp_queued); 6667 6668 channel = __io_ch_to_bdev_ch(ch); 6669 shared_resource = channel->shared_resource; 6670 mgmt_channel = shared_resource->mgmt_ch; 6671 6672 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6673 6674 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6675 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6676 } 6677 6678 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6679 bdev_abort_all_buf_io(mgmt_channel, channel); 6680 bdev_abort_all_queued_io(&tmp_queued, channel); 6681 6682 spdk_bdev_for_each_channel_continue(i, 0); 6683 } 6684 6685 static void 6686 bdev_start_reset(struct spdk_bdev_io *bdev_io) 6687 { 6688 struct spdk_bdev *bdev = bdev_io->bdev; 6689 bool freeze_channel = false; 6690 6691 bdev_ch_add_to_io_submitted(bdev_io); 6692 6693 /** 6694 * Take a channel reference for the target bdev for the life of this 6695 * reset. This guards against the channel getting destroyed before 6696 * the reset is completed. We will release the reference when this 6697 * reset is completed. 
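	 *
	 * For reference, callers enter this path through the public API roughly as
	 * follows (reset_done and my_ctx are hypothetical names):
	 *
	 *	static void
	 *	reset_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
	 *	{
	 *		spdk_bdev_free_io(bdev_io);
	 *	}
	 *
	 *	rc = spdk_bdev_reset(desc, ch, reset_done, my_ctx);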
6698 */ 6699 bdev_io->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6700 6701 spdk_spin_lock(&bdev->internal.spinlock); 6702 if (bdev->internal.reset_in_progress == NULL) { 6703 bdev->internal.reset_in_progress = bdev_io; 6704 freeze_channel = true; 6705 } else { 6706 TAILQ_INSERT_TAIL(&bdev->internal.queued_resets, bdev_io, internal.link); 6707 } 6708 spdk_spin_unlock(&bdev->internal.spinlock); 6709 6710 if (freeze_channel) { 6711 spdk_bdev_for_each_channel(bdev, bdev_reset_freeze_channel, bdev_io, 6712 bdev_reset_freeze_channel_done); 6713 } 6714 } 6715 6716 int 6717 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6718 spdk_bdev_io_completion_cb cb, void *cb_arg) 6719 { 6720 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6721 struct spdk_bdev_io *bdev_io; 6722 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6723 6724 bdev_io = bdev_channel_get_io(channel); 6725 if (!bdev_io) { 6726 return -ENOMEM; 6727 } 6728 6729 bdev_io->internal.ch = channel; 6730 bdev_io->internal.desc = desc; 6731 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6732 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6733 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6734 6735 bdev_start_reset(bdev_io); 6736 return 0; 6737 } 6738 6739 void 6740 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6741 struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode reset_mode) 6742 { 6743 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6744 6745 bdev_get_io_stat(stat, channel->stat); 6746 spdk_bdev_reset_io_stat(channel->stat, reset_mode); 6747 } 6748 6749 static void 6750 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6751 { 6752 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6753 6754 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6755 bdev_iostat_ctx->cb_arg, 0); 6756 free(bdev_iostat_ctx); 6757 } 6758 6759 static void 6760 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6761 struct spdk_io_channel *ch, void *_ctx) 6762 { 6763 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6764 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6765 6766 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6767 spdk_bdev_reset_io_stat(channel->stat, bdev_iostat_ctx->reset_mode); 6768 spdk_bdev_for_each_channel_continue(i, 0); 6769 } 6770 6771 void 6772 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6773 enum spdk_bdev_reset_stat_mode reset_mode, spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6774 { 6775 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6776 6777 assert(bdev != NULL); 6778 assert(stat != NULL); 6779 assert(cb != NULL); 6780 6781 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6782 if (bdev_iostat_ctx == NULL) { 6783 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6784 cb(bdev, stat, cb_arg, -ENOMEM); 6785 return; 6786 } 6787 6788 bdev_iostat_ctx->stat = stat; 6789 bdev_iostat_ctx->cb = cb; 6790 bdev_iostat_ctx->cb_arg = cb_arg; 6791 bdev_iostat_ctx->reset_mode = reset_mode; 6792 6793 /* Start with the statistics from previously deleted channels. */ 6794 spdk_spin_lock(&bdev->internal.spinlock); 6795 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6796 spdk_bdev_reset_io_stat(bdev->internal.stat, reset_mode); 6797 spdk_spin_unlock(&bdev->internal.spinlock); 6798 6799 /* Then iterate and add the statistics from each existing channel. 
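	 *
	 * A minimal caller-side sketch (stat_done and my_ctx are hypothetical; the
	 * SPDK_BDEV_RESET_STAT_NONE reset mode shown here is assumed to mean "collect
	 * without clearing", and the caller owns the stat buffer until the callback):
	 *
	 *	static void
	 *	stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
	 *		  void *cb_arg, int rc)
	 *	{
	 *		// stat now aggregates every live channel plus deleted-channel history
	 *	}
	 *
	 *	spdk_bdev_get_device_stat(bdev, stat, SPDK_BDEV_RESET_STAT_NONE,
	 *				  stat_done, my_ctx);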
*/ 6800 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6801 bdev_get_device_stat_done); 6802 } 6803 6804 struct bdev_iostat_reset_ctx { 6805 enum spdk_bdev_reset_stat_mode mode; 6806 bdev_reset_device_stat_cb cb; 6807 void *cb_arg; 6808 }; 6809 6810 static void 6811 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6812 { 6813 struct bdev_iostat_reset_ctx *ctx = _ctx; 6814 6815 ctx->cb(bdev, ctx->cb_arg, 0); 6816 6817 free(ctx); 6818 } 6819 6820 static void 6821 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6822 struct spdk_io_channel *ch, void *_ctx) 6823 { 6824 struct bdev_iostat_reset_ctx *ctx = _ctx; 6825 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6826 6827 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6828 6829 spdk_bdev_for_each_channel_continue(i, 0); 6830 } 6831 6832 void 6833 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6834 bdev_reset_device_stat_cb cb, void *cb_arg) 6835 { 6836 struct bdev_iostat_reset_ctx *ctx; 6837 6838 assert(bdev != NULL); 6839 assert(cb != NULL); 6840 6841 ctx = calloc(1, sizeof(*ctx)); 6842 if (ctx == NULL) { 6843 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6844 cb(bdev, cb_arg, -ENOMEM); 6845 return; 6846 } 6847 6848 ctx->mode = mode; 6849 ctx->cb = cb; 6850 ctx->cb_arg = cb_arg; 6851 6852 spdk_spin_lock(&bdev->internal.spinlock); 6853 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6854 spdk_spin_unlock(&bdev->internal.spinlock); 6855 6856 spdk_bdev_for_each_channel(bdev, 6857 bdev_reset_each_channel_stat, 6858 ctx, 6859 bdev_reset_device_stat_done); 6860 } 6861 6862 int 6863 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6864 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6865 spdk_bdev_io_completion_cb cb, void *cb_arg) 6866 { 6867 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6868 struct spdk_bdev_io *bdev_io; 6869 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6870 6871 if (!desc->write) { 6872 return -EBADF; 6873 } 6874 6875 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6876 return -ENOTSUP; 6877 } 6878 6879 bdev_io = bdev_channel_get_io(channel); 6880 if (!bdev_io) { 6881 return -ENOMEM; 6882 } 6883 6884 bdev_io->internal.ch = channel; 6885 bdev_io->internal.desc = desc; 6886 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6887 bdev_io->u.nvme_passthru.cmd = *cmd; 6888 bdev_io->u.nvme_passthru.buf = buf; 6889 bdev_io->u.nvme_passthru.nbytes = nbytes; 6890 bdev_io->u.nvme_passthru.md_buf = NULL; 6891 bdev_io->u.nvme_passthru.md_len = 0; 6892 6893 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6894 6895 bdev_io_submit(bdev_io); 6896 return 0; 6897 } 6898 6899 int 6900 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6901 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6902 spdk_bdev_io_completion_cb cb, void *cb_arg) 6903 { 6904 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6905 struct spdk_bdev_io *bdev_io; 6906 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6907 6908 if (!desc->write) { 6909 /* 6910 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6911 * to easily determine if the command is a read or write, but for now just 6912 * do not allow io_passthru with a read-only descriptor. 
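		 *
		 * For completeness, a write-capable descriptor drives this API roughly as
		 * follows (illustrative; the opcode/nsid values and passthru_done are
		 * hypothetical):
		 *
		 *	struct spdk_nvme_cmd cmd = {};
		 *
		 *	cmd.opc = SPDK_NVME_OPC_FLUSH;
		 *	cmd.nsid = 1;
		 *	rc = spdk_bdev_nvme_io_passthru(desc, ch, &cmd, NULL, 0,
		 *					passthru_done, my_ctx);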
6913 */ 6914 return -EBADF; 6915 } 6916 6917 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6918 return -ENOTSUP; 6919 } 6920 6921 bdev_io = bdev_channel_get_io(channel); 6922 if (!bdev_io) { 6923 return -ENOMEM; 6924 } 6925 6926 bdev_io->internal.ch = channel; 6927 bdev_io->internal.desc = desc; 6928 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6929 bdev_io->u.nvme_passthru.cmd = *cmd; 6930 bdev_io->u.nvme_passthru.buf = buf; 6931 bdev_io->u.nvme_passthru.nbytes = nbytes; 6932 bdev_io->u.nvme_passthru.md_buf = NULL; 6933 bdev_io->u.nvme_passthru.md_len = 0; 6934 6935 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6936 6937 bdev_io_submit(bdev_io); 6938 return 0; 6939 } 6940 6941 int 6942 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6943 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6944 spdk_bdev_io_completion_cb cb, void *cb_arg) 6945 { 6946 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6947 struct spdk_bdev_io *bdev_io; 6948 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6949 6950 if (!desc->write) { 6951 /* 6952 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6953 * to easily determine if the command is a read or write, but for now just 6954 * do not allow io_passthru with a read-only descriptor. 6955 */ 6956 return -EBADF; 6957 } 6958 6959 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6960 return -ENOTSUP; 6961 } 6962 6963 bdev_io = bdev_channel_get_io(channel); 6964 if (!bdev_io) { 6965 return -ENOMEM; 6966 } 6967 6968 bdev_io->internal.ch = channel; 6969 bdev_io->internal.desc = desc; 6970 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6971 bdev_io->u.nvme_passthru.cmd = *cmd; 6972 bdev_io->u.nvme_passthru.buf = buf; 6973 bdev_io->u.nvme_passthru.nbytes = nbytes; 6974 bdev_io->u.nvme_passthru.md_buf = md_buf; 6975 bdev_io->u.nvme_passthru.md_len = md_len; 6976 6977 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6978 6979 bdev_io_submit(bdev_io); 6980 return 0; 6981 } 6982 6983 int 6984 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 6985 struct spdk_io_channel *ch, 6986 const struct spdk_nvme_cmd *cmd, 6987 struct iovec *iov, int iovcnt, size_t nbytes, 6988 void *md_buf, size_t md_len, 6989 spdk_bdev_io_completion_cb cb, void *cb_arg) 6990 { 6991 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6992 struct spdk_bdev_io *bdev_io; 6993 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6994 6995 if (!desc->write) { 6996 /* 6997 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6998 * to easily determine if the command is a read or write, but for now just 6999 * do not allow io_passthru with a read-only descriptor. 
7000 */ 7001 return -EBADF; 7002 } 7003 7004 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 7005 return -ENOTSUP; 7006 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 7007 return -ENOTSUP; 7008 } 7009 7010 bdev_io = bdev_channel_get_io(channel); 7011 if (!bdev_io) { 7012 return -ENOMEM; 7013 } 7014 7015 bdev_io->internal.ch = channel; 7016 bdev_io->internal.desc = desc; 7017 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 7018 bdev_io->u.nvme_passthru.cmd = *cmd; 7019 bdev_io->u.nvme_passthru.iovs = iov; 7020 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 7021 bdev_io->u.nvme_passthru.nbytes = nbytes; 7022 bdev_io->u.nvme_passthru.md_buf = md_buf; 7023 bdev_io->u.nvme_passthru.md_len = md_len; 7024 7025 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7026 7027 bdev_io_submit(bdev_io); 7028 return 0; 7029 } 7030 7031 static void bdev_abort_retry(void *ctx); 7032 static void bdev_abort(struct spdk_bdev_io *parent_io); 7033 7034 static void 7035 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 7036 { 7037 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 7038 struct spdk_bdev_io *parent_io = cb_arg; 7039 struct spdk_bdev_io *bio_to_abort, *tmp_io; 7040 7041 bio_to_abort = bdev_io->u.abort.bio_to_abort; 7042 7043 spdk_bdev_free_io(bdev_io); 7044 7045 if (!success) { 7046 /* Check if the target I/O completed in the meantime. */ 7047 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 7048 if (tmp_io == bio_to_abort) { 7049 break; 7050 } 7051 } 7052 7053 /* If the target I/O still exists, set the parent to failed. */ 7054 if (tmp_io != NULL) { 7055 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7056 } 7057 } 7058 7059 assert(parent_io->internal.f.split); 7060 7061 parent_io->internal.split.outstanding--; 7062 if (parent_io->internal.split.outstanding == 0) { 7063 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7064 bdev_abort_retry(parent_io); 7065 } else { 7066 bdev_io_complete(parent_io); 7067 } 7068 } 7069 } 7070 7071 static int 7072 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 7073 struct spdk_bdev_io *bio_to_abort, 7074 spdk_bdev_io_completion_cb cb, void *cb_arg) 7075 { 7076 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7077 struct spdk_bdev_io *bdev_io; 7078 7079 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 7080 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 7081 /* TODO: Abort reset or abort request. */ 7082 return -ENOTSUP; 7083 } 7084 7085 bdev_io = bdev_channel_get_io(channel); 7086 if (bdev_io == NULL) { 7087 return -ENOMEM; 7088 } 7089 7090 bdev_io->internal.ch = channel; 7091 bdev_io->internal.desc = desc; 7092 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7093 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7094 7095 if (bio_to_abort->internal.f.split) { 7096 assert(bdev_io_should_split(bio_to_abort)); 7097 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 7098 7099 /* Parent abort request is not submitted directly, but to manage its 7100 * execution add it to the submitted list here. 7101 */ 7102 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7103 bdev_ch_add_to_io_submitted(bdev_io); 7104 7105 bdev_abort(bdev_io); 7106 7107 return 0; 7108 } 7109 7110 bdev_io->u.abort.bio_to_abort = bio_to_abort; 7111 7112 /* Submit the abort request to the underlying bdev module. 
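	 *
	 * External callers do not reach this helper directly; they use spdk_bdev_abort(),
	 * keyed by the caller_ctx of the I/O(s) to abort, e.g. (abort_done, io_ctx and
	 * my_ctx are hypothetical names):
	 *
	 *	rc = spdk_bdev_abort(desc, ch, io_ctx, abort_done, my_ctx);
	 *
	 * which fans out into one of these per-I/O abort requests for every submitted
	 * I/O whose caller_ctx matches io_ctx.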
*/ 7113 bdev_io_submit(bdev_io); 7114 7115 return 0; 7116 } 7117 7118 static bool 7119 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 7120 { 7121 struct spdk_bdev_io *iter; 7122 7123 TAILQ_FOREACH(iter, tailq, internal.link) { 7124 if (iter == bdev_io) { 7125 return true; 7126 } 7127 } 7128 7129 return false; 7130 } 7131 7132 static uint32_t 7133 _bdev_abort(struct spdk_bdev_io *parent_io) 7134 { 7135 struct spdk_bdev_desc *desc = parent_io->internal.desc; 7136 struct spdk_bdev_channel *channel = parent_io->internal.ch; 7137 void *bio_cb_arg; 7138 struct spdk_bdev_io *bio_to_abort; 7139 uint32_t matched_ios; 7140 int rc; 7141 7142 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 7143 7144 /* matched_ios is returned and will be kept by the caller. 7145 * 7146 * This function will be used for two cases, 1) the same cb_arg is used for 7147 * multiple I/Os, 2) a single large I/O is split into smaller ones. 7148 * Incrementing split_outstanding directly here may confuse readers especially 7149 * for the 1st case. 7150 * 7151 * Completion of I/O abort is processed after stack unwinding. Hence this trick 7152 * works as expected. 7153 */ 7154 matched_ios = 0; 7155 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7156 7157 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 7158 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 7159 continue; 7160 } 7161 7162 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 7163 /* Any I/O which was submitted after this abort command should be excluded. */ 7164 continue; 7165 } 7166 7167 /* We can't abort a request that's being pushed/pulled or executed by accel */ 7168 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 7169 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 7170 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7171 break; 7172 } 7173 7174 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 7175 if (rc != 0) { 7176 if (rc == -ENOMEM) { 7177 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 7178 } else { 7179 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7180 } 7181 break; 7182 } 7183 matched_ios++; 7184 } 7185 7186 return matched_ios; 7187 } 7188 7189 static void 7190 bdev_abort_retry(void *ctx) 7191 { 7192 struct spdk_bdev_io *parent_io = ctx; 7193 uint32_t matched_ios; 7194 7195 matched_ios = _bdev_abort(parent_io); 7196 7197 if (matched_ios == 0) { 7198 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7199 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7200 } else { 7201 /* For retry, the case that no target I/O was found is success 7202 * because it means target I/Os completed in the meantime. 7203 */ 7204 bdev_io_complete(parent_io); 7205 } 7206 return; 7207 } 7208 7209 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7210 parent_io->internal.f.split = true; 7211 parent_io->internal.split.outstanding = matched_ios; 7212 } 7213 7214 static void 7215 bdev_abort(struct spdk_bdev_io *parent_io) 7216 { 7217 uint32_t matched_ios; 7218 7219 matched_ios = _bdev_abort(parent_io); 7220 7221 if (matched_ios == 0) { 7222 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7223 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7224 } else { 7225 /* The case the no target I/O was found is failure. 
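		 *
		 * The -ENOMEM handling above relies on the same io_wait mechanism that is
		 * exposed publicly as spdk_bdev_queue_io_wait(); a caller that gets -ENOMEM
		 * from any submit API can retry along these lines (retry_submit and my_ctx
		 * are hypothetical, and the entry fields shown are the assumed public ones):
		 *
		 *	struct spdk_bdev_io_wait_entry entry;
		 *
		 *	entry.bdev = bdev;
		 *	entry.cb_fn = retry_submit;	// re-issues the original request
		 *	entry.cb_arg = my_ctx;
		 *	spdk_bdev_queue_io_wait(bdev, ch, &entry);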
*/ 7226 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7227 bdev_io_complete(parent_io); 7228 } 7229 return; 7230 } 7231 7232 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7233 parent_io->internal.f.split = true; 7234 parent_io->internal.split.outstanding = matched_ios; 7235 } 7236 7237 int 7238 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7239 void *bio_cb_arg, 7240 spdk_bdev_io_completion_cb cb, void *cb_arg) 7241 { 7242 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7243 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7244 struct spdk_bdev_io *bdev_io; 7245 7246 if (bio_cb_arg == NULL) { 7247 return -EINVAL; 7248 } 7249 7250 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7251 return -ENOTSUP; 7252 } 7253 7254 bdev_io = bdev_channel_get_io(channel); 7255 if (bdev_io == NULL) { 7256 return -ENOMEM; 7257 } 7258 7259 bdev_io->internal.ch = channel; 7260 bdev_io->internal.desc = desc; 7261 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7262 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7263 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7264 7265 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7266 7267 /* Parent abort request is not submitted directly, but to manage its execution, 7268 * add it to the submitted list here. 7269 */ 7270 bdev_ch_add_to_io_submitted(bdev_io); 7271 7272 bdev_abort(bdev_io); 7273 7274 return 0; 7275 } 7276 7277 int 7278 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7279 struct spdk_bdev_io_wait_entry *entry) 7280 { 7281 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7282 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7283 7284 if (bdev != entry->bdev) { 7285 SPDK_ERRLOG("bdevs do not match\n"); 7286 return -EINVAL; 7287 } 7288 7289 if (mgmt_ch->per_thread_cache_count > 0) { 7290 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7291 return -EINVAL; 7292 } 7293 7294 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7295 return 0; 7296 } 7297 7298 static inline void 7299 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7300 { 7301 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7302 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7303 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7304 uint32_t blocklen = bdev_io->bdev->blocklen; 7305 7306 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7307 switch (bdev_io->type) { 7308 case SPDK_BDEV_IO_TYPE_READ: 7309 io_stat->bytes_read += num_blocks * blocklen; 7310 io_stat->num_read_ops++; 7311 io_stat->read_latency_ticks += tsc_diff; 7312 if (io_stat->max_read_latency_ticks < tsc_diff) { 7313 io_stat->max_read_latency_ticks = tsc_diff; 7314 } 7315 if (io_stat->min_read_latency_ticks > tsc_diff) { 7316 io_stat->min_read_latency_ticks = tsc_diff; 7317 } 7318 break; 7319 case SPDK_BDEV_IO_TYPE_WRITE: 7320 io_stat->bytes_written += num_blocks * blocklen; 7321 io_stat->num_write_ops++; 7322 io_stat->write_latency_ticks += tsc_diff; 7323 if (io_stat->max_write_latency_ticks < tsc_diff) { 7324 io_stat->max_write_latency_ticks = tsc_diff; 7325 } 7326 if (io_stat->min_write_latency_ticks > tsc_diff) { 7327 io_stat->min_write_latency_ticks = tsc_diff; 7328 } 7329 break; 7330 case SPDK_BDEV_IO_TYPE_UNMAP: 7331 io_stat->bytes_unmapped += num_blocks * blocklen; 7332 io_stat->num_unmap_ops++; 7333 io_stat->unmap_latency_ticks += tsc_diff; 7334 if 
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7335 io_stat->max_unmap_latency_ticks = tsc_diff; 7336 } 7337 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7338 io_stat->min_unmap_latency_ticks = tsc_diff; 7339 } 7340 break; 7341 case SPDK_BDEV_IO_TYPE_ZCOPY: 7342 /* Track the data in the start phase only */ 7343 if (bdev_io->u.bdev.zcopy.start) { 7344 if (bdev_io->u.bdev.zcopy.populate) { 7345 io_stat->bytes_read += num_blocks * blocklen; 7346 io_stat->num_read_ops++; 7347 io_stat->read_latency_ticks += tsc_diff; 7348 if (io_stat->max_read_latency_ticks < tsc_diff) { 7349 io_stat->max_read_latency_ticks = tsc_diff; 7350 } 7351 if (io_stat->min_read_latency_ticks > tsc_diff) { 7352 io_stat->min_read_latency_ticks = tsc_diff; 7353 } 7354 } else { 7355 io_stat->bytes_written += num_blocks * blocklen; 7356 io_stat->num_write_ops++; 7357 io_stat->write_latency_ticks += tsc_diff; 7358 if (io_stat->max_write_latency_ticks < tsc_diff) { 7359 io_stat->max_write_latency_ticks = tsc_diff; 7360 } 7361 if (io_stat->min_write_latency_ticks > tsc_diff) { 7362 io_stat->min_write_latency_ticks = tsc_diff; 7363 } 7364 } 7365 } 7366 break; 7367 case SPDK_BDEV_IO_TYPE_COPY: 7368 io_stat->bytes_copied += num_blocks * blocklen; 7369 io_stat->num_copy_ops++; 7370 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7371 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7372 io_stat->max_copy_latency_ticks = tsc_diff; 7373 } 7374 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7375 io_stat->min_copy_latency_ticks = tsc_diff; 7376 } 7377 break; 7378 default: 7379 break; 7380 } 7381 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7382 io_stat = bdev_io->bdev->internal.stat; 7383 assert(io_stat->io_error != NULL); 7384 7385 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7386 io_stat->io_error->error_status[-io_status - 1]++; 7387 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7388 } 7389 7390 #ifdef SPDK_CONFIG_VTUNE 7391 uint64_t now_tsc = spdk_get_ticks(); 7392 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7393 uint64_t data[5]; 7394 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7395 7396 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7397 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7398 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7399 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7400 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
7401 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7402 7403 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7404 __itt_metadata_u64, 5, data); 7405 7406 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7407 bdev_io->internal.ch->start_tsc = now_tsc; 7408 } 7409 #endif 7410 } 7411 7412 static inline void 7413 _bdev_io_complete(void *ctx) 7414 { 7415 struct spdk_bdev_io *bdev_io = ctx; 7416 7417 if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) { 7418 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7419 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7420 } 7421 7422 assert(bdev_io->internal.cb != NULL); 7423 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7424 7425 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7426 bdev_io->internal.caller_ctx); 7427 } 7428 7429 static inline void 7430 bdev_io_complete(void *ctx) 7431 { 7432 struct spdk_bdev_io *bdev_io = ctx; 7433 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7434 uint64_t tsc, tsc_diff; 7435 7436 if (spdk_unlikely(bdev_io->internal.f.in_submit_request)) { 7437 /* 7438 * Defer completion to avoid potential infinite recursion if the 7439 * user's completion callback issues a new I/O. 7440 */ 7441 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7442 bdev_io_complete, bdev_io); 7443 return; 7444 } 7445 7446 tsc = spdk_get_ticks(); 7447 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7448 7449 bdev_ch_remove_from_io_submitted(bdev_io); 7450 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io, 7451 bdev_io->internal.caller_ctx, bdev_ch->queue_depth); 7452 7453 if (bdev_ch->histogram) { 7454 if (bdev_io->bdev->internal.histogram_io_type == 0 || 7455 bdev_io->bdev->internal.histogram_io_type == bdev_io->type) { 7456 /* 7457 * Tally all I/O types if the histogram_io_type is set to 0. 7458 */ 7459 spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff); 7460 } 7461 } 7462 7463 bdev_io_update_io_stat(bdev_io, tsc_diff); 7464 _bdev_io_complete(bdev_io); 7465 } 7466 7467 /* The difference between this function and bdev_io_complete() is that this should be called to 7468 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7469 * io_submitted list and don't have submit_tsc updated. 7470 */ 7471 static inline void 7472 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7473 { 7474 /* Since the IO hasn't been submitted it's bound to be failed */ 7475 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7476 7477 /* At this point we don't know if the IO is completed from submission context or not, but, 7478 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7479 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7480 _bdev_io_complete, bdev_io); 7481 } 7482 7483 static void bdev_destroy_cb(void *io_device); 7484 7485 static inline void 7486 _bdev_reset_complete(void *ctx) 7487 { 7488 struct spdk_bdev_io *bdev_io = ctx; 7489 7490 /* Put the channel reference we got in submission. 
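The reference keeps the channel alive while the reset is outstanding; it is dropped exactly once here, and the pointer is cleared to guard against a double put.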
*/ 7491 assert(bdev_io->u.reset.ch_ref != NULL); 7492 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7493 bdev_io->u.reset.ch_ref = NULL; 7494 7495 bdev_io_complete(bdev_io); 7496 } 7497 7498 static void 7499 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7500 { 7501 struct spdk_bdev_io *bdev_io = _ctx; 7502 bdev_io_tailq_t queued_resets; 7503 struct spdk_bdev_io *queued_reset; 7504 7505 assert(bdev_io == bdev->internal.reset_in_progress); 7506 7507 TAILQ_INIT(&queued_resets); 7508 7509 spdk_spin_lock(&bdev->internal.spinlock); 7510 TAILQ_SWAP(&bdev->internal.queued_resets, &queued_resets, 7511 spdk_bdev_io, internal.link); 7512 bdev->internal.reset_in_progress = NULL; 7513 spdk_spin_unlock(&bdev->internal.spinlock); 7514 7515 while (!TAILQ_EMPTY(&queued_resets)) { 7516 queued_reset = TAILQ_FIRST(&queued_resets); 7517 TAILQ_REMOVE(&queued_resets, queued_reset, internal.link); 7518 queued_reset->internal.status = bdev_io->internal.status; 7519 spdk_thread_send_msg(spdk_bdev_io_get_thread(queued_reset), 7520 _bdev_reset_complete, queued_reset); 7521 } 7522 7523 _bdev_reset_complete(bdev_io); 7524 7525 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7526 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7527 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7528 } 7529 } 7530 7531 static void 7532 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7533 struct spdk_io_channel *_ch, void *_ctx) 7534 { 7535 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7536 7537 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7538 7539 spdk_bdev_for_each_channel_continue(i, 0); 7540 } 7541 7542 static void 7543 bdev_io_complete_sequence_cb(void *ctx, int status) 7544 { 7545 struct spdk_bdev_io *bdev_io = ctx; 7546 7547 /* u.bdev.accel_sequence should have already been cleared at this point */ 7548 assert(bdev_io->u.bdev.accel_sequence == NULL); 7549 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7550 bdev_io->internal.f.has_accel_sequence = false; 7551 7552 if (spdk_unlikely(status != 0)) { 7553 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7554 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7555 } 7556 7557 bdev_io_complete(bdev_io); 7558 } 7559 7560 void 7561 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7562 { 7563 struct spdk_bdev *bdev = bdev_io->bdev; 7564 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7565 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7566 7567 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7568 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7569 spdk_bdev_get_module_name(bdev), 7570 bdev_io_status_get_string(bdev_io->internal.status)); 7571 assert(false); 7572 } 7573 bdev_io->internal.status = status; 7574 7575 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7576 assert(bdev_io == bdev->internal.reset_in_progress); 7577 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7578 bdev_reset_complete); 7579 return; 7580 } else { 7581 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7582 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7583 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7584 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7585 return; 7586 } else if (spdk_unlikely(bdev_io->internal.f.has_bounce_buf && 7587 
!bdev_io_use_accel_sequence(bdev_io))) { 7588 _bdev_io_push_bounce_data_buffer(bdev_io, 7589 _bdev_io_complete_push_bounce_done); 7590 /* bdev IO will be completed in the callback */ 7591 return; 7592 } 7593 } 7594 7595 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7596 return; 7597 } 7598 } 7599 7600 bdev_io_complete(bdev_io); 7601 } 7602 7603 void 7604 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7605 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7606 { 7607 enum spdk_bdev_io_status status; 7608 7609 if (sc == SPDK_SCSI_STATUS_GOOD) { 7610 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7611 } else { 7612 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7613 bdev_io->internal.error.scsi.sc = sc; 7614 bdev_io->internal.error.scsi.sk = sk; 7615 bdev_io->internal.error.scsi.asc = asc; 7616 bdev_io->internal.error.scsi.ascq = ascq; 7617 } 7618 7619 spdk_bdev_io_complete(bdev_io, status); 7620 } 7621 7622 void 7623 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7624 int *sc, int *sk, int *asc, int *ascq) 7625 { 7626 assert(sc != NULL); 7627 assert(sk != NULL); 7628 assert(asc != NULL); 7629 assert(ascq != NULL); 7630 7631 switch (bdev_io->internal.status) { 7632 case SPDK_BDEV_IO_STATUS_SUCCESS: 7633 *sc = SPDK_SCSI_STATUS_GOOD; 7634 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7635 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7636 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7637 break; 7638 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7639 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7640 break; 7641 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7642 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7643 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7644 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7645 *ascq = bdev_io->internal.error.scsi.ascq; 7646 break; 7647 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7648 *sc = bdev_io->internal.error.scsi.sc; 7649 *sk = bdev_io->internal.error.scsi.sk; 7650 *asc = bdev_io->internal.error.scsi.asc; 7651 *ascq = bdev_io->internal.error.scsi.ascq; 7652 break; 7653 default: 7654 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7655 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7656 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7657 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7658 break; 7659 } 7660 } 7661 7662 void 7663 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7664 { 7665 enum spdk_bdev_io_status status; 7666 7667 if (aio_result == 0) { 7668 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7669 } else { 7670 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7671 } 7672 7673 bdev_io->internal.error.aio_result = aio_result; 7674 7675 spdk_bdev_io_complete(bdev_io, status); 7676 } 7677 7678 void 7679 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7680 { 7681 assert(aio_result != NULL); 7682 7683 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7684 *aio_result = bdev_io->internal.error.aio_result; 7685 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7686 *aio_result = 0; 7687 } else { 7688 *aio_result = -EIO; 7689 } 7690 } 7691 7692 void 7693 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7694 { 7695 enum spdk_bdev_io_status status; 7696 7697 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7698 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7699 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7700 status = SPDK_BDEV_IO_STATUS_ABORTED; 7701 
} else { 7702 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7703 } 7704 7705 bdev_io->internal.error.nvme.cdw0 = cdw0; 7706 bdev_io->internal.error.nvme.sct = sct; 7707 bdev_io->internal.error.nvme.sc = sc; 7708 7709 spdk_bdev_io_complete(bdev_io, status); 7710 } 7711 7712 void 7713 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7714 { 7715 assert(sct != NULL); 7716 assert(sc != NULL); 7717 assert(cdw0 != NULL); 7718 7719 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7720 *sct = SPDK_NVME_SCT_GENERIC; 7721 *sc = SPDK_NVME_SC_SUCCESS; 7722 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7723 *cdw0 = 0; 7724 } else { 7725 *cdw0 = 1U; 7726 } 7727 return; 7728 } 7729 7730 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7731 *sct = SPDK_NVME_SCT_GENERIC; 7732 *sc = SPDK_NVME_SC_SUCCESS; 7733 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7734 *sct = bdev_io->internal.error.nvme.sct; 7735 *sc = bdev_io->internal.error.nvme.sc; 7736 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7737 *sct = SPDK_NVME_SCT_GENERIC; 7738 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7739 } else { 7740 *sct = SPDK_NVME_SCT_GENERIC; 7741 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7742 } 7743 7744 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7745 } 7746 7747 void 7748 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7749 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7750 { 7751 assert(first_sct != NULL); 7752 assert(first_sc != NULL); 7753 assert(second_sct != NULL); 7754 assert(second_sc != NULL); 7755 assert(cdw0 != NULL); 7756 7757 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7758 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7759 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7760 *first_sct = bdev_io->internal.error.nvme.sct; 7761 *first_sc = bdev_io->internal.error.nvme.sc; 7762 *second_sct = SPDK_NVME_SCT_GENERIC; 7763 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7764 } else { 7765 *first_sct = SPDK_NVME_SCT_GENERIC; 7766 *first_sc = SPDK_NVME_SC_SUCCESS; 7767 *second_sct = bdev_io->internal.error.nvme.sct; 7768 *second_sc = bdev_io->internal.error.nvme.sc; 7769 } 7770 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7771 *first_sct = SPDK_NVME_SCT_GENERIC; 7772 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7773 *second_sct = SPDK_NVME_SCT_GENERIC; 7774 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7775 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7776 *first_sct = SPDK_NVME_SCT_GENERIC; 7777 *first_sc = SPDK_NVME_SC_SUCCESS; 7778 *second_sct = SPDK_NVME_SCT_GENERIC; 7779 *second_sc = SPDK_NVME_SC_SUCCESS; 7780 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7781 *first_sct = SPDK_NVME_SCT_GENERIC; 7782 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7783 *second_sct = SPDK_NVME_SCT_GENERIC; 7784 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7785 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7786 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7787 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7788 *second_sct = SPDK_NVME_SCT_GENERIC; 7789 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7790 } else { 7791 *first_sct = SPDK_NVME_SCT_GENERIC; 7792 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7793 *second_sct = SPDK_NVME_SCT_GENERIC; 7794 *second_sc = 
SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7795 } 7796 7797 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7798 } 7799 7800 void 7801 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 7802 const struct spdk_bdev_io *base_io) 7803 { 7804 switch (base_io->internal.status) { 7805 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7806 spdk_bdev_io_complete_nvme_status(bdev_io, 7807 base_io->internal.error.nvme.cdw0, 7808 base_io->internal.error.nvme.sct, 7809 base_io->internal.error.nvme.sc); 7810 break; 7811 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7812 spdk_bdev_io_complete_scsi_status(bdev_io, 7813 base_io->internal.error.scsi.sc, 7814 base_io->internal.error.scsi.sk, 7815 base_io->internal.error.scsi.asc, 7816 base_io->internal.error.scsi.ascq); 7817 break; 7818 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 7819 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 7820 break; 7821 default: 7822 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 7823 break; 7824 } 7825 } 7826 7827 struct spdk_thread * 7828 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7829 { 7830 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7831 } 7832 7833 struct spdk_io_channel * 7834 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7835 { 7836 return bdev_io->internal.ch->channel; 7837 } 7838 7839 static int 7840 bdev_register(struct spdk_bdev *bdev) 7841 { 7842 char *bdev_name; 7843 char uuid[SPDK_UUID_STRING_LEN]; 7844 struct spdk_iobuf_opts iobuf_opts; 7845 int ret; 7846 7847 assert(bdev->module != NULL); 7848 7849 if (!bdev->name) { 7850 SPDK_ERRLOG("Bdev name is NULL\n"); 7851 return -EINVAL; 7852 } 7853 7854 if (!strlen(bdev->name)) { 7855 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7856 return -EINVAL; 7857 } 7858 7859 /* Users often register their own I/O devices using the bdev name. In 7860 * order to avoid conflicts, prepend bdev_. */ 7861 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7862 if (!bdev_name) { 7863 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7864 return -ENOMEM; 7865 } 7866 7867 bdev->internal.stat = bdev_alloc_io_stat(true); 7868 if (!bdev->internal.stat) { 7869 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7870 free(bdev_name); 7871 return -ENOMEM; 7872 } 7873 7874 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7875 bdev->internal.measured_queue_depth = UINT64_MAX; 7876 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7877 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7878 bdev->internal.qd_poller = NULL; 7879 bdev->internal.qos = NULL; 7880 7881 TAILQ_INIT(&bdev->internal.open_descs); 7882 TAILQ_INIT(&bdev->internal.locked_ranges); 7883 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7884 TAILQ_INIT(&bdev->internal.queued_resets); 7885 TAILQ_INIT(&bdev->aliases); 7886 7887 /* UUID may be specified by the user or defined by bdev itself. 7888 * Otherwise it will be generated here, so this field will never be empty. 
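 * The UUID is also registered as an alias just below, unless it is identical to the bdev name.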
*/ 7889 if (spdk_uuid_is_null(&bdev->uuid)) { 7890 spdk_uuid_generate(&bdev->uuid); 7891 } 7892 7893 /* Add the UUID alias only if it's different than the name */ 7894 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7895 if (strcmp(bdev->name, uuid) != 0) { 7896 ret = spdk_bdev_alias_add(bdev, uuid); 7897 if (ret != 0) { 7898 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7899 bdev_free_io_stat(bdev->internal.stat); 7900 free(bdev_name); 7901 return ret; 7902 } 7903 } 7904 7905 spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts)); 7906 if (spdk_bdev_get_buf_align(bdev) > 1) { 7907 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 7908 iobuf_opts.large_bufsize / bdev->blocklen); 7909 } 7910 7911 /* If the user didn't specify a write unit size, set it to one. */ 7912 if (bdev->write_unit_size == 0) { 7913 bdev->write_unit_size = 1; 7914 } 7915 7916 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7917 if (bdev->acwu == 0) { 7918 bdev->acwu = bdev->write_unit_size; 7919 } 7920 7921 if (bdev->phys_blocklen == 0) { 7922 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7923 } 7924 7925 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7926 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7927 } 7928 7929 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7930 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7931 } 7932 7933 bdev->internal.reset_in_progress = NULL; 7934 bdev->internal.qd_poll_in_progress = false; 7935 bdev->internal.period = 0; 7936 bdev->internal.new_period = 0; 7937 bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name); 7938 7939 /* 7940 * Initialize spinlock before registering IO device because spinlock is used in 7941 * bdev_channel_create 7942 */ 7943 spdk_spin_init(&bdev->internal.spinlock); 7944 7945 spdk_io_device_register(__bdev_to_io_dev(bdev), 7946 bdev_channel_create, bdev_channel_destroy, 7947 sizeof(struct spdk_bdev_channel), 7948 bdev_name); 7949 7950 /* 7951 * Register bdev name only after the bdev object is ready. 7952 * After bdev_name_add returns, it is possible for other threads to start using the bdev, 7953 * create IO channels... 
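 * or open descriptors, so the failure path below must fully unwind the registration (io_device, io stat, spinlock).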
7954 */ 7955 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7956 if (ret != 0) { 7957 spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL); 7958 bdev_free_io_stat(bdev->internal.stat); 7959 spdk_spin_destroy(&bdev->internal.spinlock); 7960 free(bdev_name); 7961 return ret; 7962 } 7963 7964 free(bdev_name); 7965 7966 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7967 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7968 7969 return 0; 7970 } 7971 7972 static void 7973 bdev_destroy_cb(void *io_device) 7974 { 7975 int rc; 7976 struct spdk_bdev *bdev; 7977 spdk_bdev_unregister_cb cb_fn; 7978 void *cb_arg; 7979 7980 bdev = __bdev_from_io_dev(io_device); 7981 7982 if (bdev->internal.unregister_td != spdk_get_thread()) { 7983 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7984 return; 7985 } 7986 7987 cb_fn = bdev->internal.unregister_cb; 7988 cb_arg = bdev->internal.unregister_ctx; 7989 7990 spdk_spin_destroy(&bdev->internal.spinlock); 7991 free(bdev->internal.qos); 7992 bdev_free_io_stat(bdev->internal.stat); 7993 spdk_trace_unregister_owner(bdev->internal.trace_id); 7994 7995 rc = bdev->fn_table->destruct(bdev->ctxt); 7996 if (rc < 0) { 7997 SPDK_ERRLOG("destruct failed\n"); 7998 } 7999 if (rc <= 0 && cb_fn != NULL) { 8000 cb_fn(cb_arg, rc); 8001 } 8002 } 8003 8004 void 8005 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 8006 { 8007 if (bdev->internal.unregister_cb != NULL) { 8008 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 8009 } 8010 } 8011 8012 static void 8013 _remove_notify(void *arg) 8014 { 8015 struct spdk_bdev_desc *desc = arg; 8016 8017 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 8018 } 8019 8020 /* returns: 0 - bdev removed and ready to be destructed. 8021 * -EBUSY - bdev can't be destructed yet. */ 8022 static int 8023 bdev_unregister_unsafe(struct spdk_bdev *bdev) 8024 { 8025 struct spdk_bdev_desc *desc, *tmp; 8026 struct spdk_bdev_alias *alias; 8027 int rc = 0; 8028 char uuid[SPDK_UUID_STRING_LEN]; 8029 8030 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 8031 assert(spdk_spin_held(&bdev->internal.spinlock)); 8032 8033 /* Notify each descriptor about hotremoval */ 8034 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 8035 rc = -EBUSY; 8036 /* 8037 * Defer invocation of the event_cb to a separate message that will 8038 * run later on its thread. This ensures this context unwinds and 8039 * we don't recursively unregister this bdev again if the event_cb 8040 * immediately closes its descriptor. 8041 */ 8042 event_notify(desc, _remove_notify); 8043 } 8044 8045 /* If there are no descriptors, proceed removing the bdev */ 8046 if (rc == 0) { 8047 bdev_examine_allowlist_remove(bdev->name); 8048 TAILQ_FOREACH(alias, &bdev->aliases, tailq) { 8049 bdev_examine_allowlist_remove(alias->alias.name); 8050 } 8051 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 8052 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 8053 8054 /* Delete the name and the UUID alias */ 8055 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 8056 bdev_name_del_unsafe(&bdev->internal.bdev_name); 8057 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 8058 8059 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 8060 8061 if (bdev->internal.reset_in_progress != NULL) { 8062 /* If reset is in progress, let the completion callback for reset 8063 * unregister the bdev. 
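 * bdev_reset_complete() performs the deferred spdk_io_device_unregister() once no descriptors remain open.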
8064 */ 8065 rc = -EBUSY; 8066 } 8067 } 8068 8069 return rc; 8070 } 8071 8072 static void 8073 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8074 struct spdk_io_channel *io_ch, void *_ctx) 8075 { 8076 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8077 8078 bdev_channel_abort_queued_ios(bdev_ch); 8079 spdk_bdev_for_each_channel_continue(i, 0); 8080 } 8081 8082 static void 8083 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 8084 { 8085 int rc; 8086 8087 spdk_spin_lock(&g_bdev_mgr.spinlock); 8088 spdk_spin_lock(&bdev->internal.spinlock); 8089 /* 8090 * Set the status to REMOVING after completing to abort channels. Otherwise, 8091 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 8092 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 8093 * may fail. 8094 */ 8095 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 8096 rc = bdev_unregister_unsafe(bdev); 8097 spdk_spin_unlock(&bdev->internal.spinlock); 8098 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8099 8100 if (rc == 0) { 8101 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8102 } 8103 } 8104 8105 void 8106 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8107 { 8108 struct spdk_thread *thread; 8109 8110 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 8111 8112 thread = spdk_get_thread(); 8113 if (!thread) { 8114 /* The user called this from a non-SPDK thread. */ 8115 if (cb_fn != NULL) { 8116 cb_fn(cb_arg, -ENOTSUP); 8117 } 8118 return; 8119 } 8120 8121 spdk_spin_lock(&g_bdev_mgr.spinlock); 8122 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8123 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8124 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8125 if (cb_fn) { 8126 cb_fn(cb_arg, -EBUSY); 8127 } 8128 return; 8129 } 8130 8131 spdk_spin_lock(&bdev->internal.spinlock); 8132 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 8133 bdev->internal.unregister_cb = cb_fn; 8134 bdev->internal.unregister_ctx = cb_arg; 8135 bdev->internal.unregister_td = thread; 8136 spdk_spin_unlock(&bdev->internal.spinlock); 8137 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8138 8139 spdk_bdev_set_qd_sampling_period(bdev, 0); 8140 8141 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 8142 bdev_unregister); 8143 } 8144 8145 int 8146 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 8147 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8148 { 8149 struct spdk_bdev_desc *desc; 8150 struct spdk_bdev *bdev; 8151 int rc; 8152 8153 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 8154 if (rc != 0) { 8155 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 8156 return rc; 8157 } 8158 8159 bdev = spdk_bdev_desc_get_bdev(desc); 8160 8161 if (bdev->module != module) { 8162 spdk_bdev_close(desc); 8163 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 8164 bdev_name); 8165 return -ENODEV; 8166 } 8167 8168 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 8169 8170 spdk_bdev_close(desc); 8171 8172 return 0; 8173 } 8174 8175 static int 8176 bdev_start_qos(struct spdk_bdev *bdev) 8177 { 8178 struct set_qos_limit_ctx *ctx; 8179 8180 /* Enable QoS */ 8181 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 8182 ctx = calloc(1, sizeof(*ctx)); 8183 if (ctx == NULL) { 8184 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 8185 return -ENOMEM; 8186 } 8187 
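/* bdev_enable_qos_msg() enables QoS on each channel; the ctx is presumably released in
 * bdev_enable_qos_done() once the channel iteration completes.
 */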
ctx->bdev = bdev; 8188 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 8189 } 8190 8191 return 0; 8192 } 8193 8194 static void 8195 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 8196 struct spdk_bdev *bdev) 8197 { 8198 enum spdk_bdev_claim_type type; 8199 const char *typename, *modname; 8200 extern struct spdk_log_flag SPDK_LOG_bdev; 8201 8202 assert(spdk_spin_held(&bdev->internal.spinlock)); 8203 8204 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 8205 return; 8206 } 8207 8208 type = bdev->internal.claim_type; 8209 typename = spdk_bdev_claim_get_name(type); 8210 8211 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 8212 modname = bdev->internal.claim.v1.module->name; 8213 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8214 bdev->name, detail, typename, modname); 8215 return; 8216 } 8217 8218 if (claim_type_is_v2(type)) { 8219 struct spdk_bdev_module_claim *claim; 8220 8221 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 8222 modname = claim->module->name; 8223 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8224 bdev->name, detail, typename, modname); 8225 } 8226 return; 8227 } 8228 8229 assert(false); 8230 } 8231 8232 static int 8233 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 8234 { 8235 struct spdk_thread *thread; 8236 int rc = 0; 8237 8238 thread = spdk_get_thread(); 8239 if (!thread) { 8240 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 8241 return -ENOTSUP; 8242 } 8243 8244 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8245 spdk_get_thread()); 8246 8247 desc->bdev = bdev; 8248 desc->thread = thread; 8249 desc->write = write; 8250 8251 spdk_spin_lock(&bdev->internal.spinlock); 8252 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8253 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8254 spdk_spin_unlock(&bdev->internal.spinlock); 8255 return -ENODEV; 8256 } 8257 8258 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8259 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8260 spdk_spin_unlock(&bdev->internal.spinlock); 8261 return -EPERM; 8262 } 8263 8264 rc = bdev_start_qos(bdev); 8265 if (rc != 0) { 8266 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 8267 spdk_spin_unlock(&bdev->internal.spinlock); 8268 return rc; 8269 } 8270 8271 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8272 8273 spdk_spin_unlock(&bdev->internal.spinlock); 8274 8275 return 0; 8276 } 8277 8278 static void 8279 bdev_open_opts_get_defaults(struct spdk_bdev_open_opts *opts, size_t opts_size) 8280 { 8281 if (!opts) { 8282 SPDK_ERRLOG("opts should not be NULL.\n"); 8283 return; 8284 } 8285 8286 if (!opts_size) { 8287 SPDK_ERRLOG("opts_size should not be zero.\n"); 8288 return; 8289 } 8290 8291 memset(opts, 0, opts_size); 8292 opts->size = opts_size; 8293 8294 #define FIELD_OK(field) \ 8295 offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size 8296 8297 #define SET_FIELD(field, value) \ 8298 if (FIELD_OK(field)) { \ 8299 opts->field = value; \ 8300 } \ 8301 8302 SET_FIELD(hide_metadata, false); 8303 8304 #undef FIELD_OK 8305 #undef SET_FIELD 8306 } 8307 8308 static void 8309 bdev_open_opts_copy(struct spdk_bdev_open_opts *opts, 8310 const struct spdk_bdev_open_opts *opts_src, size_t opts_size) 8311 { 8312 assert(opts); 8313 assert(opts_src); 8314 8315 #define SET_FIELD(field) \ 8316 if 
(offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size) { \ 8317 opts->field = opts_src->field; \ 8318 } \ 8319 8320 SET_FIELD(hide_metadata); 8321 8322 opts->size = opts_src->size; 8323 8324 /* We should not remove this statement, but need to update the assert statement 8325 * if we add a new field, and also add a corresponding SET_FIELD statement. 8326 */ 8327 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_opts) == 16, "Incorrect size"); 8328 8329 #undef SET_FIELD 8330 } 8331 8332 void 8333 spdk_bdev_open_opts_init(struct spdk_bdev_open_opts *opts, size_t opts_size) 8334 { 8335 struct spdk_bdev_open_opts opts_local; 8336 8337 bdev_open_opts_get_defaults(&opts_local, sizeof(opts_local)); 8338 bdev_open_opts_copy(opts, &opts_local, opts_size); 8339 } 8340 8341 static int 8342 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8343 struct spdk_bdev_open_opts *user_opts, struct spdk_bdev_desc **_desc) 8344 { 8345 struct spdk_bdev_desc *desc; 8346 struct spdk_bdev_open_opts opts; 8347 unsigned int i; 8348 8349 bdev_open_opts_get_defaults(&opts, sizeof(opts)); 8350 if (user_opts != NULL) { 8351 bdev_open_opts_copy(&opts, user_opts, user_opts->size); 8352 } 8353 8354 desc = calloc(1, sizeof(*desc)); 8355 if (desc == NULL) { 8356 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8357 return -ENOMEM; 8358 } 8359 8360 desc->opts = opts; 8361 8362 TAILQ_INIT(&desc->pending_media_events); 8363 TAILQ_INIT(&desc->free_media_events); 8364 8365 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8366 desc->callback.event_fn = event_cb; 8367 desc->callback.ctx = event_ctx; 8368 spdk_spin_init(&desc->spinlock); 8369 8370 if (desc->opts.hide_metadata) { 8371 if (spdk_bdev_is_md_separate(bdev)) { 8372 SPDK_ERRLOG("hide_metadata option is not supported with separate metadata.\n"); 8373 bdev_desc_free(desc); 8374 return -EINVAL; 8375 } 8376 } 8377 8378 if (bdev->media_events) { 8379 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8380 sizeof(*desc->media_events_buffer)); 8381 if (desc->media_events_buffer == NULL) { 8382 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8383 bdev_desc_free(desc); 8384 return -ENOMEM; 8385 } 8386 8387 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8388 TAILQ_INSERT_TAIL(&desc->free_media_events, 8389 &desc->media_events_buffer[i], tailq); 8390 } 8391 } 8392 8393 if (bdev->fn_table->accel_sequence_supported != NULL) { 8394 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8395 desc->accel_sequence_supported[i] = 8396 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8397 (enum spdk_bdev_io_type)i); 8398 } 8399 } 8400 8401 *_desc = desc; 8402 8403 return 0; 8404 } 8405 8406 static int 8407 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8408 void *event_ctx, struct spdk_bdev_open_opts *opts, 8409 struct spdk_bdev_desc **_desc) 8410 { 8411 struct spdk_bdev_desc *desc; 8412 struct spdk_bdev *bdev; 8413 int rc; 8414 8415 bdev = bdev_get_by_name(bdev_name); 8416 8417 if (bdev == NULL) { 8418 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8419 return -ENODEV; 8420 } 8421 8422 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, opts, &desc); 8423 if (rc != 0) { 8424 return rc; 8425 } 8426 8427 rc = bdev_open(bdev, write, desc); 8428 if (rc != 0) { 8429 bdev_desc_free(desc); 8430 desc = NULL; 8431 } 8432 8433 *_desc = desc; 8434 8435 return rc; 8436 } 8437 8438 int 8439 spdk_bdev_open_ext_v2(const char 
*bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8440 void *event_ctx, struct spdk_bdev_open_opts *opts, 8441 struct spdk_bdev_desc **_desc) 8442 { 8443 int rc; 8444 8445 if (event_cb == NULL) { 8446 SPDK_ERRLOG("Missing event callback function\n"); 8447 return -EINVAL; 8448 } 8449 8450 spdk_spin_lock(&g_bdev_mgr.spinlock); 8451 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, opts, _desc); 8452 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8453 8454 return rc; 8455 } 8456 8457 int 8458 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8459 void *event_ctx, struct spdk_bdev_desc **_desc) 8460 { 8461 return spdk_bdev_open_ext_v2(bdev_name, write, event_cb, event_ctx, NULL, _desc); 8462 } 8463 8464 struct spdk_bdev_open_async_ctx { 8465 char *bdev_name; 8466 spdk_bdev_event_cb_t event_cb; 8467 void *event_ctx; 8468 bool write; 8469 int rc; 8470 spdk_bdev_open_async_cb_t cb_fn; 8471 void *cb_arg; 8472 struct spdk_bdev_desc *desc; 8473 struct spdk_bdev_open_async_opts opts; 8474 uint64_t start_ticks; 8475 struct spdk_thread *orig_thread; 8476 struct spdk_poller *poller; 8477 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8478 }; 8479 8480 static void 8481 bdev_open_async_done(void *arg) 8482 { 8483 struct spdk_bdev_open_async_ctx *ctx = arg; 8484 8485 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8486 8487 free(ctx->bdev_name); 8488 free(ctx); 8489 } 8490 8491 static void 8492 bdev_open_async_cancel(void *arg) 8493 { 8494 struct spdk_bdev_open_async_ctx *ctx = arg; 8495 8496 assert(ctx->rc == -ESHUTDOWN); 8497 8498 spdk_poller_unregister(&ctx->poller); 8499 8500 bdev_open_async_done(ctx); 8501 } 8502 8503 /* This is called when the bdev library finishes at shutdown. */ 8504 static void 8505 bdev_open_async_fini(void) 8506 { 8507 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8508 8509 spdk_spin_lock(&g_bdev_mgr.spinlock); 8510 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8511 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8512 /* 8513 * We have to move to ctx->orig_thread to unregister ctx->poller. 8514 * However, there is a chance that ctx->poller is executed before 8515 * message is executed, which could result in bdev_open_async_done() 8516 * being called twice. To avoid such race condition, set ctx->rc to 8517 * -ESHUTDOWN. 8518 */ 8519 ctx->rc = -ESHUTDOWN; 8520 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8521 } 8522 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8523 } 8524 8525 static int bdev_open_async(void *arg); 8526 8527 static void 8528 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8529 { 8530 uint64_t timeout_ticks; 8531 8532 if (ctx->rc == -ESHUTDOWN) { 8533 /* This context is being canceled. Do nothing. */ 8534 return; 8535 } 8536 8537 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8538 NULL, &ctx->desc); 8539 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8540 goto exit; 8541 } 8542 8543 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8544 if (spdk_get_ticks() >= timeout_ticks) { 8545 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8546 ctx->rc = -ETIMEDOUT; 8547 goto exit; 8548 } 8549 8550 return; 8551 8552 exit: 8553 spdk_poller_unregister(&ctx->poller); 8554 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8555 8556 /* Completion callback is processed after stack unwinding. 
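It is sent back to ctx->orig_thread, i.e. the thread that called spdk_bdev_open_async().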
*/ 8557 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8558 } 8559 8560 static int 8561 bdev_open_async(void *arg) 8562 { 8563 struct spdk_bdev_open_async_ctx *ctx = arg; 8564 8565 spdk_spin_lock(&g_bdev_mgr.spinlock); 8566 8567 _bdev_open_async(ctx); 8568 8569 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8570 8571 return SPDK_POLLER_BUSY; 8572 } 8573 8574 static void 8575 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8576 struct spdk_bdev_open_async_opts *opts_src, 8577 size_t size) 8578 { 8579 assert(opts); 8580 assert(opts_src); 8581 8582 opts->size = size; 8583 8584 #define SET_FIELD(field) \ 8585 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8586 opts->field = opts_src->field; \ 8587 } \ 8588 8589 SET_FIELD(timeout_ms); 8590 8591 /* Do not remove this statement. Always update it when adding a new field, 8592 * and do not forget to add the corresponding SET_FIELD statement for the new field. */ 8593 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8594 8595 #undef SET_FIELD 8596 } 8597 8598 static void 8599 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8600 { 8601 assert(opts); 8602 8603 opts->size = size; 8604 8605 #define SET_FIELD(field, value) \ 8606 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8607 opts->field = value; \ 8608 } \ 8609 8610 SET_FIELD(timeout_ms, 0); 8611 8612 #undef SET_FIELD 8613 } 8614 8615 int 8616 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8617 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8618 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8619 { 8620 struct spdk_bdev_open_async_ctx *ctx; 8621 8622 if (event_cb == NULL) { 8623 SPDK_ERRLOG("Missing event callback function\n"); 8624 return -EINVAL; 8625 } 8626 8627 if (open_cb == NULL) { 8628 SPDK_ERRLOG("Missing open callback function\n"); 8629 return -EINVAL; 8630 } 8631 8632 if (opts != NULL && opts->size == 0) { 8633 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8634 return -EINVAL; 8635 } 8636 8637 ctx = calloc(1, sizeof(*ctx)); 8638 if (ctx == NULL) { 8639 SPDK_ERRLOG("Failed to allocate open context\n"); 8640 return -ENOMEM; 8641 } 8642 8643 ctx->bdev_name = strdup(bdev_name); 8644 if (ctx->bdev_name == NULL) { 8645 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8646 free(ctx); 8647 return -ENOMEM; 8648 } 8649 8650 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8651 if (ctx->poller == NULL) { 8652 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8653 free(ctx->bdev_name); 8654 free(ctx); 8655 return -ENOMEM; 8656 } 8657 8658 ctx->cb_fn = open_cb; 8659 ctx->cb_arg = open_cb_arg; 8660 ctx->write = write; 8661 ctx->event_cb = event_cb; 8662 ctx->event_ctx = event_ctx; 8663 ctx->orig_thread = spdk_get_thread(); 8664 ctx->start_ticks = spdk_get_ticks(); 8665 8666 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8667 if (opts != NULL) { 8668 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8669 } 8670 8671 spdk_spin_lock(&g_bdev_mgr.spinlock); 8672 8673 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8674 _bdev_open_async(ctx); 8675 8676 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8677 8678 return 0; 8679 } 8680 8681 static void 8682 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8683 { 8684 int rc; 8685 8686
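/* bdev_close() takes the bdev and descriptor spinlocks itself. Callers such as
 * spdk_bdev_close() additionally serialize on the manager lock, e.g.:
 *
 *     spdk_spin_lock(&g_bdev_mgr.spinlock);
 *     bdev_close(bdev, desc);
 *     spdk_spin_unlock(&g_bdev_mgr.spinlock);
 */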
spdk_spin_lock(&bdev->internal.spinlock); 8687 spdk_spin_lock(&desc->spinlock); 8688 8689 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8690 8691 desc->closed = true; 8692 8693 if (desc->claim != NULL) { 8694 bdev_desc_release_claims(desc); 8695 } 8696 8697 if (0 == desc->refs) { 8698 spdk_spin_unlock(&desc->spinlock); 8699 bdev_desc_free(desc); 8700 } else { 8701 spdk_spin_unlock(&desc->spinlock); 8702 } 8703 8704 /* If no more descriptors, kill QoS channel */ 8705 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8706 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8707 bdev->name, spdk_get_thread()); 8708 8709 if (bdev_qos_destroy(bdev)) { 8710 /* There isn't anything we can do to recover here. Just let the 8711 * old QoS poller keep running. The QoS handling won't change 8712 * cores when the user allocates a new channel, but it won't break. */ 8713 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8714 } 8715 } 8716 8717 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8718 rc = bdev_unregister_unsafe(bdev); 8719 spdk_spin_unlock(&bdev->internal.spinlock); 8720 8721 if (rc == 0) { 8722 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8723 } 8724 } else { 8725 spdk_spin_unlock(&bdev->internal.spinlock); 8726 } 8727 } 8728 8729 void 8730 spdk_bdev_close(struct spdk_bdev_desc *desc) 8731 { 8732 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8733 8734 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8735 spdk_get_thread()); 8736 8737 assert(desc->thread == spdk_get_thread()); 8738 8739 spdk_poller_unregister(&desc->io_timeout_poller); 8740 8741 spdk_spin_lock(&g_bdev_mgr.spinlock); 8742 8743 bdev_close(bdev, desc); 8744 8745 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8746 } 8747 8748 int32_t 8749 spdk_bdev_get_numa_id(struct spdk_bdev *bdev) 8750 { 8751 if (bdev->numa.id_valid) { 8752 return bdev->numa.id; 8753 } else { 8754 return SPDK_ENV_NUMA_ID_ANY; 8755 } 8756 } 8757 8758 static void 8759 bdev_register_finished(void *arg) 8760 { 8761 struct spdk_bdev_desc *desc = arg; 8762 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8763 8764 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8765 8766 spdk_spin_lock(&g_bdev_mgr.spinlock); 8767 8768 bdev_close(bdev, desc); 8769 8770 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8771 } 8772 8773 int 8774 spdk_bdev_register(struct spdk_bdev *bdev) 8775 { 8776 struct spdk_bdev_desc *desc; 8777 struct spdk_thread *thread = spdk_get_thread(); 8778 int rc; 8779 8780 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8781 SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread, 8782 thread ? 
spdk_thread_get_name(thread) : "null"); 8783 return -EINVAL; 8784 } 8785 8786 rc = bdev_register(bdev); 8787 if (rc != 0) { 8788 return rc; 8789 } 8790 8791 /* A descriptor is opened to prevent bdev deletion during examination */ 8792 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 8793 if (rc != 0) { 8794 spdk_bdev_unregister(bdev, NULL, NULL); 8795 return rc; 8796 } 8797 8798 rc = bdev_open(bdev, false, desc); 8799 if (rc != 0) { 8800 bdev_desc_free(desc); 8801 spdk_bdev_unregister(bdev, NULL, NULL); 8802 return rc; 8803 } 8804 8805 /* Examine configuration before initializing I/O */ 8806 bdev_examine(bdev); 8807 8808 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8809 if (rc != 0) { 8810 bdev_close(bdev, desc); 8811 spdk_bdev_unregister(bdev, NULL, NULL); 8812 } 8813 8814 return rc; 8815 } 8816 8817 int 8818 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8819 struct spdk_bdev_module *module) 8820 { 8821 spdk_spin_lock(&bdev->internal.spinlock); 8822 8823 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8824 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8825 spdk_spin_unlock(&bdev->internal.spinlock); 8826 return -EPERM; 8827 } 8828 8829 if (desc && !desc->write) { 8830 desc->write = true; 8831 } 8832 8833 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8834 bdev->internal.claim.v1.module = module; 8835 8836 spdk_spin_unlock(&bdev->internal.spinlock); 8837 return 0; 8838 } 8839 8840 void 8841 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8842 { 8843 spdk_spin_lock(&bdev->internal.spinlock); 8844 8845 assert(bdev->internal.claim.v1.module != NULL); 8846 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8847 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8848 bdev->internal.claim.v1.module = NULL; 8849 8850 spdk_spin_unlock(&bdev->internal.spinlock); 8851 } 8852 8853 /* 8854 * Start claims v2 8855 */ 8856 8857 const char * 8858 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8859 { 8860 switch (type) { 8861 case SPDK_BDEV_CLAIM_NONE: 8862 return "not_claimed"; 8863 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8864 return "exclusive_write"; 8865 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8866 return "read_many_write_one"; 8867 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8868 return "read_many_write_none"; 8869 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8870 return "read_many_write_many"; 8871 default: 8872 break; 8873 } 8874 return "invalid_claim"; 8875 } 8876 8877 static bool 8878 claim_type_is_v2(enum spdk_bdev_claim_type type) 8879 { 8880 switch (type) { 8881 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8882 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8883 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8884 return true; 8885 default: 8886 break; 8887 } 8888 return false; 8889 } 8890 8891 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
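This applies to the read-many-write-one and read-many-write-shared claim types handled below.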
*/ 8892 static bool 8893 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8894 { 8895 switch (type) { 8896 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8897 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8898 return true; 8899 default: 8900 break; 8901 } 8902 return false; 8903 } 8904 8905 void 8906 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8907 { 8908 if (opts == NULL) { 8909 SPDK_ERRLOG("opts should not be NULL\n"); 8910 assert(opts != NULL); 8911 return; 8912 } 8913 if (size == 0) { 8914 SPDK_ERRLOG("size should not be zero\n"); 8915 assert(size != 0); 8916 return; 8917 } 8918 8919 memset(opts, 0, size); 8920 opts->opts_size = size; 8921 8922 #define FIELD_OK(field) \ 8923 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8924 8925 #define SET_FIELD(field, value) \ 8926 if (FIELD_OK(field)) { \ 8927 opts->field = value; \ 8928 } \ 8929 8930 SET_FIELD(shared_claim_key, 0); 8931 8932 #undef FIELD_OK 8933 #undef SET_FIELD 8934 } 8935 8936 static int 8937 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8938 { 8939 if (src->opts_size == 0) { 8940 SPDK_ERRLOG("size should not be zero\n"); 8941 return -1; 8942 } 8943 8944 memset(dst, 0, sizeof(*dst)); 8945 dst->opts_size = src->opts_size; 8946 8947 #define FIELD_OK(field) \ 8948 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8949 8950 #define SET_FIELD(field) \ 8951 if (FIELD_OK(field)) { \ 8952 dst->field = src->field; \ 8953 } \ 8954 8955 if (FIELD_OK(name)) { 8956 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8957 } 8958 8959 SET_FIELD(shared_claim_key); 8960 8961 /* You should not remove this statement, but need to update the assert statement 8962 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8963 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8964 8965 #undef FIELD_OK 8966 #undef SET_FIELD 8967 return 0; 8968 } 8969 8970 /* Returns 0 if a read-write-once claim can be taken. */ 8971 static int 8972 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8973 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8974 { 8975 struct spdk_bdev *bdev = desc->bdev; 8976 struct spdk_bdev_desc *open_desc; 8977 8978 assert(spdk_spin_held(&bdev->internal.spinlock)); 8979 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8980 8981 if (opts->shared_claim_key != 0) { 8982 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8983 bdev->name); 8984 return -EINVAL; 8985 } 8986 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8987 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8988 return -EPERM; 8989 } 8990 if (desc->claim != NULL) { 8991 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8992 bdev->name, desc->claim->module->name); 8993 return -EPERM; 8994 } 8995 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8996 if (desc != open_desc && open_desc->write) { 8997 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8998 "another descriptor is open for writing\n", 8999 bdev->name); 9000 return -EPERM; 9001 } 9002 } 9003 9004 return 0; 9005 } 9006 9007 /* Returns 0 if a read-only-many claim can be taken. 
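The descriptor must be read-only and, while the bdev is unclaimed, no other descriptor may be open for writing.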
*/ 9008 static int 9009 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9010 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9011 { 9012 struct spdk_bdev *bdev = desc->bdev; 9013 struct spdk_bdev_desc *open_desc; 9014 9015 assert(spdk_spin_held(&bdev->internal.spinlock)); 9016 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 9017 assert(desc->claim == NULL); 9018 9019 if (desc->write) { 9020 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 9021 bdev->name); 9022 return -EINVAL; 9023 } 9024 if (opts->shared_claim_key != 0) { 9025 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 9026 return -EINVAL; 9027 } 9028 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 9029 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 9030 if (open_desc->write) { 9031 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 9032 "another descriptor is open for writing\n", 9033 bdev->name); 9034 return -EPERM; 9035 } 9036 } 9037 } 9038 9039 return 0; 9040 } 9041 9042 /* Returns 0 if a read-write-many claim can be taken. */ 9043 static int 9044 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9045 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9046 { 9047 struct spdk_bdev *bdev = desc->bdev; 9048 struct spdk_bdev_desc *open_desc; 9049 9050 assert(spdk_spin_held(&bdev->internal.spinlock)); 9051 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 9052 assert(desc->claim == NULL); 9053 9054 if (opts->shared_claim_key == 0) { 9055 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 9056 bdev->name); 9057 return -EINVAL; 9058 } 9059 switch (bdev->internal.claim_type) { 9060 case SPDK_BDEV_CLAIM_NONE: 9061 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 9062 if (open_desc == desc) { 9063 continue; 9064 } 9065 if (open_desc->write) { 9066 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 9067 "another descriptor is open for writing without a " 9068 "claim\n", bdev->name); 9069 return -EPERM; 9070 } 9071 } 9072 break; 9073 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9074 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 9075 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 9076 return -EPERM; 9077 } 9078 break; 9079 default: 9080 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9081 return -EBUSY; 9082 } 9083 9084 return 0; 9085 } 9086 9087 /* Updates desc and its bdev with a v2 claim.
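The allocated claim is linked into bdev->internal.claim.v2.claims and released later by bdev_desc_release_claims().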
*/ 9088 static int 9089 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9090 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9091 { 9092 struct spdk_bdev *bdev = desc->bdev; 9093 struct spdk_bdev_module_claim *claim; 9094 9095 assert(spdk_spin_held(&bdev->internal.spinlock)); 9096 assert(claim_type_is_v2(type)); 9097 assert(desc->claim == NULL); 9098 9099 claim = calloc(1, sizeof(*desc->claim)); 9100 if (claim == NULL) { 9101 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 9102 return -ENOMEM; 9103 } 9104 claim->module = module; 9105 claim->desc = desc; 9106 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 9107 memcpy(claim->name, opts->name, sizeof(claim->name)); 9108 desc->claim = claim; 9109 9110 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 9111 bdev->internal.claim_type = type; 9112 TAILQ_INIT(&bdev->internal.claim.v2.claims); 9113 bdev->internal.claim.v2.key = opts->shared_claim_key; 9114 } 9115 assert(type == bdev->internal.claim_type); 9116 9117 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 9118 9119 if (!desc->write && claim_type_promotes_to_write(type)) { 9120 desc->write = true; 9121 } 9122 9123 return 0; 9124 } 9125 9126 int 9127 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9128 struct spdk_bdev_claim_opts *_opts, 9129 struct spdk_bdev_module *module) 9130 { 9131 struct spdk_bdev *bdev; 9132 struct spdk_bdev_claim_opts opts; 9133 int rc = 0; 9134 9135 if (desc == NULL) { 9136 SPDK_ERRLOG("descriptor must not be NULL\n"); 9137 return -EINVAL; 9138 } 9139 9140 bdev = desc->bdev; 9141 9142 if (_opts == NULL) { 9143 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 9144 } else if (claim_opts_copy(_opts, &opts) != 0) { 9145 return -EINVAL; 9146 } 9147 9148 spdk_spin_lock(&bdev->internal.spinlock); 9149 9150 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 9151 bdev->internal.claim_type != type) { 9152 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9153 spdk_spin_unlock(&bdev->internal.spinlock); 9154 return -EPERM; 9155 } 9156 9157 if (claim_type_is_v2(type) && desc->claim != NULL) { 9158 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 9159 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 9160 spdk_spin_unlock(&bdev->internal.spinlock); 9161 return -EPERM; 9162 } 9163 9164 switch (type) { 9165 case SPDK_BDEV_CLAIM_EXCL_WRITE: 9166 spdk_spin_unlock(&bdev->internal.spinlock); 9167 return spdk_bdev_module_claim_bdev(bdev, desc, module); 9168 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9169 rc = claim_verify_rwo(desc, type, &opts, module); 9170 break; 9171 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9172 rc = claim_verify_rom(desc, type, &opts, module); 9173 break; 9174 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9175 rc = claim_verify_rwm(desc, type, &opts, module); 9176 break; 9177 default: 9178 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 9179 rc = -ENOTSUP; 9180 } 9181 9182 if (rc == 0) { 9183 rc = claim_bdev(desc, type, &opts, module); 9184 } 9185 9186 spdk_spin_unlock(&bdev->internal.spinlock); 9187 return rc; 9188 } 9189 9190 static void 9191 claim_reset(struct spdk_bdev *bdev) 9192 { 9193 assert(spdk_spin_held(&bdev->internal.spinlock)); 9194 assert(claim_type_is_v2(bdev->internal.claim_type)); 9195 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 9196 9197 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 9198 
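/* The memset() above clears the v2 claim bookkeeping (the claims list head and the shared key); the claim type itself is tracked separately and is reset to SPDK_BDEV_CLAIM_NONE just below. */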
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 9199 } 9200 9201 static void 9202 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 9203 { 9204 struct spdk_bdev *bdev = desc->bdev; 9205 9206 assert(spdk_spin_held(&bdev->internal.spinlock)); 9207 assert(claim_type_is_v2(bdev->internal.claim_type)); 9208 9209 if (bdev->internal.examine_in_progress == 0) { 9210 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 9211 free(desc->claim); 9212 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 9213 claim_reset(bdev); 9214 } 9215 } else { 9216 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 9217 desc->claim->module = NULL; 9218 desc->claim->desc = NULL; 9219 } 9220 desc->claim = NULL; 9221 } 9222 9223 /* 9224 * End claims v2 9225 */ 9226 9227 struct spdk_bdev * 9228 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 9229 { 9230 assert(desc != NULL); 9231 return desc->bdev; 9232 } 9233 9234 int 9235 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 9236 { 9237 struct spdk_bdev *bdev, *tmp; 9238 struct spdk_bdev_desc *desc; 9239 int rc = 0; 9240 9241 assert(fn != NULL); 9242 9243 spdk_spin_lock(&g_bdev_mgr.spinlock); 9244 bdev = spdk_bdev_first(); 9245 while (bdev != NULL) { 9246 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9247 if (rc != 0) { 9248 break; 9249 } 9250 rc = bdev_open(bdev, false, desc); 9251 if (rc != 0) { 9252 bdev_desc_free(desc); 9253 if (rc == -ENODEV) { 9254 /* Ignore the error and move to the next bdev. */ 9255 rc = 0; 9256 bdev = spdk_bdev_next(bdev); 9257 continue; 9258 } 9259 break; 9260 } 9261 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9262 9263 rc = fn(ctx, bdev); 9264 9265 spdk_spin_lock(&g_bdev_mgr.spinlock); 9266 tmp = spdk_bdev_next(bdev); 9267 bdev_close(bdev, desc); 9268 if (rc != 0) { 9269 break; 9270 } 9271 bdev = tmp; 9272 } 9273 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9274 9275 return rc; 9276 } 9277 9278 int 9279 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 9280 { 9281 struct spdk_bdev *bdev, *tmp; 9282 struct spdk_bdev_desc *desc; 9283 int rc = 0; 9284 9285 assert(fn != NULL); 9286 9287 spdk_spin_lock(&g_bdev_mgr.spinlock); 9288 bdev = spdk_bdev_first_leaf(); 9289 while (bdev != NULL) { 9290 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9291 if (rc != 0) { 9292 break; 9293 } 9294 rc = bdev_open(bdev, false, desc); 9295 if (rc != 0) { 9296 bdev_desc_free(desc); 9297 if (rc == -ENODEV) { 9298 /* Ignore the error and move to the next bdev. 
*/ 9299 rc = 0; 9300 bdev = spdk_bdev_next_leaf(bdev); 9301 continue; 9302 } 9303 break; 9304 } 9305 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9306 9307 rc = fn(ctx, bdev); 9308 9309 spdk_spin_lock(&g_bdev_mgr.spinlock); 9310 tmp = spdk_bdev_next_leaf(bdev); 9311 bdev_close(bdev, desc); 9312 if (rc != 0) { 9313 break; 9314 } 9315 bdev = tmp; 9316 } 9317 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9318 9319 return rc; 9320 } 9321 9322 void 9323 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 9324 { 9325 struct iovec *iovs; 9326 int iovcnt; 9327 9328 if (bdev_io == NULL) { 9329 return; 9330 } 9331 9332 switch (bdev_io->type) { 9333 case SPDK_BDEV_IO_TYPE_READ: 9334 case SPDK_BDEV_IO_TYPE_WRITE: 9335 case SPDK_BDEV_IO_TYPE_ZCOPY: 9336 iovs = bdev_io->u.bdev.iovs; 9337 iovcnt = bdev_io->u.bdev.iovcnt; 9338 break; 9339 default: 9340 iovs = NULL; 9341 iovcnt = 0; 9342 break; 9343 } 9344 9345 if (iovp) { 9346 *iovp = iovs; 9347 } 9348 if (iovcntp) { 9349 *iovcntp = iovcnt; 9350 } 9351 } 9352 9353 void * 9354 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 9355 { 9356 if (bdev_io == NULL) { 9357 return NULL; 9358 } 9359 9360 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 9361 return NULL; 9362 } 9363 9364 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 9365 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 9366 return bdev_io->u.bdev.md_buf; 9367 } 9368 9369 return NULL; 9370 } 9371 9372 void * 9373 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9374 { 9375 if (bdev_io == NULL) { 9376 assert(false); 9377 return NULL; 9378 } 9379 9380 return bdev_io->internal.caller_ctx; 9381 } 9382 9383 void 9384 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9385 { 9386 9387 if (spdk_bdev_module_list_find(bdev_module->name)) { 9388 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9389 assert(false); 9390 } 9391 9392 spdk_spin_init(&bdev_module->internal.spinlock); 9393 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9394 9395 /* 9396 * Modules with examine callbacks must be initialized first, so they are 9397 * ready to handle examine callbacks from later modules that will 9398 * register physical bdevs. 
9399 */ 9400 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9401 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9402 } else { 9403 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9404 } 9405 } 9406 9407 struct spdk_bdev_module * 9408 spdk_bdev_module_list_find(const char *name) 9409 { 9410 struct spdk_bdev_module *bdev_module; 9411 9412 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9413 if (strcmp(name, bdev_module->name) == 0) { 9414 break; 9415 } 9416 } 9417 9418 return bdev_module; 9419 } 9420 9421 static int 9422 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9423 { 9424 uint64_t num_blocks; 9425 void *md_buf = NULL; 9426 9427 num_blocks = bdev_io->u.bdev.num_blocks; 9428 9429 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9430 md_buf = (char *)g_bdev_mgr.zero_buffer + 9431 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9432 } 9433 9434 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9435 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9436 g_bdev_mgr.zero_buffer, md_buf, 9437 bdev_io->u.bdev.offset_blocks, num_blocks, 9438 bdev_write_zero_buffer_done, bdev_io); 9439 } 9440 9441 static void 9442 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9443 { 9444 struct spdk_bdev_io *parent_io = cb_arg; 9445 9446 spdk_bdev_free_io(bdev_io); 9447 9448 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9449 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9450 } 9451 9452 static void 9453 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9454 { 9455 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9456 ctx->bdev->internal.qos_mod_in_progress = false; 9457 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9458 9459 if (ctx->cb_fn) { 9460 ctx->cb_fn(ctx->cb_arg, status); 9461 } 9462 free(ctx); 9463 } 9464 9465 static void 9466 bdev_disable_qos_done(void *cb_arg) 9467 { 9468 struct set_qos_limit_ctx *ctx = cb_arg; 9469 struct spdk_bdev *bdev = ctx->bdev; 9470 struct spdk_bdev_qos *qos; 9471 9472 spdk_spin_lock(&bdev->internal.spinlock); 9473 qos = bdev->internal.qos; 9474 bdev->internal.qos = NULL; 9475 spdk_spin_unlock(&bdev->internal.spinlock); 9476 9477 if (qos->thread != NULL) { 9478 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9479 spdk_poller_unregister(&qos->poller); 9480 } 9481 9482 free(qos); 9483 9484 bdev_set_qos_limit_done(ctx, 0); 9485 } 9486 9487 static void 9488 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9489 { 9490 struct set_qos_limit_ctx *ctx = _ctx; 9491 struct spdk_thread *thread; 9492 9493 spdk_spin_lock(&bdev->internal.spinlock); 9494 thread = bdev->internal.qos->thread; 9495 spdk_spin_unlock(&bdev->internal.spinlock); 9496 9497 if (thread != NULL) { 9498 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9499 } else { 9500 bdev_disable_qos_done(ctx); 9501 } 9502 } 9503 9504 static void 9505 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9506 struct spdk_io_channel *ch, void *_ctx) 9507 { 9508 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9509 struct spdk_bdev_io *bdev_io; 9510 9511 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9512 9513 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9514 /* Re-submit the queued I/O. 
*/ 9515 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9516 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9517 _bdev_io_submit(bdev_io); 9518 } 9519 9520 spdk_bdev_for_each_channel_continue(i, 0); 9521 } 9522 9523 static void 9524 bdev_update_qos_rate_limit_msg(void *cb_arg) 9525 { 9526 struct set_qos_limit_ctx *ctx = cb_arg; 9527 struct spdk_bdev *bdev = ctx->bdev; 9528 9529 spdk_spin_lock(&bdev->internal.spinlock); 9530 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9531 spdk_spin_unlock(&bdev->internal.spinlock); 9532 9533 bdev_set_qos_limit_done(ctx, 0); 9534 } 9535 9536 static void 9537 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9538 struct spdk_io_channel *ch, void *_ctx) 9539 { 9540 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9541 9542 spdk_spin_lock(&bdev->internal.spinlock); 9543 bdev_enable_qos(bdev, bdev_ch); 9544 spdk_spin_unlock(&bdev->internal.spinlock); 9545 spdk_bdev_for_each_channel_continue(i, 0); 9546 } 9547 9548 static void 9549 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9550 { 9551 struct set_qos_limit_ctx *ctx = _ctx; 9552 9553 bdev_set_qos_limit_done(ctx, status); 9554 } 9555 9556 static void 9557 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9558 { 9559 int i; 9560 9561 assert(bdev->internal.qos != NULL); 9562 9563 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9564 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9565 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9566 9567 if (limits[i] == 0) { 9568 bdev->internal.qos->rate_limits[i].limit = 9569 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9570 } 9571 } 9572 } 9573 } 9574 9575 void 9576 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9577 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9578 { 9579 struct set_qos_limit_ctx *ctx; 9580 uint32_t limit_set_complement; 9581 uint64_t min_limit_per_sec; 9582 int i; 9583 bool disable_rate_limit = true; 9584 9585 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9586 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9587 continue; 9588 } 9589 9590 if (limits[i] > 0) { 9591 disable_rate_limit = false; 9592 } 9593 9594 if (bdev_qos_is_iops_rate_limit(i) == true) { 9595 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9596 } else { 9597 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9598 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9599 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9600 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9601 } 9602 /* Change from megabyte to byte rate limit */ 9603 limits[i] = limits[i] * 1024 * 1024; 9604 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9605 } 9606 9607 limit_set_complement = limits[i] % min_limit_per_sec; 9608 if (limit_set_complement) { 9609 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9610 limits[i], min_limit_per_sec); 9611 limits[i] += min_limit_per_sec - limit_set_complement; 9612 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9613 } 9614 } 9615 9616 ctx = calloc(1, sizeof(*ctx)); 9617 if (ctx == NULL) { 9618 cb_fn(cb_arg, -ENOMEM); 9619 return; 9620 } 9621 9622 ctx->cb_fn = cb_fn; 9623 ctx->cb_arg = cb_arg; 9624 ctx->bdev = bdev; 9625 9626 spdk_spin_lock(&bdev->internal.spinlock); 9627 if (bdev->internal.qos_mod_in_progress) { 9628 spdk_spin_unlock(&bdev->internal.spinlock); 9629 free(ctx); 9630 cb_fn(cb_arg, 
-EAGAIN); 9631 return; 9632 } 9633 bdev->internal.qos_mod_in_progress = true; 9634 9635 if (disable_rate_limit == true && bdev->internal.qos) { 9636 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9637 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9638 (bdev->internal.qos->rate_limits[i].limit > 0 && 9639 bdev->internal.qos->rate_limits[i].limit != 9640 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9641 disable_rate_limit = false; 9642 break; 9643 } 9644 } 9645 } 9646 9647 if (disable_rate_limit == false) { 9648 if (bdev->internal.qos == NULL) { 9649 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9650 if (!bdev->internal.qos) { 9651 spdk_spin_unlock(&bdev->internal.spinlock); 9652 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9653 bdev_set_qos_limit_done(ctx, -ENOMEM); 9654 return; 9655 } 9656 } 9657 9658 if (bdev->internal.qos->thread == NULL) { 9659 /* Enabling */ 9660 bdev_set_qos_rate_limits(bdev, limits); 9661 9662 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9663 bdev_enable_qos_done); 9664 } else { 9665 /* Updating */ 9666 bdev_set_qos_rate_limits(bdev, limits); 9667 9668 spdk_thread_send_msg(bdev->internal.qos->thread, 9669 bdev_update_qos_rate_limit_msg, ctx); 9670 } 9671 } else { 9672 if (bdev->internal.qos != NULL) { 9673 bdev_set_qos_rate_limits(bdev, limits); 9674 9675 /* Disabling */ 9676 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9677 bdev_disable_qos_msg_done); 9678 } else { 9679 spdk_spin_unlock(&bdev->internal.spinlock); 9680 bdev_set_qos_limit_done(ctx, 0); 9681 return; 9682 } 9683 } 9684 9685 spdk_spin_unlock(&bdev->internal.spinlock); 9686 } 9687 9688 struct spdk_bdev_histogram_ctx { 9689 spdk_bdev_histogram_status_cb cb_fn; 9690 void *cb_arg; 9691 struct spdk_bdev *bdev; 9692 int status; 9693 }; 9694 9695 static void 9696 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9697 { 9698 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9699 9700 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9701 ctx->bdev->internal.histogram_in_progress = false; 9702 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9703 ctx->cb_fn(ctx->cb_arg, ctx->status); 9704 free(ctx); 9705 } 9706 9707 static void 9708 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9709 struct spdk_io_channel *_ch, void *_ctx) 9710 { 9711 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9712 9713 if (ch->histogram != NULL) { 9714 spdk_histogram_data_free(ch->histogram); 9715 ch->histogram = NULL; 9716 } 9717 spdk_bdev_for_each_channel_continue(i, 0); 9718 } 9719 9720 static void 9721 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9722 { 9723 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9724 9725 if (status != 0) { 9726 ctx->status = status; 9727 ctx->bdev->internal.histogram_enabled = false; 9728 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9729 bdev_histogram_disable_channel_cb); 9730 } else { 9731 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9732 ctx->bdev->internal.histogram_in_progress = false; 9733 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9734 ctx->cb_fn(ctx->cb_arg, ctx->status); 9735 free(ctx); 9736 } 9737 } 9738 9739 static void 9740 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9741 struct spdk_io_channel *_ch, void *_ctx) 9742 { 9743 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9744 int status = 0; 9745 9746 if (ch->histogram == NULL) { 9747 
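/* Lazily allocate a per-channel histogram; spdk_bdev_histogram_get() later merges each channel's data into the caller's histogram in bdev_histogram_get_channel(). */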
ch->histogram = spdk_histogram_data_alloc(); 9748 if (ch->histogram == NULL) { 9749 status = -ENOMEM; 9750 } 9751 } 9752 9753 spdk_bdev_for_each_channel_continue(i, status); 9754 } 9755 9756 void 9757 spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9758 void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts) 9759 { 9760 struct spdk_bdev_histogram_ctx *ctx; 9761 9762 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9763 if (ctx == NULL) { 9764 cb_fn(cb_arg, -ENOMEM); 9765 return; 9766 } 9767 9768 ctx->bdev = bdev; 9769 ctx->status = 0; 9770 ctx->cb_fn = cb_fn; 9771 ctx->cb_arg = cb_arg; 9772 9773 spdk_spin_lock(&bdev->internal.spinlock); 9774 if (bdev->internal.histogram_in_progress) { 9775 spdk_spin_unlock(&bdev->internal.spinlock); 9776 free(ctx); 9777 cb_fn(cb_arg, -EAGAIN); 9778 return; 9779 } 9780 9781 bdev->internal.histogram_in_progress = true; 9782 spdk_spin_unlock(&bdev->internal.spinlock); 9783 9784 bdev->internal.histogram_enabled = enable; 9785 bdev->internal.histogram_io_type = opts->io_type; 9786 9787 if (enable) { 9788 /* Allocate histogram for each channel */ 9789 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9790 bdev_histogram_enable_channel_cb); 9791 } else { 9792 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9793 bdev_histogram_disable_channel_cb); 9794 } 9795 } 9796 9797 void 9798 spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size) 9799 { 9800 if (opts == NULL) { 9801 SPDK_ERRLOG("opts should not be NULL\n"); 9802 assert(opts != NULL); 9803 return; 9804 } 9805 if (size == 0) { 9806 SPDK_ERRLOG("size should not be zero\n"); 9807 assert(size != 0); 9808 return; 9809 } 9810 9811 memset(opts, 0, size); 9812 opts->size = size; 9813 9814 #define FIELD_OK(field) \ 9815 offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size 9816 9817 #define SET_FIELD(field, value) \ 9818 if (FIELD_OK(field)) { \ 9819 opts->field = value; \ 9820 } \ 9821 9822 SET_FIELD(io_type, 0); 9823 9824 /* You should not remove this statement, but need to update the assert statement 9825 * if you add a new field, and also add a corresponding SET_FIELD statement */ 9826 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size"); 9827 9828 #undef FIELD_OK 9829 #undef SET_FIELD 9830 } 9831 9832 void 9833 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9834 void *cb_arg, bool enable) 9835 { 9836 struct spdk_bdev_enable_histogram_opts opts; 9837 9838 spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts)); 9839 spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts); 9840 } 9841 9842 struct spdk_bdev_histogram_data_ctx { 9843 spdk_bdev_histogram_data_cb cb_fn; 9844 void *cb_arg; 9845 struct spdk_bdev *bdev; 9846 /** merged histogram data from all channels */ 9847 struct spdk_histogram_data *histogram; 9848 }; 9849 9850 static void 9851 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9852 { 9853 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9854 9855 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9856 free(ctx); 9857 } 9858 9859 static void 9860 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9861 struct spdk_io_channel *_ch, void *_ctx) 9862 { 9863 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9864 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9865 int 
status = 0; 9866 9867 if (ch->histogram == NULL) { 9868 status = -EFAULT; 9869 } else { 9870 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9871 } 9872 9873 spdk_bdev_for_each_channel_continue(i, status); 9874 } 9875 9876 void 9877 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9878 spdk_bdev_histogram_data_cb cb_fn, 9879 void *cb_arg) 9880 { 9881 struct spdk_bdev_histogram_data_ctx *ctx; 9882 9883 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9884 if (ctx == NULL) { 9885 cb_fn(cb_arg, -ENOMEM, NULL); 9886 return; 9887 } 9888 9889 ctx->bdev = bdev; 9890 ctx->cb_fn = cb_fn; 9891 ctx->cb_arg = cb_arg; 9892 9893 ctx->histogram = histogram; 9894 9895 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9896 bdev_histogram_get_channel_cb); 9897 } 9898 9899 void 9900 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9901 void *cb_arg) 9902 { 9903 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9904 int status = 0; 9905 9906 assert(cb_fn != NULL); 9907 9908 if (bdev_ch->histogram == NULL) { 9909 status = -EFAULT; 9910 } 9911 cb_fn(cb_arg, status, bdev_ch->histogram); 9912 } 9913 9914 size_t 9915 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9916 size_t max_events) 9917 { 9918 struct media_event_entry *entry; 9919 size_t num_events = 0; 9920 9921 for (; num_events < max_events; ++num_events) { 9922 entry = TAILQ_FIRST(&desc->pending_media_events); 9923 if (entry == NULL) { 9924 break; 9925 } 9926 9927 events[num_events] = entry->event; 9928 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9929 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9930 } 9931 9932 return num_events; 9933 } 9934 9935 int 9936 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9937 size_t num_events) 9938 { 9939 struct spdk_bdev_desc *desc; 9940 struct media_event_entry *entry; 9941 size_t event_id; 9942 int rc = 0; 9943 9944 assert(bdev->media_events); 9945 9946 spdk_spin_lock(&bdev->internal.spinlock); 9947 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9948 if (desc->write) { 9949 break; 9950 } 9951 } 9952 9953 if (desc == NULL || desc->media_events_buffer == NULL) { 9954 rc = -ENODEV; 9955 goto out; 9956 } 9957 9958 for (event_id = 0; event_id < num_events; ++event_id) { 9959 entry = TAILQ_FIRST(&desc->free_media_events); 9960 if (entry == NULL) { 9961 break; 9962 } 9963 9964 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9965 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9966 entry->event = events[event_id]; 9967 } 9968 9969 rc = event_id; 9970 out: 9971 spdk_spin_unlock(&bdev->internal.spinlock); 9972 return rc; 9973 } 9974 9975 static void 9976 _media_management_notify(void *arg) 9977 { 9978 struct spdk_bdev_desc *desc = arg; 9979 9980 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9981 } 9982 9983 void 9984 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9985 { 9986 struct spdk_bdev_desc *desc; 9987 9988 spdk_spin_lock(&bdev->internal.spinlock); 9989 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9990 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9991 event_notify(desc, _media_management_notify); 9992 } 9993 } 9994 spdk_spin_unlock(&bdev->internal.spinlock); 9995 } 9996 9997 struct locked_lba_range_ctx { 9998 struct lba_range range; 9999 struct lba_range *current_range; 10000 struct lba_range *owner_range; 
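/* Poller registered by bdev_lock_lba_range_check_io() to re-check (every 100 us) for outstanding I/O overlapping the range before the per-channel lock completes. */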
10001 struct spdk_poller *poller; 10002 lock_range_cb cb_fn; 10003 void *cb_arg; 10004 }; 10005 10006 static void 10007 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10008 { 10009 struct locked_lba_range_ctx *ctx = _ctx; 10010 10011 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 10012 free(ctx); 10013 } 10014 10015 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 10016 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 10017 10018 static void 10019 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10020 { 10021 struct locked_lba_range_ctx *ctx = _ctx; 10022 10023 if (status == -ENOMEM) { 10024 /* One of the channels could not allocate a range object. 10025 * So we have to go back and clean up any ranges that were 10026 * allocated successfully before we return error status to 10027 * the caller. We can reuse the unlock function to do that 10028 * clean up. 10029 */ 10030 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10031 bdev_lock_error_cleanup_cb); 10032 return; 10033 } 10034 10035 /* All channels have locked this range and no I/O overlapping the range 10036 * are outstanding! Set the owner_ch for the range object for the 10037 * locking channel, so that this channel will know that it is allowed 10038 * to write to this range. 10039 */ 10040 if (ctx->owner_range != NULL) { 10041 ctx->owner_range->owner_ch = ctx->range.owner_ch; 10042 } 10043 10044 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10045 10046 /* Don't free the ctx here. Its range is in the bdev's global list of 10047 * locked ranges still, and will be removed and freed when this range 10048 * is later unlocked. 10049 */ 10050 } 10051 10052 static int 10053 bdev_lock_lba_range_check_io(void *_i) 10054 { 10055 struct spdk_bdev_channel_iter *i = _i; 10056 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 10057 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10058 struct locked_lba_range_ctx *ctx = i->ctx; 10059 struct lba_range *range = ctx->current_range; 10060 struct spdk_bdev_io *bdev_io; 10061 10062 spdk_poller_unregister(&ctx->poller); 10063 10064 /* The range is now in the locked_ranges, so no new IO can be submitted to this 10065 * range. But we need to wait until any outstanding IO overlapping with this range 10066 * are completed. 10067 */ 10068 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 10069 if (bdev_io_range_is_locked(bdev_io, range)) { 10070 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 10071 return SPDK_POLLER_BUSY; 10072 } 10073 } 10074 10075 spdk_bdev_for_each_channel_continue(i, 0); 10076 return SPDK_POLLER_BUSY; 10077 } 10078 10079 static void 10080 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10081 struct spdk_io_channel *_ch, void *_ctx) 10082 { 10083 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10084 struct locked_lba_range_ctx *ctx = _ctx; 10085 struct lba_range *range; 10086 10087 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10088 if (range->length == ctx->range.length && 10089 range->offset == ctx->range.offset && 10090 range->locked_ctx == ctx->range.locked_ctx) { 10091 /* This range already exists on this channel, so don't add 10092 * it again. This can happen when a new channel is created 10093 * while the for_each_channel operation is in progress. 
10094 * Do not check for outstanding I/O in that case, since the 10095 * range was locked before any I/O could be submitted to the 10096 * new channel. 10097 */ 10098 spdk_bdev_for_each_channel_continue(i, 0); 10099 return; 10100 } 10101 } 10102 10103 range = calloc(1, sizeof(*range)); 10104 if (range == NULL) { 10105 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 10106 return; 10107 } 10108 10109 range->length = ctx->range.length; 10110 range->offset = ctx->range.offset; 10111 range->locked_ctx = ctx->range.locked_ctx; 10112 range->quiesce = ctx->range.quiesce; 10113 ctx->current_range = range; 10114 if (ctx->range.owner_ch == ch) { 10115 /* This is the range object for the channel that will hold 10116 * the lock. Store it in the ctx object so that we can easily 10117 * set its owner_ch after the lock is finally acquired. 10118 */ 10119 ctx->owner_range = range; 10120 } 10121 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 10122 bdev_lock_lba_range_check_io(i); 10123 } 10124 10125 static void 10126 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 10127 { 10128 assert(spdk_get_thread() == ctx->range.owner_thread); 10129 assert(ctx->range.owner_ch == NULL || 10130 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 10131 10132 /* We will add a copy of this range to each channel now. */ 10133 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 10134 bdev_lock_lba_range_cb); 10135 } 10136 10137 static bool 10138 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 10139 { 10140 struct lba_range *r; 10141 10142 TAILQ_FOREACH(r, tailq, tailq) { 10143 if (bdev_lba_range_overlapped(range, r)) { 10144 return true; 10145 } 10146 } 10147 return false; 10148 } 10149 10150 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status); 10151 10152 static int 10153 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 10154 uint64_t offset, uint64_t length, 10155 lock_range_cb cb_fn, void *cb_arg) 10156 { 10157 struct locked_lba_range_ctx *ctx; 10158 10159 ctx = calloc(1, sizeof(*ctx)); 10160 if (ctx == NULL) { 10161 return -ENOMEM; 10162 } 10163 10164 ctx->range.offset = offset; 10165 ctx->range.length = length; 10166 ctx->range.owner_thread = spdk_get_thread(); 10167 ctx->range.owner_ch = ch; 10168 ctx->range.locked_ctx = cb_arg; 10169 ctx->range.bdev = bdev; 10170 ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked); 10171 ctx->cb_fn = cb_fn; 10172 ctx->cb_arg = cb_arg; 10173 10174 spdk_spin_lock(&bdev->internal.spinlock); 10175 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 10176 /* There is an active lock overlapping with this range. 10177 * Put it on the pending list until this range no 10178 * longer overlaps with another. 
10179 */ 10180 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 10181 } else { 10182 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 10183 bdev_lock_lba_range_ctx(bdev, ctx); 10184 } 10185 spdk_spin_unlock(&bdev->internal.spinlock); 10186 return 0; 10187 } 10188 10189 static int 10190 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10191 uint64_t offset, uint64_t length, 10192 lock_range_cb cb_fn, void *cb_arg) 10193 { 10194 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10195 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10196 10197 if (cb_arg == NULL) { 10198 SPDK_ERRLOG("cb_arg must not be NULL\n"); 10199 return -EINVAL; 10200 } 10201 10202 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 10203 } 10204 10205 static void 10206 bdev_lock_lba_range_ctx_msg(void *_ctx) 10207 { 10208 struct locked_lba_range_ctx *ctx = _ctx; 10209 10210 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 10211 } 10212 10213 static void 10214 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10215 { 10216 struct locked_lba_range_ctx *ctx = _ctx; 10217 struct locked_lba_range_ctx *pending_ctx; 10218 struct lba_range *range, *tmp; 10219 10220 spdk_spin_lock(&bdev->internal.spinlock); 10221 /* Check if there are any pending locked ranges that overlap with this range 10222 * that was just unlocked. If there are, check that it doesn't overlap with any 10223 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 10224 * the lock process. 10225 */ 10226 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 10227 if (bdev_lba_range_overlapped(range, &ctx->range) && 10228 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 10229 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 10230 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10231 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 10232 spdk_thread_send_msg(pending_ctx->range.owner_thread, 10233 bdev_lock_lba_range_ctx_msg, pending_ctx); 10234 } 10235 } 10236 spdk_spin_unlock(&bdev->internal.spinlock); 10237 10238 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10239 free(ctx); 10240 } 10241 10242 static void 10243 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10244 struct spdk_io_channel *_ch, void *_ctx) 10245 { 10246 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10247 struct locked_lba_range_ctx *ctx = _ctx; 10248 TAILQ_HEAD(, spdk_bdev_io) io_locked; 10249 struct spdk_bdev_io *bdev_io; 10250 struct lba_range *range; 10251 10252 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10253 if (ctx->range.offset == range->offset && 10254 ctx->range.length == range->length && 10255 ctx->range.locked_ctx == range->locked_ctx) { 10256 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 10257 free(range); 10258 break; 10259 } 10260 } 10261 10262 /* Note: we should almost always be able to assert that the range specified 10263 * was found. But there are some very rare corner cases where a new channel 10264 * gets created simultaneously with a range unlock, where this function 10265 * would execute on that new channel and wouldn't have the range. 10266 * We also use this to clean up range allocations when a later allocation 10267 * fails in the locking path. 10268 * So we can't actually assert() here. 
*/ 10270 10271 /* Swap the locked I/O into a temporary list, and then try to submit them again. 10272 * We could hyper-optimize this to only resubmit locked I/O that overlap 10273 * with the range that was just unlocked, but this isn't a performance path so 10274 * we go for simplicity here. 10275 */ 10276 TAILQ_INIT(&io_locked); 10277 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 10278 while (!TAILQ_EMPTY(&io_locked)) { 10279 bdev_io = TAILQ_FIRST(&io_locked); 10280 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 10281 bdev_io_submit(bdev_io); 10282 } 10283 10284 spdk_bdev_for_each_channel_continue(i, 0); 10285 } 10286 10287 static int 10288 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 10289 lock_range_cb cb_fn, void *cb_arg) 10290 { 10291 struct locked_lba_range_ctx *ctx; 10292 struct lba_range *range; 10293 10294 spdk_spin_lock(&bdev->internal.spinlock); 10295 /* To start the unlock process, we find the range in the bdev's locked_ranges 10296 * and remove it. This ensures new channels don't inherit the locked range. 10297 * Then we will send a message to each channel to remove the range from its 10298 * per-channel list. 10299 */ 10300 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 10301 if (range->offset == offset && range->length == length && 10302 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 10303 break; 10304 } 10305 } 10306 if (range == NULL) { 10307 assert(false); 10308 spdk_spin_unlock(&bdev->internal.spinlock); 10309 return -EINVAL; 10310 } 10311 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 10312 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10313 spdk_spin_unlock(&bdev->internal.spinlock); 10314 10315 ctx->cb_fn = cb_fn; 10316 ctx->cb_arg = cb_arg; 10317 10318 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10319 bdev_unlock_lba_range_cb); 10320 return 0; 10321 } 10322 10323 static int 10324 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10325 uint64_t offset, uint64_t length, 10326 lock_range_cb cb_fn, void *cb_arg) 10327 { 10328 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10329 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10330 struct lba_range *range; 10331 bool range_found = false; 10332 10333 /* Let's make sure the specified channel actually has a lock on 10334 * the specified range. Note that the range must match exactly.
10335 */ 10336 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10337 if (range->offset == offset && range->length == length && 10338 range->owner_ch == ch && range->locked_ctx == cb_arg) { 10339 range_found = true; 10340 break; 10341 } 10342 } 10343 10344 if (!range_found) { 10345 return -EINVAL; 10346 } 10347 10348 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 10349 } 10350 10351 struct bdev_quiesce_ctx { 10352 spdk_bdev_quiesce_cb cb_fn; 10353 void *cb_arg; 10354 }; 10355 10356 static void 10357 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 10358 { 10359 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10360 10361 if (quiesce_ctx->cb_fn != NULL) { 10362 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10363 } 10364 10365 free(quiesce_ctx); 10366 } 10367 10368 static void 10369 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 10370 { 10371 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10372 struct spdk_bdev_module *module = range->bdev->module; 10373 10374 if (status != 0) { 10375 if (quiesce_ctx->cb_fn != NULL) { 10376 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10377 } 10378 free(quiesce_ctx); 10379 return; 10380 } 10381 10382 spdk_spin_lock(&module->internal.spinlock); 10383 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 10384 spdk_spin_unlock(&module->internal.spinlock); 10385 10386 if (quiesce_ctx->cb_fn != NULL) { 10387 /* copy the context in case the range is unlocked by the callback */ 10388 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 10389 10390 quiesce_ctx->cb_fn = NULL; 10391 quiesce_ctx->cb_arg = NULL; 10392 10393 tmp.cb_fn(tmp.cb_arg, status); 10394 } 10395 /* quiesce_ctx will be freed on unquiesce */ 10396 } 10397 10398 static int 10399 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10400 uint64_t offset, uint64_t length, 10401 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 10402 bool unquiesce) 10403 { 10404 struct bdev_quiesce_ctx *quiesce_ctx; 10405 int rc; 10406 10407 if (module != bdev->module) { 10408 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 10409 return -EINVAL; 10410 } 10411 10412 if (!bdev_io_valid_blocks(bdev, offset, length)) { 10413 return -EINVAL; 10414 } 10415 10416 if (unquiesce) { 10417 struct lba_range *range; 10418 10419 /* Make sure the specified range is actually quiesced in the specified module and 10420 * then remove it from the list. Note that the range must match exactly. 
10421 */ 10422 spdk_spin_lock(&module->internal.spinlock); 10423 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10424 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10425 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10426 break; 10427 } 10428 } 10429 spdk_spin_unlock(&module->internal.spinlock); 10430 10431 if (range == NULL) { 10432 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10433 return -EINVAL; 10434 } 10435 10436 quiesce_ctx = range->locked_ctx; 10437 quiesce_ctx->cb_fn = cb_fn; 10438 quiesce_ctx->cb_arg = cb_arg; 10439 10440 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10441 } else { 10442 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10443 if (quiesce_ctx == NULL) { 10444 return -ENOMEM; 10445 } 10446 10447 quiesce_ctx->cb_fn = cb_fn; 10448 quiesce_ctx->cb_arg = cb_arg; 10449 10450 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10451 if (rc != 0) { 10452 free(quiesce_ctx); 10453 } 10454 } 10455 10456 return rc; 10457 } 10458 10459 int 10460 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10461 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10462 { 10463 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10464 } 10465 10466 int 10467 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10468 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10469 { 10470 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10471 } 10472 10473 int 10474 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10475 uint64_t offset, uint64_t length, 10476 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10477 { 10478 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10479 } 10480 10481 int 10482 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10483 uint64_t offset, uint64_t length, 10484 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10485 { 10486 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10487 } 10488 10489 int 10490 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10491 int array_size) 10492 { 10493 if (!bdev) { 10494 return -EINVAL; 10495 } 10496 10497 if (bdev->fn_table->get_memory_domains) { 10498 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10499 } 10500 10501 return 0; 10502 } 10503 10504 struct spdk_bdev_for_each_io_ctx { 10505 void *ctx; 10506 spdk_bdev_io_fn fn; 10507 spdk_bdev_for_each_io_cb cb; 10508 }; 10509 10510 static void 10511 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10512 struct spdk_io_channel *io_ch, void *_ctx) 10513 { 10514 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10515 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10516 struct spdk_bdev_io *bdev_io; 10517 int rc = 0; 10518 10519 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10520 rc = ctx->fn(ctx->ctx, bdev_io); 10521 if (rc != 0) { 10522 break; 10523 } 10524 } 10525 10526 spdk_bdev_for_each_channel_continue(i, rc); 10527 } 10528 10529 static void 10530 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10531 { 10532 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10533 10534 ctx->cb(ctx->ctx, status); 10535 10536 free(ctx); 10537 } 10538 10539 void 10540 
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10541 spdk_bdev_for_each_io_cb cb) 10542 { 10543 struct spdk_bdev_for_each_io_ctx *ctx; 10544 10545 assert(fn != NULL && cb != NULL); 10546 10547 ctx = calloc(1, sizeof(*ctx)); 10548 if (ctx == NULL) { 10549 SPDK_ERRLOG("Failed to allocate context.\n"); 10550 cb(_ctx, -ENOMEM); 10551 return; 10552 } 10553 10554 ctx->ctx = _ctx; 10555 ctx->fn = fn; 10556 ctx->cb = cb; 10557 10558 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10559 bdev_for_each_io_done); 10560 } 10561 10562 void 10563 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10564 { 10565 spdk_for_each_channel_continue(iter->i, status); 10566 } 10567 10568 static struct spdk_bdev * 10569 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10570 { 10571 void *io_device = spdk_io_channel_iter_get_io_device(i); 10572 10573 return __bdev_from_io_dev(io_device); 10574 } 10575 10576 static void 10577 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10578 { 10579 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10580 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10581 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10582 10583 iter->i = i; 10584 iter->fn(iter, bdev, ch, iter->ctx); 10585 } 10586 10587 static void 10588 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10589 { 10590 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10591 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10592 10593 iter->i = i; 10594 iter->cpl(bdev, iter->ctx, status); 10595 10596 free(iter); 10597 } 10598 10599 void 10600 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10601 void *ctx, spdk_bdev_for_each_channel_done cpl) 10602 { 10603 struct spdk_bdev_channel_iter *iter; 10604 10605 assert(bdev != NULL && fn != NULL && ctx != NULL); 10606 10607 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10608 if (iter == NULL) { 10609 SPDK_ERRLOG("Unable to allocate iterator\n"); 10610 assert(false); 10611 return; 10612 } 10613 10614 iter->fn = fn; 10615 iter->cpl = cpl; 10616 iter->ctx = ctx; 10617 10618 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10619 iter, bdev_each_channel_cpl); 10620 } 10621 10622 static void 10623 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10624 { 10625 struct spdk_bdev_io *parent_io = cb_arg; 10626 10627 spdk_bdev_free_io(bdev_io); 10628 10629 /* Check return status of write */ 10630 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10631 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10632 } 10633 10634 static void 10635 bdev_copy_do_write(void *_bdev_io) 10636 { 10637 struct spdk_bdev_io *bdev_io = _bdev_io; 10638 int rc; 10639 10640 /* Write blocks */ 10641 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10642 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10643 bdev_io->u.bdev.iovs[0].iov_base, 10644 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10645 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10646 10647 if (rc == -ENOMEM) { 10648 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10649 } else if (rc != 0) { 10650 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10651 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10652 } 10653 } 10654 10655 static void 10656 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10657 { 10658 struct spdk_bdev_io *parent_io = cb_arg; 10659 10660 spdk_bdev_free_io(bdev_io); 10661 10662 /* Check return status of read */ 10663 if (!success) { 10664 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10665 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10666 return; 10667 } 10668 10669 /* Do write */ 10670 bdev_copy_do_write(parent_io); 10671 } 10672 10673 static void 10674 bdev_copy_do_read(void *_bdev_io) 10675 { 10676 struct spdk_bdev_io *bdev_io = _bdev_io; 10677 int rc; 10678 10679 /* Read blocks */ 10680 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10681 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10682 bdev_io->u.bdev.iovs[0].iov_base, 10683 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10684 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10685 10686 if (rc == -ENOMEM) { 10687 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10688 } else if (rc != 0) { 10689 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10690 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10691 } 10692 } 10693 10694 static void 10695 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10696 { 10697 if (!success) { 10698 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10699 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10700 return; 10701 } 10702 10703 bdev_copy_do_read(bdev_io); 10704 } 10705 10706 int 10707 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10708 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10709 spdk_bdev_io_completion_cb cb, void *cb_arg) 10710 { 10711 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10712 struct spdk_bdev_io *bdev_io; 10713 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10714 10715 if (!desc->write) { 10716 return -EBADF; 10717 } 10718 10719 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10720 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10721 SPDK_DEBUGLOG(bdev, 10722 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10723 dst_offset_blocks, src_offset_blocks, num_blocks); 10724 return -EINVAL; 10725 } 10726 10727 bdev_io = bdev_channel_get_io(channel); 10728 if (!bdev_io) { 10729 return -ENOMEM; 10730 } 10731 10732 bdev_io->internal.ch = channel; 10733 bdev_io->internal.desc = desc; 10734 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10735 10736 
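/* For copy I/O, offset_blocks carries the destination offset; the source offset is stored separately in u.bdev.copy.src_offset_blocks below. */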
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10737 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10738 bdev_io->u.bdev.num_blocks = num_blocks; 10739 bdev_io->u.bdev.memory_domain = NULL; 10740 bdev_io->u.bdev.memory_domain_ctx = NULL; 10741 bdev_io->u.bdev.iovs = NULL; 10742 bdev_io->u.bdev.iovcnt = 0; 10743 bdev_io->u.bdev.md_buf = NULL; 10744 bdev_io->u.bdev.accel_sequence = NULL; 10745 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10746 10747 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 10748 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 10749 return 0; 10750 } 10751 10752 10753 /* If the copy size is large and should be split, use the generic split logic 10754 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10755 * 10756 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10757 * emulate it using regular read and write requests otherwise. 10758 */ 10759 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10760 bdev_io->internal.f.split) { 10761 bdev_io_submit(bdev_io); 10762 return 0; 10763 } 10764 10765 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10766 10767 return 0; 10768 } 10769 10770 SPDK_LOG_REGISTER_COMPONENT(bdev) 10771 10772 static void 10773 bdev_trace(void) 10774 { 10775 struct spdk_trace_tpoint_opts opts[] = { 10776 { 10777 "BDEV_IO_START", TRACE_BDEV_IO_START, 10778 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1, 10779 { 10780 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10781 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10782 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10783 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10784 } 10785 }, 10786 { 10787 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10788 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0, 10789 { 10790 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10791 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10792 } 10793 }, 10794 { 10795 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10796 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10797 { 10798 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10799 } 10800 }, 10801 { 10802 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10803 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10804 { 10805 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10806 } 10807 }, 10808 }; 10809 10810 10811 spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b'); 10812 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10813 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10814 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10815 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10816 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0); 10817 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0); 10818 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_START, OBJECT_BDEV_IO, 0); 10819 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_DONE, OBJECT_BDEV_IO, 0); 10820 } 10821 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10822
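/*
 * Illustrative usage sketch (editorial addition, not part of the bdev library):
 * how a bdev module might take a read-many-write-shared (v2) claim with the
 * spdk_bdev_module_claim_bdev_desc() API defined above. The module object
 * "g_example_module", the bdev name "Nvme0n1", the key value and the helper
 * names are hypothetical; error handling is kept to a minimum.
 *
 *	// Assumes a module registered elsewhere, e.g.:
 *	//   static struct spdk_bdev_module g_example_module = { .name = "example" };
 *	//   SPDK_BDEV_MODULE_REGISTER(example, &g_example_module)
 *
 *	static void
 *	example_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
 *	{
 *		// A real module would handle SPDK_BDEV_EVENT_REMOVE here.
 *	}
 *
 *	static int
 *	example_claim_shared(struct spdk_bdev_desc **desc)
 *	{
 *		struct spdk_bdev_claim_opts opts;
 *		int rc;
 *
 *		// Open read-only; a WRITE_SHARED claim promotes the descriptor to writable.
 *		rc = spdk_bdev_open_ext("Nvme0n1", false, example_event_cb, NULL, desc);
 *		if (rc != 0) {
 *			return rc;
 *		}
 *
 *		spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *		opts.shared_claim_key = 0x1234;	// must be non-zero for WRITE_SHARED claims
 *
 *		rc = spdk_bdev_module_claim_bdev_desc(*desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED,
 *						      &opts, &g_example_module);
 *		if (rc != 0) {
 *			spdk_bdev_close(*desc);
 *			*desc = NULL;
 *		}
 *		return rc;
 *	}
 */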