/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2016 Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *   Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE	(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE	256
#define SPDK_BDEV_AUTO_EXAMINE	true
#define BUF_SMALL_CACHE_SIZE	128
#define BUF_LARGE_CACHE_SIZE	16
#define NOMEM_THRESHOLD_COUNT	8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC	(UINT64_MAX / (1024 * 1024))
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of child requests that an UNMAP or WRITE ZEROES command
 * is split into at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD	1000000
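/* Illustrative arithmetic (editor's note, not from the original sources): with the
 * defaults above, the lowest accepted rate limits are 1000 IO/s or 1 MiB/s; spread over
 * the 1000 usec timeslice that is roughly 1 IO or ~1 KiB per slice, which stays at or
 * above the per-timeslice minimums of 1 IO and 512 bytes.
 */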

/* The maximum number of child requests that a COPY command
 * is split into at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	struct spdk_bdev *bdev;
	uint64_t offset;
	uint64_t length;
	bool quiesce;
	void *locked_ctx;
	struct spdk_thread *owner_thread;
	struct spdk_bdev_channel *owner_ch;
	TAILQ_ENTRY(lba_range) tailq;
	TAILQ_ENTRY(lba_range) tailq_module;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE,
	.iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE,
};

static spdk_bdev_init_cb g_init_cb_fn = NULL;
static void *g_init_cb_arg = NULL;

static spdk_bdev_fini_cb g_fini_cb_fn = NULL;
static void *g_fini_cb_arg = NULL;
static struct spdk_thread *g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;
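	/* Illustrative example (editor's note): if 4096 bytes remain in the current
	 * timeslice and an 8192-byte write is allowed through, remaining_this_timeslice
	 * drops to -4096 and the deficit is carried into the next timeslice, which
	 * therefore starts with 4096 fewer bytes available.
	 */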

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO.
	 * If the IO is allowed to pass, the quota will be reduced correspondingly.
	 */
	bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to rewind the quota once the IO was allowed to be sent by this
	 * limit but queued due to one of the further limits.
	 */
	void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one per rate limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here, which makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	struct spdk_poller *nomem_poller;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Accel channel */
	struct spdk_io_channel *accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t io_locked;

	/* List of I/Os with accel sequence being currently executed */
	bdev_io_tailq_t io_accel_exec;

	/* List of I/Os doing memory domain pull/push */
	bdev_io_tailq_t io_memory_domain;

	uint32_t flags;

	/* Counts number of bdev_io in the io_submitted TAILQ */
	uint16_t queue_depth;

	uint16_t trace_id;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t start_tsc;
	uint64_t interval_tsc;
	__itt_string_handle *handle;
	struct spdk_bdev_io_stat *prev_stat;
#endif

	lba_range_tailq_t locked_ranges;

	/** List of I/Os queued by QoS. */
	bdev_io_tailq_t qos_queued_io;
};

struct media_event_entry {
	struct spdk_bdev_media_event event;
	TAILQ_ENTRY(media_event_entry) tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev *bdev;
	bool write;
	bool memory_domains_supported;
	bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
	struct spdk_bdev_open_opts opts;
	struct spdk_thread *thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	} callback;
	bool closed;
	struct spdk_spinlock spinlock;
	uint32_t refs;
	TAILQ_HEAD(, media_event_entry) pending_media_events;
	TAILQ_HEAD(, media_event_entry) free_media_events;
	struct media_event_entry *media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc) link;

	uint64_t timeout_in_sec;
	spdk_bdev_io_timeout_cb cb_fn;
	void *cb_arg;
	struct spdk_poller *io_timeout_poller;
	struct spdk_bdev_module_claim *claim;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	enum spdk_bdev_reset_stat_mode reset_mode;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

struct spdk_bdev_channel_iter {
	spdk_bdev_for_each_channel_msg fn;
	spdk_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

struct spdk_bdev_io_error_stat {
	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
};

enum bdev_io_retry_state {
	BDEV_IO_RETRY_STATE_INVALID,
	BDEV_IO_RETRY_STATE_PULL,
	BDEV_IO_RETRY_STATE_PULL_MD,
	BDEV_IO_RETRY_STATE_SUBMIT,
	BDEV_IO_RETRY_STATE_PUSH,
	BDEV_IO_RETRY_STATE_PUSH_MD,
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
#define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
#define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))
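/* Editor's note (assumption, not from the original sources): the io_device handle used by
 * the bdev layer is the bdev pointer offset by one byte, presumably so it can never
 * collide with an io_device that a bdev module registers at the bdev pointer itself;
 * __bdev_from_io_dev() simply reverses the offset.
 */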

static inline void bdev_io_complete(void *ctx);
static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);

static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				      uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

static bool bdev_io_should_split(struct spdk_bdev_io *bdev_io);

#define bdev_get_ext_io_opt(opts, field, defval) \
	((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval))

static inline void
bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth++;
}

static inline void
bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth--;
}

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	/* Do not remove this statement; always update it when adding a new field,
	 * and do not forget to add the SET_FIELD statement for the new field.
	 */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}
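/*
 * Usage sketch (editor's illustration, not part of this file): callers are expected to
 * size-initialize the opts structure so that binaries built against different versions
 * of the struct keep working:
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_pool_size = 4 * 1024;
 *	spdk_bdev_set_opts(&opts);
 *
 * spdk_bdev_set_opts() below additionally requires bdev_io_pool_size to be at least
 * bdev_io_cache_size * (thread count + 1).
 */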

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization.  A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}

static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = bdev_get_by_name(bdev_name);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return bdev;
}

struct bdev_io_status_string {
	enum spdk_bdev_io_status status;
	const char *str;
};

static const struct bdev_io_status_string bdev_io_status_strings[] = {
	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
};

static const char *
bdev_io_status_get_string(enum spdk_bdev_io_status status)
{
	uint32_t i;

	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
		if (bdev_io_status_strings[i].status == status) {
			return bdev_io_status_strings[i].str;
		}
	}

	return "reserved";
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller *poller;
	spdk_bdev_wait_for_examine_cb cb_fn;
	void *cb_arg;
};

static bool bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_remove(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
			free(item->name);
			free(item);
			break;
		}
	}
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	/* Some bdevs may not support the READ command.
	 * Do not try to examine them.
	 */
	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) {
		return false;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	struct spdk_bdev_module_claim *claim, *tmpclaim;
	uint32_t action;

	if (!bdev_ok_to_examine(bdev)) {
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config) {
			spdk_spin_lock(&module->internal.spinlock);
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call "
					    "spdk_bdev_module_examine_done()\n", module->name);
			}
		}
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* Examine by all bdev modules */
		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (module->examine_disk) {
				spdk_spin_lock(&module->internal.spinlock);
				module->internal.action_in_progress++;
				spdk_spin_unlock(&module->internal.spinlock);
				spdk_spin_unlock(&bdev->internal.spinlock);
				module->examine_disk(bdev);
				spdk_spin_lock(&bdev->internal.spinlock);
			}
		}
		break;
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Examine by the one bdev module with a v1 claim */
		module = bdev->internal.claim.v1.module;
		if (module->examine_disk) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			return;
		}
		break;
	default:
		/* Examine by all bdev modules with a v2 claim */
		assert(claim_type_is_v2(bdev->internal.claim_type));
		/*
		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
		 * list, perhaps accessing freed memory.  Without protection, this could happen
		 * while the lock is dropped during the examine callback.
		 */
		bdev->internal.examine_in_progress++;

		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
			module = claim->module;

			if (module == NULL) {
				/* This is a vestigial claim, held by examine_count */
				continue;
			}

			if (module->examine_disk == NULL) {
				continue;
			}

			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);

			/* Call examine_disk without holding internal.spinlock. */
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			spdk_spin_lock(&bdev->internal.spinlock);
		}

		assert(bdev->internal.examine_in_progress > 0);
		bdev->internal.examine_in_progress--;
		if (bdev->internal.examine_in_progress == 0) {
			/* Remove any claims that were released during examine_disk */
			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
				if (claim->desc != NULL) {
					continue;
				}

				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
				free(claim);
			}
			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
				claim_reset(bdev);
			}
		}
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;
	struct spdk_thread *thread = spdk_get_thread();

	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
			    thread ? spdk_thread_get_name(thread) : "null");
		return -EINVAL;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}
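/*
 * Iteration sketch (editor's illustration): the "leaf" iterators above skip bdevs that
 * already carry a claim, so a consumer that only wants unclaimed bdevs can walk them as:
 *
 *	struct spdk_bdev *bdev;
 *
 *	for (bdev = spdk_bdev_first_leaf(); bdev != NULL; bdev = spdk_bdev_next_leaf(bdev)) {
 *		... operate on the unclaimed bdev ...
 *	}
 */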

static inline bool
bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.f.has_memory_domain;
}

static inline bool
bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.f.has_accel_sequence;
}

static inline void
bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
	 * channels we will instead wait for half to complete.
	 */
	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
}

static inline void
bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static inline bool
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io_use_accel_sequence(bdev_io)) {
		return false;
	}

	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
	 * bdev module didn't support accel sequences */
	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split;
}

static inline void
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
}

static inline void
bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static void
bdev_io_submit_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;

	assert(bdev_io_use_accel_sequence(bdev_io));

	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io->internal.f.has_accel_sequence = false;

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	bdev_io_submit(bdev_io);
}

static void
bdev_io_exec_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
	assert(bdev_io_use_accel_sequence(bdev_io));

	/* Since the operations are appended during submission, they're in the opposite order than
	 * how we want to execute them for reads (i.e. we need to execute the most recently added
	 * operation first), so reverse the sequence before executing it.
	 */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
	}

	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_increment_outstanding(ch, ch->shared_resource);
	bdev_io->internal.data_transfer_cpl = cb_fn;

	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
				   bdev_io_exec_sequence_cb, bdev_io);
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf.ptr;
		bdev_io->internal.buf.ptr = NULL;
		bdev_io->internal.f.has_buf = false;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_pull_buffer_cpl(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	bdev_io_get_buf_complete(bdev_io, !rc);
}

static void
bdev_io_pull_md_buf_done(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		assert(bdev_io->internal.f.has_bounce_buf);
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  &bdev_io->internal.bounce_buf.orig_md_iov, 1,
							  &bdev_io->internal.bounce_buf.md_iov, 1,
							  bdev_io_pull_md_buf_done, bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain), rc);
			}
		} else {
			memcpy(bdev_io->internal.bounce_buf.md_iov.iov_base,
			       bdev_io->internal.bounce_buf.orig_md_iov.iov_base,
			       bdev_io->internal.bounce_buf.orig_md_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}

static void
_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert(bdev_io->internal.f.has_bounce_buf);

	/* save original md_buf */
	bdev_io->internal.bounce_buf.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.bounce_buf.orig_md_iov.iov_len = len;
	bdev_io->internal.bounce_buf.md_iov.iov_base = md_buf;
	bdev_io->internal.bounce_buf.md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	bdev_io_pull_md_buf(bdev_io);
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		assert(!bdev_io_use_accel_sequence(bdev_io));

		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static inline void
bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	if (rc) {
		SPDK_ERRLOG("Failed to get data buffer\n");
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	_bdev_io_set_md_buf(bdev_io);
}

static void
bdev_io_pull_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_pull_data_done(bdev_io, status);
}

static void
bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	/* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
	 * sequence, append a copy operation making accel change the src/dst buffers of the previous
	 * operation */
	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) ||
	    (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			assert(bdev_io_use_accel_sequence(bdev_io));
			assert(bdev_io->internal.f.has_bounce_buf);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL,
						    bdev_io->internal.bounce_buf.orig_iovs,
						    bdev_io->internal.bounce_buf.orig_iovcnt,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
						    NULL, NULL);
		} else {
			/* We need to reverse the src/dst for reads */
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			assert(bdev_io_use_accel_sequence(bdev_io));
			assert(bdev_io->internal.f.has_bounce_buf);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->internal.bounce_buf.orig_iovs,
						    bdev_io->internal.bounce_buf.orig_iovcnt,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL, NULL, NULL);
		}

		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		/* if this is write path, copy data from original buffer to bounce buffer */
		if (bdev_io_use_memory_domain(bdev_io)) {
			assert(bdev_io->internal.f.has_bounce_buf);
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.bounce_buf.orig_iovs,
							  (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  bdev_io_pull_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			assert(bdev_io->u.bdev.iovcnt == 1);
			assert(bdev_io->internal.f.has_bounce_buf);
			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
					      bdev_io->u.bdev.iovs[0].iov_len,
					      bdev_io->internal.bounce_buf.orig_iovs,
					      bdev_io->internal.bounce_buf.orig_iovcnt);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data_done(bdev_io, rc);
	}
}

static void
_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
			      bdev_copy_bounce_buffer_cpl cpl_cb)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;

	assert(bdev_io->internal.f.has_bounce_buf == false);

	bdev_io->internal.data_transfer_cpl = cpl_cb;
	bdev_io->internal.f.has_bounce_buf = true;
	/* save original iovec */
	bdev_io->internal.bounce_buf.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.bounce_buf.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* zero the other data members */
	bdev_io->internal.bounce_buf.iov.iov_base = NULL;
	bdev_io->internal.bounce_buf.md_iov.iov_base = NULL;
	bdev_io->internal.bounce_buf.orig_md_iov.iov_base = NULL;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_buf.iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;
	/* Now we use 1 iov, so the split decision may have changed */
	bdev_io->internal.f.split = bdev_io_should_split(bdev_io);

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data(bdev_io);
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf.ptr = buf;
	bdev_io->internal.f.has_buf = true;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
		/* Continue in completion callback */
		return;
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}

static inline uint64_t
bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	/* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */
	alignment = spdk_bdev_get_buf_align(bdev) - 1;

	return len + alignment + md_len;
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.f.has_buf);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr, bdev_io->internal.buf.len);
	bdev_io->internal.buf.ptr = NULL;
	bdev_io->internal.f.has_buf = false;
}

SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_put_aux_buf,
			      "spdk_bdev_io_put_aux_buf is deprecated", "v25.01", 0);

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	SPDK_LOG_DEPRECATED(spdk_bdev_io_put_aux_buf);

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
		    struct spdk_bdev_io *bdev_io)
{
	/* After a request is submitted to a bdev module, the ownership of an accel sequence
	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
	 * sequence pointer to make sure we won't touch it anymore.
	 */
	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
		bdev_io->internal.f.has_accel_sequence = false;
	}

	bdev->fn_table->submit_request(ioch, bdev_io);
}

static inline void
bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource);
	bdev_io->internal.error.nvme.cdw0 = 0;
	bdev_io->num_retries++;
	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
}

static void
bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource)
{
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller.  Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);

		switch (bdev_io->internal.retry_state) {
		case BDEV_IO_RETRY_STATE_SUBMIT:
			bdev_ch_resubmit_io(shared_resource, bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL:
			bdev_io_pull_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL_MD:
			bdev_io_pull_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH:
			bdev_io_push_bounce_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH_MD:
			bdev_io_push_bounce_md_buf(bdev_io);
			break;
		default:
			assert(0 && "invalid retry state");
			break;
		}

		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore.  Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}
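/*
 * Worked example (editor's note): if 100 I/Os were outstanding when a NOMEM status was
 * received, bdev_queue_nomem_io_head() sets nomem_threshold to
 * max(100 / 2, 100 - NOMEM_THRESHOLD_COUNT) = 92, so retries start once 8 of them have
 * completed.  At a queue depth of 10 the threshold is max(5, 2) = 5, i.e. half of the
 * outstanding I/O must complete first.
 */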

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
}

static int
bdev_no_mem_poller(void *ctx)
{
	struct spdk_bdev_shared_resource *shared_resource = ctx;

	spdk_poller_unregister(&shared_resource->nomem_poller);

	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_shared_ch_retry_io(shared_resource);
	}
	/* the retry cb may re-register the poller so double check */
	if (!TAILQ_EMPTY(&shared_resource->nomem_io) &&
	    shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) {
		/* No IOs were submitted, try again */
		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
						SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
	}

	return SPDK_POLLER_BUSY;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);

		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
			/* Special case: we have nomem IOs but no outstanding IOs whose completions
			 * could trigger a retry of the queued IOs.  Any newly submitted IO may
			 * trigger the retry; this poller covers the case where no new IOs are
			 * submitted at all, e.g. qd==1.
			 */
			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
							SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
		}
		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted. */
		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
			assert(!bdev_io_use_accel_sequence(bdev_io));
			bdev_io->internal.f.has_accel_sequence = true;
			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
		}

		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	/* Continue with IO completion flow */
	bdev_io_complete(bdev_io);
}

static void
bdev_io_push_bounce_md_buf_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);
	bdev_io->internal.f.has_bounce_buf = false;

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
}

static inline void
bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	assert(bdev_io->internal.f.has_bounce_buf);

	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.bounce_buf.orig_md_iov.iov_base != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
			if (bdev_io_use_memory_domain(bdev_io)) {
				TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_increment_outstanding(ch, ch->shared_resource);
				/* If memory domain is used then we need to call async push function */
				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
								  bdev_io->internal.memory_domain_ctx,
								  &bdev_io->internal.bounce_buf.orig_md_iov,
								  (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
								  &bdev_io->internal.bounce_buf.md_iov, 1,
								  bdev_io_push_bounce_md_buf_done,
								  bdev_io);
				if (rc == 0) {
					/* Continue IO completion in async callback */
					return;
				}
				TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_decrement_outstanding(ch, ch->shared_resource);
				if (rc != -ENOMEM) {
					SPDK_ERRLOG("Failed to push md to memory domain %s\n",
						    spdk_memory_domain_get_dma_device_id(
							    bdev_io->internal.memory_domain));
				}
			} else {
				memcpy(bdev_io->internal.bounce_buf.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
				       bdev_io->internal.bounce_buf.orig_md_iov.iov_len);
			}
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.f.has_bounce_buf = false;
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}

static inline void
bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	assert(bdev_io->internal.data_transfer_cpl);
	if (rc) {
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.bounce_buf.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.bounce_buf.orig_iovs;

	/* We don't set bdev_io->internal.f.has_bounce_buf to false here because
	 * we still need to clear the md buf */

	bdev_io_push_bounce_md_buf(bdev_io);
}

static void
bdev_io_push_bounce_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_push_bounce_data_done(bdev_io, status);
}
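/*
 * Completion-path overview (editor's summary): for a read that used a bounce buffer, the
 * data is first pushed back to the caller's buffers (bdev_io_push_bounce_data), then the
 * separate metadata buffer is pushed (bdev_io_push_bounce_md_buf), and only then is the
 * data_transfer_cpl callback invoked, which releases the bounce buffer and finishes the
 * normal completion flow.
 */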
spdk_bdev_channel *ch = bdev_io->internal.ch; 1749 1750 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1751 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1752 1753 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1754 bdev_ch_retry_io(ch); 1755 } 1756 1757 bdev_io_push_bounce_data_done(bdev_io, status); 1758 } 1759 1760 static inline void 1761 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1762 { 1763 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1764 int rc = 0; 1765 1766 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1767 assert(!bdev_io_use_accel_sequence(bdev_io)); 1768 assert(bdev_io->internal.f.has_bounce_buf); 1769 1770 /* if this is read path, copy data from bounce buffer to original buffer */ 1771 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1772 if (bdev_io_use_memory_domain(bdev_io)) { 1773 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1774 bdev_io_increment_outstanding(ch, ch->shared_resource); 1775 /* If memory domain is used then we need to call async push function */ 1776 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1777 bdev_io->internal.memory_domain_ctx, 1778 bdev_io->internal.bounce_buf.orig_iovs, 1779 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1780 &bdev_io->internal.bounce_buf.iov, 1, 1781 bdev_io_push_bounce_data_done_and_track, 1782 bdev_io); 1783 if (rc == 0) { 1784 /* Continue IO completion in async callback */ 1785 return; 1786 } 1787 1788 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1789 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1790 if (rc != -ENOMEM) { 1791 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1792 spdk_memory_domain_get_dma_device_id( 1793 bdev_io->internal.memory_domain)); 1794 } 1795 } else { 1796 spdk_copy_buf_to_iovs(bdev_io->internal.bounce_buf.orig_iovs, 1797 bdev_io->internal.bounce_buf.orig_iovcnt, 1798 bdev_io->internal.bounce_buf.iov.iov_base, 1799 bdev_io->internal.bounce_buf.iov.iov_len); 1800 } 1801 } 1802 1803 if (spdk_unlikely(rc == -ENOMEM)) { 1804 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1805 } else { 1806 bdev_io_push_bounce_data_done(bdev_io, rc); 1807 } 1808 } 1809 1810 static inline void 1811 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1812 { 1813 bdev_io->internal.data_transfer_cpl = cpl_cb; 1814 bdev_io_push_bounce_data(bdev_io); 1815 } 1816 1817 static void 1818 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1819 { 1820 struct spdk_bdev_io *bdev_io; 1821 1822 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1823 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len); 1824 } 1825 1826 static void 1827 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1828 { 1829 struct spdk_bdev_mgmt_channel *mgmt_ch; 1830 uint64_t max_len; 1831 void *buf; 1832 1833 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1834 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1835 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1836 1837 if (spdk_unlikely(max_len > mgmt_ch->iobuf.cache[0].large.bufsize)) { 1838 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1839 bdev_io_get_buf_complete(bdev_io, false); 1840 return; 1841 } 1842 1843 bdev_io->internal.buf.len = len; 1844 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1845 bdev_io_get_iobuf_cb); 1846 if (buf != NULL) { 1847 
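/* spdk_iobuf_get() returned a buffer synchronously, so hand it to the I/O right away.
 * Otherwise the entry was queued and bdev_io_get_iobuf_cb() registered above is
 * expected to run later, once a buffer of this size class becomes available. */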
_bdev_io_set_buf(bdev_io, buf, len); 1848 } 1849 } 1850 1851 void 1852 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1853 { 1854 struct spdk_bdev *bdev = bdev_io->bdev; 1855 uint64_t alignment; 1856 1857 assert(cb != NULL); 1858 bdev_io->internal.get_buf_cb = cb; 1859 1860 alignment = spdk_bdev_get_buf_align(bdev); 1861 1862 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1863 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1864 /* Buffer already present and aligned */ 1865 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1866 return; 1867 } 1868 1869 bdev_io_get_buf(bdev_io, len); 1870 } 1871 1872 static void 1873 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1874 bool success) 1875 { 1876 if (!success) { 1877 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1878 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1879 bdev_io_complete_unsubmitted(bdev_io); 1880 return; 1881 } 1882 1883 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1884 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1885 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1886 return; 1887 } 1888 /* For reads we'll execute the sequence after the data is read, so, for now, only 1889 * clear out accel_sequence pointer and submit the IO */ 1890 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1891 bdev_io->u.bdev.accel_sequence = NULL; 1892 } 1893 1894 bdev_io_submit(bdev_io); 1895 } 1896 1897 static void 1898 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1899 uint64_t len) 1900 { 1901 assert(cb != NULL); 1902 bdev_io->internal.get_buf_cb = cb; 1903 1904 bdev_io_get_buf(bdev_io, len); 1905 } 1906 1907 1908 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_get_aux_buf, 1909 "spdk_bdev_io_get_aux_buf is deprecated", "v25.01", 0); 1910 1911 void 1912 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1913 { 1914 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1915 1916 SPDK_LOG_DEPRECATED(spdk_bdev_io_get_aux_buf); 1917 1918 assert(cb != NULL); 1919 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1920 bdev_io->internal.get_aux_buf_cb = cb; 1921 bdev_io_get_buf(bdev_io, len); 1922 } 1923 1924 static int 1925 bdev_module_get_max_ctx_size(void) 1926 { 1927 struct spdk_bdev_module *bdev_module; 1928 int max_bdev_module_size = 0; 1929 1930 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1931 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1932 max_bdev_module_size = bdev_module->get_ctx_size(); 1933 } 1934 } 1935 1936 return max_bdev_module_size; 1937 } 1938 1939 static void 1940 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1941 { 1942 if (!bdev->internal.histogram_enabled) { 1943 return; 1944 } 1945 1946 spdk_json_write_object_begin(w); 1947 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 1948 1949 spdk_json_write_named_object_begin(w, "params"); 1950 spdk_json_write_named_string(w, "name", bdev->name); 1951 1952 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1953 1954 if (bdev->internal.histogram_io_type) { 1955 spdk_json_write_named_string(w, "opc", 1956 spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type)); 1957 } 1958 1959 spdk_json_write_object_end(w); 1960 1961 spdk_json_write_object_end(w); 1962 } 1963 
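/* Illustrative example (not generated by this code) of the RPC object the function
 * above contributes to the subsystem configuration, assuming a hypothetical bdev
 * named "Nvme0n1" with a histogram restricted to reads:
 *
 *   { "method": "bdev_enable_histogram",
 *     "params": { "name": "Nvme0n1", "enable": true, "opc": "read" } }
 *
 * The "opc" key is emitted only when a specific I/O type was configured. */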
1964 static void 1965 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1966 { 1967 int i; 1968 struct spdk_bdev_qos *qos = bdev->internal.qos; 1969 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1970 1971 if (!qos) { 1972 return; 1973 } 1974 1975 spdk_bdev_get_qos_rate_limits(bdev, limits); 1976 1977 spdk_json_write_object_begin(w); 1978 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1979 1980 spdk_json_write_named_object_begin(w, "params"); 1981 spdk_json_write_named_string(w, "name", bdev->name); 1982 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1983 if (limits[i] > 0) { 1984 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1985 } 1986 } 1987 spdk_json_write_object_end(w); 1988 1989 spdk_json_write_object_end(w); 1990 } 1991 1992 void 1993 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1994 { 1995 struct spdk_bdev_module *bdev_module; 1996 struct spdk_bdev *bdev; 1997 1998 assert(w != NULL); 1999 2000 spdk_json_write_array_begin(w); 2001 2002 spdk_json_write_object_begin(w); 2003 spdk_json_write_named_string(w, "method", "bdev_set_options"); 2004 spdk_json_write_named_object_begin(w, "params"); 2005 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 2006 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 2007 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 2008 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 2009 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 2010 spdk_json_write_object_end(w); 2011 spdk_json_write_object_end(w); 2012 2013 bdev_examine_allowlist_config_json(w); 2014 2015 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2016 if (bdev_module->config_json) { 2017 bdev_module->config_json(w); 2018 } 2019 } 2020 2021 spdk_spin_lock(&g_bdev_mgr.spinlock); 2022 2023 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 2024 if (bdev->fn_table->write_config_json) { 2025 bdev->fn_table->write_config_json(bdev, w); 2026 } 2027 2028 bdev_qos_config_json(bdev, w); 2029 bdev_enable_histogram_config_json(bdev, w); 2030 } 2031 2032 spdk_spin_unlock(&g_bdev_mgr.spinlock); 2033 2034 /* This has to be last RPC in array to make sure all bdevs finished examine */ 2035 spdk_json_write_object_begin(w); 2036 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 2037 spdk_json_write_object_end(w); 2038 2039 spdk_json_write_array_end(w); 2040 } 2041 2042 static void 2043 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 2044 { 2045 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2046 struct spdk_bdev_io *bdev_io; 2047 2048 spdk_iobuf_channel_fini(&ch->iobuf); 2049 2050 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 2051 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2052 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2053 ch->per_thread_cache_count--; 2054 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2055 } 2056 2057 assert(ch->per_thread_cache_count == 0); 2058 } 2059 2060 static int 2061 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 2062 { 2063 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2064 struct spdk_bdev_io *bdev_io; 2065 uint32_t i; 2066 int rc; 2067 2068 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 2069 g_bdev_opts.iobuf_small_cache_size, 2070 g_bdev_opts.iobuf_large_cache_size); 2071 if (rc != 0) { 2072 
SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 2073 return -1; 2074 } 2075 2076 STAILQ_INIT(&ch->per_thread_cache); 2077 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 2078 2079 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 2080 ch->per_thread_cache_count = 0; 2081 for (i = 0; i < ch->bdev_io_cache_size; i++) { 2082 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2083 if (bdev_io == NULL) { 2084 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 2085 assert(false); 2086 bdev_mgmt_channel_destroy(io_device, ctx_buf); 2087 return -1; 2088 } 2089 ch->per_thread_cache_count++; 2090 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2091 } 2092 2093 TAILQ_INIT(&ch->shared_resources); 2094 TAILQ_INIT(&ch->io_wait_queue); 2095 2096 return 0; 2097 } 2098 2099 static void 2100 bdev_init_complete(int rc) 2101 { 2102 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2103 void *cb_arg = g_init_cb_arg; 2104 struct spdk_bdev_module *m; 2105 2106 g_bdev_mgr.init_complete = true; 2107 g_init_cb_fn = NULL; 2108 g_init_cb_arg = NULL; 2109 2110 /* 2111 * For modules that need to know when subsystem init is complete, 2112 * inform them now. 2113 */ 2114 if (rc == 0) { 2115 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2116 if (m->init_complete) { 2117 m->init_complete(); 2118 } 2119 } 2120 } 2121 2122 cb_fn(cb_arg, rc); 2123 } 2124 2125 static bool 2126 bdev_module_all_actions_completed(void) 2127 { 2128 struct spdk_bdev_module *m; 2129 2130 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2131 if (m->internal.action_in_progress > 0) { 2132 return false; 2133 } 2134 } 2135 return true; 2136 } 2137 2138 static void 2139 bdev_module_action_complete(void) 2140 { 2141 /* 2142 * Don't finish bdev subsystem initialization if 2143 * module pre-initialization is still in progress, or 2144 * the subsystem been already initialized. 2145 */ 2146 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2147 return; 2148 } 2149 2150 /* 2151 * Check all bdev modules for inits/examinations in progress. If any 2152 * exist, return immediately since we cannot finish bdev subsystem 2153 * initialization until all are completed. 2154 */ 2155 if (!bdev_module_all_actions_completed()) { 2156 return; 2157 } 2158 2159 /* 2160 * Modules already finished initialization - now that all 2161 * the bdev modules have finished their asynchronous I/O 2162 * processing, the entire bdev layer can be marked as complete. 
2163 */ 2164 bdev_init_complete(0); 2165 } 2166 2167 static void 2168 bdev_module_action_done(struct spdk_bdev_module *module) 2169 { 2170 spdk_spin_lock(&module->internal.spinlock); 2171 assert(module->internal.action_in_progress > 0); 2172 module->internal.action_in_progress--; 2173 spdk_spin_unlock(&module->internal.spinlock); 2174 bdev_module_action_complete(); 2175 } 2176 2177 void 2178 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2179 { 2180 assert(module->async_init); 2181 bdev_module_action_done(module); 2182 } 2183 2184 void 2185 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2186 { 2187 bdev_module_action_done(module); 2188 } 2189 2190 /** The last initialized bdev module */ 2191 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2192 2193 static void 2194 bdev_init_failed(void *cb_arg) 2195 { 2196 struct spdk_bdev_module *module = cb_arg; 2197 2198 spdk_spin_lock(&module->internal.spinlock); 2199 assert(module->internal.action_in_progress > 0); 2200 module->internal.action_in_progress--; 2201 spdk_spin_unlock(&module->internal.spinlock); 2202 bdev_init_complete(-1); 2203 } 2204 2205 static int 2206 bdev_modules_init(void) 2207 { 2208 struct spdk_bdev_module *module; 2209 int rc = 0; 2210 2211 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2212 g_resume_bdev_module = module; 2213 if (module->async_init) { 2214 spdk_spin_lock(&module->internal.spinlock); 2215 module->internal.action_in_progress = 1; 2216 spdk_spin_unlock(&module->internal.spinlock); 2217 } 2218 rc = module->module_init(); 2219 if (rc != 0) { 2220 /* Bump action_in_progress to prevent other modules from completion of modules_init 2221 * Send message to defer application shutdown until resources are cleaned up */ 2222 spdk_spin_lock(&module->internal.spinlock); 2223 module->internal.action_in_progress = 1; 2224 spdk_spin_unlock(&module->internal.spinlock); 2225 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2226 return rc; 2227 } 2228 } 2229 2230 g_resume_bdev_module = NULL; 2231 return 0; 2232 } 2233 2234 void 2235 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2236 { 2237 int rc = 0; 2238 char mempool_name[32]; 2239 2240 assert(cb_fn != NULL); 2241 2242 g_init_cb_fn = cb_fn; 2243 g_init_cb_arg = cb_arg; 2244 2245 spdk_notify_type_register("bdev_register"); 2246 spdk_notify_type_register("bdev_unregister"); 2247 2248 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2249 2250 rc = spdk_iobuf_register_module("bdev"); 2251 if (rc != 0) { 2252 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2253 bdev_init_complete(-1); 2254 return; 2255 } 2256 2257 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2258 g_bdev_opts.bdev_io_pool_size, 2259 sizeof(struct spdk_bdev_io) + 2260 bdev_module_get_max_ctx_size(), 2261 0, 2262 SPDK_ENV_NUMA_ID_ANY); 2263 2264 if (g_bdev_mgr.bdev_io_pool == NULL) { 2265 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2266 bdev_init_complete(-1); 2267 return; 2268 } 2269 2270 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2271 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2272 if (!g_bdev_mgr.zero_buffer) { 2273 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2274 bdev_init_complete(-1); 2275 return; 2276 } 2277 2278 #ifdef SPDK_CONFIG_VTUNE 2279 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2280 #endif 2281 2282 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2283 
bdev_mgmt_channel_destroy, 2284 sizeof(struct spdk_bdev_mgmt_channel), 2285 "bdev_mgr"); 2286 2287 rc = bdev_modules_init(); 2288 g_bdev_mgr.module_init_complete = true; 2289 if (rc != 0) { 2290 SPDK_ERRLOG("bdev modules init failed\n"); 2291 return; 2292 } 2293 2294 bdev_module_action_complete(); 2295 } 2296 2297 static void 2298 bdev_mgr_unregister_cb(void *io_device) 2299 { 2300 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2301 2302 if (g_bdev_mgr.bdev_io_pool) { 2303 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2304 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2305 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2306 g_bdev_opts.bdev_io_pool_size); 2307 } 2308 2309 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2310 } 2311 2312 spdk_free(g_bdev_mgr.zero_buffer); 2313 2314 bdev_examine_allowlist_free(); 2315 2316 cb_fn(g_fini_cb_arg); 2317 g_fini_cb_fn = NULL; 2318 g_fini_cb_arg = NULL; 2319 g_bdev_mgr.init_complete = false; 2320 g_bdev_mgr.module_init_complete = false; 2321 } 2322 2323 static void 2324 bdev_module_fini_iter(void *arg) 2325 { 2326 struct spdk_bdev_module *bdev_module; 2327 2328 /* FIXME: Handling initialization failures is broken now, 2329 * so we won't even try cleaning up after successfully 2330 * initialized modules. if module_init_complete is false, 2331 * just call spdk_bdev_mgr_unregister_cb 2332 */ 2333 if (!g_bdev_mgr.module_init_complete) { 2334 bdev_mgr_unregister_cb(NULL); 2335 return; 2336 } 2337 2338 /* Start iterating from the last touched module */ 2339 if (!g_resume_bdev_module) { 2340 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2341 } else { 2342 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2343 internal.tailq); 2344 } 2345 2346 while (bdev_module) { 2347 if (bdev_module->async_fini) { 2348 /* Save our place so we can resume later. We must 2349 * save the variable here, before calling module_fini() 2350 * below, because in some cases the module may immediately 2351 * call spdk_bdev_module_fini_done() and re-enter 2352 * this function to continue iterating. */ 2353 g_resume_bdev_module = bdev_module; 2354 } 2355 2356 if (bdev_module->module_fini) { 2357 bdev_module->module_fini(); 2358 } 2359 2360 if (bdev_module->async_fini) { 2361 return; 2362 } 2363 2364 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2365 internal.tailq); 2366 } 2367 2368 g_resume_bdev_module = NULL; 2369 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2370 } 2371 2372 void 2373 spdk_bdev_module_fini_done(void) 2374 { 2375 if (spdk_get_thread() != g_fini_thread) { 2376 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2377 } else { 2378 bdev_module_fini_iter(NULL); 2379 } 2380 } 2381 2382 static void 2383 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2384 { 2385 struct spdk_bdev *bdev = cb_arg; 2386 2387 if (bdeverrno && bdev) { 2388 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2389 bdev->name); 2390 2391 /* 2392 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2393 * bdev; try to continue by manually removing this bdev from the list and continue 2394 * with the next bdev in the list. 
2395 */ 2396 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2397 } 2398 2399 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2400 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2401 /* 2402 * Bdev module finish need to be deferred as we might be in the middle of some context 2403 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2404 * after returning. 2405 */ 2406 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2407 return; 2408 } 2409 2410 /* 2411 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2412 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2413 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2414 * base bdevs. 2415 * 2416 * Also, walk the list in the reverse order. 2417 */ 2418 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2419 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2420 spdk_spin_lock(&bdev->internal.spinlock); 2421 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2422 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2423 spdk_spin_unlock(&bdev->internal.spinlock); 2424 continue; 2425 } 2426 spdk_spin_unlock(&bdev->internal.spinlock); 2427 2428 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2429 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2430 return; 2431 } 2432 2433 /* 2434 * If any bdev fails to unclaim underlying bdev properly, we may face the 2435 * case of bdev list consisting of claimed bdevs only (if claims are managed 2436 * correctly, this would mean there's a loop in the claims graph which is 2437 * clearly impossible). Warn and unregister last bdev on the list then. 2438 */ 2439 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2440 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2441 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2442 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2443 return; 2444 } 2445 } 2446 2447 static void 2448 bdev_module_fini_start_iter(void *arg) 2449 { 2450 struct spdk_bdev_module *bdev_module; 2451 2452 if (!g_resume_bdev_module) { 2453 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2454 } else { 2455 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2456 } 2457 2458 while (bdev_module) { 2459 if (bdev_module->async_fini_start) { 2460 /* Save our place so we can resume later. We must 2461 * save the variable here, before calling fini_start() 2462 * below, because in some cases the module may immediately 2463 * call spdk_bdev_module_fini_start_done() and re-enter 2464 * this function to continue iterating. 
*/ 2465 g_resume_bdev_module = bdev_module; 2466 } 2467 2468 if (bdev_module->fini_start) { 2469 bdev_module->fini_start(); 2470 } 2471 2472 if (bdev_module->async_fini_start) { 2473 return; 2474 } 2475 2476 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2477 } 2478 2479 g_resume_bdev_module = NULL; 2480 2481 bdev_finish_unregister_bdevs_iter(NULL, 0); 2482 } 2483 2484 void 2485 spdk_bdev_module_fini_start_done(void) 2486 { 2487 if (spdk_get_thread() != g_fini_thread) { 2488 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2489 } else { 2490 bdev_module_fini_start_iter(NULL); 2491 } 2492 } 2493 2494 static void 2495 bdev_finish_wait_for_examine_done(void *cb_arg) 2496 { 2497 bdev_module_fini_start_iter(NULL); 2498 } 2499 2500 static void bdev_open_async_fini(void); 2501 2502 void 2503 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2504 { 2505 int rc; 2506 2507 assert(cb_fn != NULL); 2508 2509 g_fini_thread = spdk_get_thread(); 2510 2511 g_fini_cb_fn = cb_fn; 2512 g_fini_cb_arg = cb_arg; 2513 2514 bdev_open_async_fini(); 2515 2516 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2517 if (rc != 0) { 2518 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2519 bdev_finish_wait_for_examine_done(NULL); 2520 } 2521 } 2522 2523 struct spdk_bdev_io * 2524 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2525 { 2526 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2527 struct spdk_bdev_io *bdev_io; 2528 2529 if (ch->per_thread_cache_count > 0) { 2530 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2531 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2532 ch->per_thread_cache_count--; 2533 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2534 /* 2535 * Don't try to look for bdev_ios in the global pool if there are 2536 * waiters on bdev_ios - we don't want this caller to jump the line. 2537 */ 2538 bdev_io = NULL; 2539 } else { 2540 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2541 } 2542 2543 return bdev_io; 2544 } 2545 2546 void 2547 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2548 { 2549 struct spdk_bdev_mgmt_channel *ch; 2550 2551 assert(bdev_io != NULL); 2552 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2553 2554 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2555 2556 if (bdev_io->internal.f.has_buf) { 2557 bdev_io_put_buf(bdev_io); 2558 } 2559 2560 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2561 ch->per_thread_cache_count++; 2562 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2563 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2564 struct spdk_bdev_io_wait_entry *entry; 2565 2566 entry = TAILQ_FIRST(&ch->io_wait_queue); 2567 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2568 entry->cb_fn(entry->cb_arg); 2569 } 2570 } else { 2571 /* We should never have a full cache with entries on the io wait queue. 
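 * Waiters are drained in the branch above every time an entry goes back into a
 * cache that is below capacity, so the wait queue should always empty out before
 * the cache can actually become full.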
*/ 2572 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2573 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2574 } 2575 } 2576 2577 static bool 2578 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2579 { 2580 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2581 2582 switch (limit) { 2583 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2584 return true; 2585 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2586 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2587 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2588 return false; 2589 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2590 default: 2591 return false; 2592 } 2593 } 2594 2595 static bool 2596 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2597 { 2598 switch (bdev_io->type) { 2599 case SPDK_BDEV_IO_TYPE_NVME_IO: 2600 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2601 case SPDK_BDEV_IO_TYPE_READ: 2602 case SPDK_BDEV_IO_TYPE_WRITE: 2603 return true; 2604 case SPDK_BDEV_IO_TYPE_ZCOPY: 2605 if (bdev_io->u.bdev.zcopy.start) { 2606 return true; 2607 } else { 2608 return false; 2609 } 2610 default: 2611 return false; 2612 } 2613 } 2614 2615 static bool 2616 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2617 { 2618 switch (bdev_io->type) { 2619 case SPDK_BDEV_IO_TYPE_NVME_IO: 2620 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2621 /* Bit 1 (0x2) set for read operation */ 2622 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2623 return true; 2624 } else { 2625 return false; 2626 } 2627 case SPDK_BDEV_IO_TYPE_READ: 2628 return true; 2629 case SPDK_BDEV_IO_TYPE_ZCOPY: 2630 /* Populate to read from disk */ 2631 if (bdev_io->u.bdev.zcopy.populate) { 2632 return true; 2633 } else { 2634 return false; 2635 } 2636 default: 2637 return false; 2638 } 2639 } 2640 2641 static uint64_t 2642 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2643 { 2644 struct spdk_bdev *bdev = bdev_io->bdev; 2645 2646 switch (bdev_io->type) { 2647 case SPDK_BDEV_IO_TYPE_NVME_IO: 2648 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2649 return bdev_io->u.nvme_passthru.nbytes; 2650 case SPDK_BDEV_IO_TYPE_READ: 2651 case SPDK_BDEV_IO_TYPE_WRITE: 2652 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2653 case SPDK_BDEV_IO_TYPE_ZCOPY: 2654 /* Track the data in the start phase only */ 2655 if (bdev_io->u.bdev.zcopy.start) { 2656 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2657 } else { 2658 return 0; 2659 } 2660 default: 2661 return 0; 2662 } 2663 } 2664 2665 static inline bool 2666 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2667 { 2668 int64_t remaining_this_timeslice; 2669 2670 if (!limit->max_per_timeslice) { 2671 /* The QoS is disabled */ 2672 return false; 2673 } 2674 2675 remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta, 2676 __ATOMIC_RELAXED); 2677 if (remaining_this_timeslice + (int64_t)delta > 0) { 2678 /* There was still a quota for this delta -> the IO shouldn't be queued 2679 * 2680 * We allow a slight quota overrun here so an IO bigger than the per-timeslice 2681 * quota can be allowed once a while. Such overrun then taken into account in 2682 * the QoS poller, where the next timeslice quota is calculated. 2683 */ 2684 return false; 2685 } 2686 2687 /* There was no quota for this delta -> the IO should be queued 2688 * The remaining_this_timeslice must be rewinded so it reflects the real 2689 * amount of IOs or bytes allowed. 
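 * (The atomic add below simply gives back the delta that the atomic subtract above
 * already charged against this timeslice.)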
2690 */ 2691 __atomic_add_fetch( 2692 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2693 return true; 2694 } 2695 2696 static inline void 2697 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2698 { 2699 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2700 } 2701 2702 static bool 2703 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2704 { 2705 return bdev_qos_rw_queue_io(limit, io, 1); 2706 } 2707 2708 static void 2709 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2710 { 2711 bdev_qos_rw_rewind_io(limit, io, 1); 2712 } 2713 2714 static bool 2715 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2716 { 2717 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2718 } 2719 2720 static void 2721 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2722 { 2723 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2724 } 2725 2726 static bool 2727 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2728 { 2729 if (bdev_is_read_io(io) == false) { 2730 return false; 2731 } 2732 2733 return bdev_qos_rw_bps_queue(limit, io); 2734 } 2735 2736 static void 2737 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2738 { 2739 if (bdev_is_read_io(io) != false) { 2740 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2741 } 2742 } 2743 2744 static bool 2745 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2746 { 2747 if (bdev_is_read_io(io) == true) { 2748 return false; 2749 } 2750 2751 return bdev_qos_rw_bps_queue(limit, io); 2752 } 2753 2754 static void 2755 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2756 { 2757 if (bdev_is_read_io(io) != true) { 2758 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2759 } 2760 } 2761 2762 static void 2763 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2764 { 2765 int i; 2766 2767 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2768 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2769 qos->rate_limits[i].queue_io = NULL; 2770 continue; 2771 } 2772 2773 switch (i) { 2774 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2775 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2776 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2777 break; 2778 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2779 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2780 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2781 break; 2782 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2783 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2784 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2785 break; 2786 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2787 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2788 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2789 break; 2790 default: 2791 break; 2792 } 2793 } 2794 } 2795 2796 static void 2797 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2798 struct spdk_bdev_io *bdev_io, 2799 enum spdk_bdev_io_status status) 2800 { 2801 bdev_io->internal.f.in_submit_request = true; 2802 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2803 spdk_bdev_io_complete(bdev_io, status); 2804 bdev_io->internal.f.in_submit_request = false; 
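/* Note: the outstanding counter is incremented before completing because the regular
 * completion path decrements it, which keeps the count balanced for an I/O that is
 * completed without ever reaching the module. in_submit_request marks that this
 * completion originates from the submit path, presumably so the completion code can
 * defer work that must not run while the submit call is still on the stack. */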
2805 } 2806 2807 static inline void 2808 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2809 { 2810 struct spdk_bdev *bdev = bdev_io->bdev; 2811 struct spdk_io_channel *ch = bdev_ch->channel; 2812 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2813 2814 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2815 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2816 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2817 2818 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2819 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2820 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2821 SPDK_BDEV_IO_STATUS_SUCCESS); 2822 return; 2823 } 2824 } 2825 2826 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2827 bdev_io->bdev->split_on_write_unit && 2828 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2829 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2830 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2831 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2832 return; 2833 } 2834 2835 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2836 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2837 bdev_io->internal.f.in_submit_request = true; 2838 bdev_submit_request(bdev, ch, bdev_io); 2839 bdev_io->internal.f.in_submit_request = false; 2840 } else { 2841 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2842 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2843 /* Special case when we have nomem IOs and no outstanding IOs which completions 2844 * could trigger retry of queued IOs */ 2845 bdev_shared_ch_retry_io(shared_resource); 2846 } 2847 } 2848 } 2849 2850 static bool 2851 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2852 { 2853 int i; 2854 2855 if (bdev_qos_io_to_limit(bdev_io) == true) { 2856 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2857 if (!qos->rate_limits[i].queue_io) { 2858 continue; 2859 } 2860 2861 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2862 bdev_io) == true) { 2863 for (i -= 1; i >= 0 ; i--) { 2864 if (!qos->rate_limits[i].queue_io) { 2865 continue; 2866 } 2867 2868 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2869 } 2870 return true; 2871 } 2872 } 2873 } 2874 2875 return false; 2876 } 2877 2878 static int 2879 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2880 { 2881 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2882 int submitted_ios = 0; 2883 2884 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2885 if (!bdev_qos_queue_io(qos, bdev_io)) { 2886 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 2887 bdev_io_do_submit(ch, bdev_io); 2888 2889 submitted_ios++; 2890 } 2891 } 2892 2893 return submitted_ios; 2894 } 2895 2896 static void 2897 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2898 { 2899 int rc; 2900 2901 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2902 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2903 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2904 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2905 &bdev_io->internal.waitq_entry); 2906 if (rc != 0) { 2907 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2908 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 2909 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2910 } 2911 } 2912 2913 static bool 2914 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2915 { 2916 uint32_t io_boundary; 2917 struct spdk_bdev *bdev = bdev_io->bdev; 2918 uint32_t max_segment_size = bdev->max_segment_size; 2919 uint32_t max_size = bdev->max_rw_size; 2920 int max_segs = bdev->max_num_segments; 2921 2922 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2923 io_boundary = bdev->write_unit_size; 2924 } else if (bdev->split_on_optimal_io_boundary) { 2925 io_boundary = bdev->optimal_io_boundary; 2926 } else { 2927 io_boundary = 0; 2928 } 2929 2930 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 2931 return false; 2932 } 2933 2934 if (io_boundary) { 2935 uint64_t start_stripe, end_stripe; 2936 2937 start_stripe = bdev_io->u.bdev.offset_blocks; 2938 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2939 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2940 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2941 start_stripe >>= spdk_u32log2(io_boundary); 2942 end_stripe >>= spdk_u32log2(io_boundary); 2943 } else { 2944 start_stripe /= io_boundary; 2945 end_stripe /= io_boundary; 2946 } 2947 2948 if (start_stripe != end_stripe) { 2949 return true; 2950 } 2951 } 2952 2953 if (max_segs) { 2954 if (bdev_io->u.bdev.iovcnt > max_segs) { 2955 return true; 2956 } 2957 } 2958 2959 if (max_segment_size) { 2960 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2961 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 2962 return true; 2963 } 2964 } 2965 } 2966 2967 if (max_size) { 2968 if (bdev_io->u.bdev.num_blocks > max_size) { 2969 return true; 2970 } 2971 } 2972 2973 return false; 2974 } 2975 2976 static bool 2977 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2978 { 2979 uint32_t num_unmap_segments; 2980 2981 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2982 return false; 2983 } 2984 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2985 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2986 return true; 2987 } 2988 2989 return false; 2990 } 2991 2992 static bool 2993 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2994 { 2995 if (!bdev_io->bdev->max_write_zeroes) { 2996 return false; 2997 } 2998 2999 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 3000 return true; 3001 } 3002 3003 return false; 3004 } 3005 3006 static bool 3007 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 3008 { 3009 if (bdev_io->bdev->max_copy != 0 && 3010 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 3011 return true; 3012 } 3013 3014 return false; 3015 } 3016 3017 static bool 3018 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 3019 { 3020 switch (bdev_io->type) { 3021 case SPDK_BDEV_IO_TYPE_READ: 3022 case SPDK_BDEV_IO_TYPE_WRITE: 3023 return bdev_rw_should_split(bdev_io); 3024 case SPDK_BDEV_IO_TYPE_UNMAP: 3025 return bdev_unmap_should_split(bdev_io); 3026 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3027 return bdev_write_zeroes_should_split(bdev_io); 3028 case SPDK_BDEV_IO_TYPE_COPY: 3029 return bdev_copy_should_split(bdev_io); 3030 default: 3031 return false; 3032 } 3033 } 3034 3035 static uint32_t 3036 _to_next_boundary(uint64_t offset, uint32_t boundary) 3037 { 3038 return (boundary - (offset % boundary)); 3039 } 3040 3041 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 3042 3043 static void _bdev_rw_split(void *_bdev_io); 3044 3045 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 3046 3047 static void 3048 _bdev_unmap_split(void *_bdev_io) 3049 { 3050 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 3051 } 3052 3053 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 3054 3055 static void 3056 _bdev_write_zeroes_split(void *_bdev_io) 3057 { 3058 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 3059 } 3060 3061 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 3062 3063 static void 3064 _bdev_copy_split(void *_bdev_io) 3065 { 3066 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 3067 } 3068 3069 static int 3070 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 3071 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 3072 { 3073 int rc; 3074 uint64_t current_offset, current_remaining, current_src_offset; 3075 spdk_bdev_io_wait_cb io_wait_fn; 3076 3077 current_offset = *offset; 3078 current_remaining = *remaining; 3079 3080 assert(bdev_io->internal.f.split); 3081 3082 bdev_io->internal.split.outstanding++; 3083 3084 io_wait_fn = _bdev_rw_split; 3085 switch (bdev_io->type) { 3086 case SPDK_BDEV_IO_TYPE_READ: 3087 assert(bdev_io->u.bdev.accel_sequence == NULL); 3088 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 3089 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3090 iov, iovcnt, md_buf, current_offset, 3091 num_blocks, 3092 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3093 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 3094 NULL, 3095 bdev_io->u.bdev.dif_check_flags, 3096 bdev_io_split_done, bdev_io); 3097 break; 3098 case SPDK_BDEV_IO_TYPE_WRITE: 3099 assert(bdev_io->u.bdev.accel_sequence == NULL); 3100 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3101 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3102 iov, iovcnt, md_buf, current_offset, 3103 num_blocks, 3104 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3105 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 3106 NULL, 3107 bdev_io->u.bdev.dif_check_flags, 3108 bdev_io->u.bdev.nvme_cdw12.raw, 3109 bdev_io->u.bdev.nvme_cdw13.raw, 3110 bdev_io_split_done, bdev_io); 3111 break; 3112 case SPDK_BDEV_IO_TYPE_UNMAP: 3113 io_wait_fn = _bdev_unmap_split; 3114 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3115 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3116 current_offset, num_blocks, 3117 bdev_io_split_done, bdev_io); 3118 break; 3119 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3120 io_wait_fn = _bdev_write_zeroes_split; 3121 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3122 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3123 current_offset, num_blocks, 3124 bdev_io_split_done, bdev_io); 3125 break; 3126 case SPDK_BDEV_IO_TYPE_COPY: 3127 io_wait_fn = _bdev_copy_split; 3128 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3129 (current_offset - bdev_io->u.bdev.offset_blocks); 3130 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3131 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3132 current_offset, current_src_offset, num_blocks, 3133 bdev_io_split_done, bdev_io); 3134 break; 3135 default: 3136 assert(false); 3137 rc = -EINVAL; 3138 break; 3139 } 3140 3141 if (rc == 0) { 3142 current_offset += num_blocks; 3143 current_remaining -= num_blocks; 3144 bdev_io->internal.split.current_offset_blocks = current_offset; 3145 bdev_io->internal.split.remaining_num_blocks = current_remaining; 3146 *offset = current_offset; 3147 *remaining = current_remaining; 3148 } else { 3149 bdev_io->internal.split.outstanding--; 3150 if (rc == -ENOMEM) { 3151 if (bdev_io->internal.split.outstanding == 0) { 3152 /* No I/O is outstanding. Hence we should wait here. */ 3153 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3154 } 3155 } else { 3156 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3157 if (bdev_io->internal.split.outstanding == 0) { 3158 bdev_ch_remove_from_io_submitted(bdev_io); 3159 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3160 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3161 bdev_io->internal.ch->queue_depth); 3162 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3163 } 3164 } 3165 } 3166 3167 return rc; 3168 } 3169 3170 static void 3171 _bdev_rw_split(void *_bdev_io) 3172 { 3173 struct iovec *parent_iov, *iov; 3174 struct spdk_bdev_io *bdev_io = _bdev_io; 3175 struct spdk_bdev *bdev = bdev_io->bdev; 3176 uint64_t parent_offset, current_offset, remaining; 3177 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3178 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3179 uint32_t iovcnt, iov_len, child_iovsize; 3180 uint32_t blocklen = bdev->blocklen; 3181 uint32_t io_boundary; 3182 uint32_t max_segment_size = bdev->max_segment_size; 3183 uint32_t max_child_iovcnt = bdev->max_num_segments; 3184 uint32_t max_size = bdev->max_rw_size; 3185 void *md_buf = NULL; 3186 int rc; 3187 3188 max_size = max_size ? max_size : UINT32_MAX; 3189 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3190 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3191 SPDK_BDEV_IO_NUM_CHILD_IOV; 3192 3193 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3194 io_boundary = bdev->write_unit_size; 3195 } else if (bdev->split_on_optimal_io_boundary) { 3196 io_boundary = bdev->optimal_io_boundary; 3197 } else { 3198 io_boundary = UINT32_MAX; 3199 } 3200 3201 assert(bdev_io->internal.f.split); 3202 3203 remaining = bdev_io->internal.split.remaining_num_blocks; 3204 current_offset = bdev_io->internal.split.current_offset_blocks; 3205 parent_offset = bdev_io->u.bdev.offset_blocks; 3206 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3207 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3208 3209 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3210 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3211 if (parent_iov_offset < parent_iov->iov_len) { 3212 break; 3213 } 3214 parent_iov_offset -= parent_iov->iov_len; 3215 } 3216 3217 child_iovcnt = 0; 3218 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3219 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3220 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3221 to_next_boundary = spdk_min(remaining, to_next_boundary); 3222 to_next_boundary = spdk_min(max_size, to_next_boundary); 3223 to_next_boundary_bytes = to_next_boundary * blocklen; 3224 3225 iov = &bdev_io->child_iov[child_iovcnt]; 3226 iovcnt = 0; 3227 3228 if (bdev_io->u.bdev.md_buf) { 3229 md_buf = (char *)bdev_io->u.bdev.md_buf + 3230 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3231 } 3232 3233 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3234 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3235 iovcnt < child_iovsize) { 3236 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3237 iov_len = parent_iov->iov_len - parent_iov_offset; 3238 3239 iov_len = spdk_min(iov_len, max_segment_size); 3240 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3241 to_next_boundary_bytes -= iov_len; 3242 3243 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3244 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3245 3246 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3247 parent_iov_offset += iov_len; 3248 } else { 3249 parent_iovpos++; 3250 parent_iov_offset = 0; 3251 } 3252 child_iovcnt++; 3253 iovcnt++; 3254 } 3255 3256 if (to_next_boundary_bytes > 0) { 3257 /* We had to stop this child I/O early because we ran out of 3258 * child_iov space or were limited by max_num_segments. 3259 * Ensure the iovs to be aligned with block size and 3260 * then adjust to_next_boundary before starting the 3261 * child I/O. 
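 * The loop below walks backwards from the last child iov, trimming bytes until the
 * child's total length is a whole number of blocks; the trimmed bytes are handed
 * back to the parent iov position so the next child I/O picks them up.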
3262 */
3263 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
3264 iovcnt == child_iovsize);
3265 to_last_block_bytes = to_next_boundary_bytes % blocklen;
3266 if (to_last_block_bytes != 0) {
3267 uint32_t child_iovpos = child_iovcnt - 1;
3268 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV
3269 * so the loop will naturally end
3270 */
3271
3272 to_last_block_bytes = blocklen - to_last_block_bytes;
3273 to_next_boundary_bytes += to_last_block_bytes;
3274 while (to_last_block_bytes > 0 && iovcnt > 0) {
3275 iov_len = spdk_min(to_last_block_bytes,
3276 bdev_io->child_iov[child_iovpos].iov_len);
3277 bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
3278 if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
3279 child_iovpos--;
3280 if (--iovcnt == 0) {
3281 /* If the child IO is less than a block size just return.
3282 * If the first child IO of any split round is less than
3283 * a block size, fail the parent I/O and exit.
3284 */
3285 if (bdev_io->internal.split.outstanding == 0) {
3286 SPDK_ERRLOG("The first child io was less than a block size\n");
3287 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3288 bdev_ch_remove_from_io_submitted(bdev_io);
3289 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id,
3290 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx,
3291 bdev_io->internal.ch->queue_depth);
3292 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3293 }
3294
3295 return;
3296 }
3297 }
3298
3299 to_last_block_bytes -= iov_len;
3300
3301 if (parent_iov_offset == 0) {
3302 parent_iovpos--;
3303 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
3304 }
3305 parent_iov_offset -= iov_len;
3306 }
3307
3308 assert(to_last_block_bytes == 0);
3309 }
3310 to_next_boundary -= to_next_boundary_bytes / blocklen;
3311 }
3312
3313 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
3314 &current_offset, &remaining);
3315 if (spdk_unlikely(rc)) {
3316 return;
3317 }
3318 }
3319 }
3320
3321 static void
3322 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
3323 {
3324 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
3325 uint32_t num_children_reqs = 0;
3326 int rc;
3327
3328 assert(bdev_io->internal.f.split);
3329
3330 offset = bdev_io->internal.split.current_offset_blocks;
3331 remaining = bdev_io->internal.split.remaining_num_blocks;
3332 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
3333
3334 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3335 unmap_blocks = spdk_min(remaining, max_unmap_blocks);
3336
3337 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
3338 &offset, &remaining);
3339 if (spdk_likely(rc == 0)) {
3340 num_children_reqs++;
3341 } else {
3342 return;
3343 }
3344 }
3345 }
3346
3347 static void
3348 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
3349 {
3350 uint64_t offset, write_zeroes_blocks, remaining;
3351 uint32_t num_children_reqs = 0;
3352 int rc;
3353
3354 assert(bdev_io->internal.f.split);
3355
3356 offset = bdev_io->internal.split.current_offset_blocks;
3357 remaining = bdev_io->internal.split.remaining_num_blocks;
3358
3359 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3360 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
3361
3362 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
3363 &offset, &remaining);
3364 if (spdk_likely(rc == 0)) {
3365 num_children_reqs++;
3366 } else {
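/* Child submission failed. bdev_io_split_submit() has already arranged the -ENOMEM
 * retry or the failure handling for the parent, so stop issuing further children. */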
3367 return; 3368 } 3369 } 3370 } 3371 3372 static void 3373 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3374 { 3375 uint64_t offset, copy_blocks, remaining; 3376 uint32_t num_children_reqs = 0; 3377 int rc; 3378 3379 assert(bdev_io->internal.f.split); 3380 3381 offset = bdev_io->internal.split.current_offset_blocks; 3382 remaining = bdev_io->internal.split.remaining_num_blocks; 3383 3384 assert(bdev_io->bdev->max_copy != 0); 3385 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3386 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3387 3388 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3389 &offset, &remaining); 3390 if (spdk_likely(rc == 0)) { 3391 num_children_reqs++; 3392 } else { 3393 return; 3394 } 3395 } 3396 } 3397 3398 static void 3399 parent_bdev_io_complete(void *ctx, int rc) 3400 { 3401 struct spdk_bdev_io *parent_io = ctx; 3402 3403 if (rc) { 3404 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3405 } 3406 3407 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3408 parent_io->internal.caller_ctx); 3409 } 3410 3411 static void 3412 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3413 { 3414 struct spdk_bdev_io *bdev_io = ctx; 3415 3416 /* u.bdev.accel_sequence should have already been cleared at this point */ 3417 assert(bdev_io->u.bdev.accel_sequence == NULL); 3418 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3419 bdev_io->internal.f.has_accel_sequence = false; 3420 3421 if (spdk_unlikely(status != 0)) { 3422 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3423 } 3424 3425 parent_bdev_io_complete(bdev_io, status); 3426 } 3427 3428 static void 3429 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3430 { 3431 struct spdk_bdev_io *parent_io = cb_arg; 3432 3433 spdk_bdev_free_io(bdev_io); 3434 3435 assert(parent_io->internal.f.split); 3436 3437 if (!success) { 3438 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3439 /* If any child I/O failed, stop further splitting process. */ 3440 parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks; 3441 parent_io->internal.split.remaining_num_blocks = 0; 3442 } 3443 parent_io->internal.split.outstanding--; 3444 if (parent_io->internal.split.outstanding != 0) { 3445 return; 3446 } 3447 3448 /* 3449 * Parent I/O finishes when all blocks are consumed. 3450 */ 3451 if (parent_io->internal.split.remaining_num_blocks == 0) { 3452 assert(parent_io->internal.cb != bdev_io_split_done); 3453 bdev_ch_remove_from_io_submitted(parent_io); 3454 spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id, 3455 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx, 3456 parent_io->internal.ch->queue_depth); 3457 3458 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3459 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3460 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3461 return; 3462 } else if (parent_io->internal.f.has_bounce_buf && 3463 !bdev_io_use_accel_sequence(bdev_io)) { 3464 /* bdev IO will be completed in the callback */ 3465 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3466 return; 3467 } 3468 } 3469 3470 parent_bdev_io_complete(parent_io, 0); 3471 return; 3472 } 3473 3474 /* 3475 * Continue with the splitting process. This function will complete the parent I/O if the 3476 * splitting is done. 
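 * Each of the split helpers issues a bounded batch of child I/Os and returns; as
 * those children complete, this callback runs again and continues the split until
 * remaining_num_blocks reaches zero.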
3477 */ 3478 switch (parent_io->type) { 3479 case SPDK_BDEV_IO_TYPE_READ: 3480 case SPDK_BDEV_IO_TYPE_WRITE: 3481 _bdev_rw_split(parent_io); 3482 break; 3483 case SPDK_BDEV_IO_TYPE_UNMAP: 3484 bdev_unmap_split(parent_io); 3485 break; 3486 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3487 bdev_write_zeroes_split(parent_io); 3488 break; 3489 case SPDK_BDEV_IO_TYPE_COPY: 3490 bdev_copy_split(parent_io); 3491 break; 3492 default: 3493 assert(false); 3494 break; 3495 } 3496 } 3497 3498 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3499 bool success); 3500 3501 static void 3502 bdev_io_split(struct spdk_bdev_io *bdev_io) 3503 { 3504 assert(bdev_io_should_split(bdev_io)); 3505 assert(bdev_io->internal.f.split); 3506 3507 bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3508 bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3509 bdev_io->internal.split.outstanding = 0; 3510 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3511 3512 switch (bdev_io->type) { 3513 case SPDK_BDEV_IO_TYPE_READ: 3514 case SPDK_BDEV_IO_TYPE_WRITE: 3515 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3516 _bdev_rw_split(bdev_io); 3517 } else { 3518 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3519 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3520 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3521 } 3522 break; 3523 case SPDK_BDEV_IO_TYPE_UNMAP: 3524 bdev_unmap_split(bdev_io); 3525 break; 3526 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3527 bdev_write_zeroes_split(bdev_io); 3528 break; 3529 case SPDK_BDEV_IO_TYPE_COPY: 3530 bdev_copy_split(bdev_io); 3531 break; 3532 default: 3533 assert(false); 3534 break; 3535 } 3536 } 3537 3538 static void 3539 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3540 { 3541 if (!success) { 3542 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3543 return; 3544 } 3545 3546 _bdev_rw_split(bdev_io); 3547 } 3548 3549 static inline void 3550 _bdev_io_submit(struct spdk_bdev_io *bdev_io) 3551 { 3552 struct spdk_bdev *bdev = bdev_io->bdev; 3553 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3554 3555 if (spdk_likely(bdev_ch->flags == 0)) { 3556 bdev_io_do_submit(bdev_ch, bdev_io); 3557 return; 3558 } 3559 3560 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3561 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3562 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3563 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3564 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3565 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3566 } else { 3567 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3568 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3569 } 3570 } else { 3571 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3572 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3573 } 3574 } 3575 3576 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3577 3578 bool 3579 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3580 { 3581 if (range1->length == 0 || range2->length == 0) { 3582 return false; 3583 } 3584 3585 if (range1->offset + range1->length <= range2->offset) { 3586 return false; 3587 } 3588 3589 if (range2->offset + range2->length <= range1->offset) { 3590 return false; 3591 } 3592 3593 
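/* Neither range ends at or before the start of the other, so they overlap. */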
return true; 3594 } 3595 3596 static bool 3597 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3598 { 3599 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3600 struct lba_range r; 3601 3602 switch (bdev_io->type) { 3603 case SPDK_BDEV_IO_TYPE_NVME_IO: 3604 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3605 /* Don't try to decode the NVMe command - just assume worst-case and that 3606 * it overlaps a locked range. 3607 */ 3608 return true; 3609 case SPDK_BDEV_IO_TYPE_READ: 3610 if (!range->quiesce) { 3611 return false; 3612 } 3613 /* fallthrough */ 3614 case SPDK_BDEV_IO_TYPE_WRITE: 3615 case SPDK_BDEV_IO_TYPE_UNMAP: 3616 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3617 case SPDK_BDEV_IO_TYPE_ZCOPY: 3618 case SPDK_BDEV_IO_TYPE_COPY: 3619 r.offset = bdev_io->u.bdev.offset_blocks; 3620 r.length = bdev_io->u.bdev.num_blocks; 3621 if (!bdev_lba_range_overlapped(range, &r)) { 3622 /* This I/O doesn't overlap the specified LBA range. */ 3623 return false; 3624 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3625 /* This I/O overlaps, but the I/O is on the same channel that locked this 3626 * range, and the caller_ctx is the same as the locked_ctx. This means 3627 * that this I/O is associated with the lock, and is allowed to execute. 3628 */ 3629 return false; 3630 } else { 3631 return true; 3632 } 3633 default: 3634 return false; 3635 } 3636 } 3637 3638 void 3639 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3640 { 3641 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3642 3643 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3644 3645 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3646 struct lba_range *range; 3647 3648 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3649 if (bdev_io_range_is_locked(bdev_io, range)) { 3650 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3651 return; 3652 } 3653 } 3654 } 3655 3656 bdev_ch_add_to_io_submitted(bdev_io); 3657 3658 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3659 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 3660 ch->trace_id, bdev_io->u.bdev.num_blocks, 3661 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3662 bdev_io->u.bdev.offset_blocks, ch->queue_depth); 3663 3664 if (bdev_io->internal.f.split) { 3665 bdev_io_split(bdev_io); 3666 return; 3667 } 3668 3669 _bdev_io_submit(bdev_io); 3670 } 3671 3672 static inline void 3673 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3674 { 3675 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3676 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3677 * For write operation we need to pull buffers from memory domain before submitting IO. 
3678 * Once read operation completes, we need to use memory_domain push functionality to 3679 * update data in original memory domain IO buffer 3680 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3681 assert(bdev_io->internal.f.has_memory_domain); 3682 bdev_io->u.bdev.memory_domain = NULL; 3683 bdev_io->u.bdev.memory_domain_ctx = NULL; 3684 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3685 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3686 } 3687 3688 static inline void 3689 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3690 { 3691 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3692 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3693 3694 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3695 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3696 bdev_io_complete_unsubmitted(bdev_io); 3697 return; 3698 } 3699 3700 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3701 * support them, but we need to execute an accel sequence and the data buffer is from accel 3702 * memory domain (to avoid doing a push/pull from that domain). 3703 */ 3704 if (bdev_io_use_memory_domain(bdev_io)) { 3705 if (!desc->memory_domains_supported || 3706 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3707 _bdev_io_ext_use_bounce_buffer(bdev_io); 3708 return; 3709 } 3710 } 3711 3712 if (needs_exec) { 3713 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3714 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3715 return; 3716 } 3717 /* For reads we'll execute the sequence after the data is read, so, for now, only 3718 * clear out accel_sequence pointer and submit the IO */ 3719 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3720 bdev_io->u.bdev.accel_sequence = NULL; 3721 } 3722 3723 bdev_io_submit(bdev_io); 3724 } 3725 3726 static void 3727 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3728 { 3729 struct spdk_bdev *bdev = bdev_io->bdev; 3730 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3731 struct spdk_io_channel *ch = bdev_ch->channel; 3732 3733 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3734 3735 bdev_io->internal.f.in_submit_request = true; 3736 bdev_submit_request(bdev, ch, bdev_io); 3737 bdev_io->internal.f.in_submit_request = false; 3738 } 3739 3740 void 3741 bdev_io_init(struct spdk_bdev_io *bdev_io, 3742 struct spdk_bdev *bdev, void *cb_arg, 3743 spdk_bdev_io_completion_cb cb) 3744 { 3745 bdev_io->bdev = bdev; 3746 bdev_io->internal.f.raw = 0; 3747 bdev_io->internal.caller_ctx = cb_arg; 3748 bdev_io->internal.cb = cb; 3749 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3750 bdev_io->internal.f.in_submit_request = false; 3751 bdev_io->internal.error.nvme.cdw0 = 0; 3752 bdev_io->num_retries = 0; 3753 bdev_io->internal.get_buf_cb = NULL; 3754 bdev_io->internal.get_aux_buf_cb = NULL; 3755 bdev_io->internal.data_transfer_cpl = NULL; 3756 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 3757 } 3758 3759 static bool 3760 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3761 { 3762 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3763 } 3764 3765 bool 3766 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3767 { 3768 bool supported; 3769 3770 supported = bdev_io_type_supported(bdev, io_type); 3771 3772 if (!supported) { 3773 switch (io_type) { 3774 
case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3775 /* The bdev layer will emulate write zeroes as long as write is supported. */ 3776 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3777 break; 3778 default: 3779 break; 3780 } 3781 } 3782 3783 return supported; 3784 } 3785 3786 static const char *g_io_type_strings[] = { 3787 [SPDK_BDEV_IO_TYPE_READ] = "read", 3788 [SPDK_BDEV_IO_TYPE_WRITE] = "write", 3789 [SPDK_BDEV_IO_TYPE_UNMAP] = "unmap", 3790 [SPDK_BDEV_IO_TYPE_FLUSH] = "flush", 3791 [SPDK_BDEV_IO_TYPE_RESET] = "reset", 3792 [SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin", 3793 [SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io", 3794 [SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md", 3795 [SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes", 3796 [SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy", 3797 [SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info", 3798 [SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management", 3799 [SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append", 3800 [SPDK_BDEV_IO_TYPE_COMPARE] = "compare", 3801 [SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write", 3802 [SPDK_BDEV_IO_TYPE_ABORT] = "abort", 3803 [SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole", 3804 [SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data", 3805 [SPDK_BDEV_IO_TYPE_COPY] = "copy", 3806 [SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md", 3807 }; 3808 3809 const char * 3810 spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type) 3811 { 3812 if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) { 3813 return NULL; 3814 } 3815 3816 return g_io_type_strings[io_type]; 3817 } 3818 3819 int 3820 spdk_bdev_get_io_type(const char *io_type_string) 3821 { 3822 int i; 3823 3824 for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 3825 if (!strcmp(io_type_string, g_io_type_strings[i])) { 3826 return i; 3827 } 3828 } 3829 3830 return -1; 3831 } 3832 3833 uint64_t 3834 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3835 { 3836 return bdev_io->internal.submit_tsc; 3837 } 3838 3839 int 3840 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3841 { 3842 if (bdev->fn_table->dump_info_json) { 3843 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3844 } 3845 3846 return 0; 3847 } 3848 3849 static void 3850 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3851 { 3852 uint32_t max_per_timeslice = 0; 3853 int i; 3854 3855 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3856 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3857 qos->rate_limits[i].max_per_timeslice = 0; 3858 continue; 3859 } 3860 3861 max_per_timeslice = qos->rate_limits[i].limit * 3862 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3863 3864 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3865 qos->rate_limits[i].min_per_timeslice); 3866 3867 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3868 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE); 3869 } 3870 3871 bdev_qos_set_ops(qos); 3872 } 3873 3874 static void 3875 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3876 struct spdk_io_channel *io_ch, void *ctx) 3877 { 3878 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3879 int status; 3880 3881 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3882 3883 /* if all IOs were sent then continue the iteration, otherwise - stop it */ 3884 /* TODO: channels round robing */ 3885 status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 
0 : 1; 3886 3887 spdk_bdev_for_each_channel_continue(i, status); 3888 } 3889 3890 3891 static void 3892 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status) 3893 { 3894 3895 } 3896 3897 static int 3898 bdev_channel_poll_qos(void *arg) 3899 { 3900 struct spdk_bdev *bdev = arg; 3901 struct spdk_bdev_qos *qos = bdev->internal.qos; 3902 uint64_t now = spdk_get_ticks(); 3903 int i; 3904 int64_t remaining_last_timeslice; 3905 3906 if (spdk_unlikely(qos->thread == NULL)) { 3907 /* The old QoS has been unbound for removal and the new QoS is not enabled yet. */ 3908 return SPDK_POLLER_IDLE; 3909 } 3910 3911 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3912 /* We received our callback earlier than expected - return 3913 * immediately and wait to do accounting until at least one 3914 * timeslice has actually expired. This should never happen 3915 * with a well-behaved timer implementation. 3916 */ 3917 return SPDK_POLLER_IDLE; 3918 } 3919 3920 /* Reset for next round of rate limiting */ 3921 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3922 /* We may have allowed the IOs or bytes to slightly overrun in the last 3923 * timeslice. remaining_this_timeslice is signed, so if it's negative 3924 * here, we'll account for the overrun so that the next timeslice will 3925 * be appropriately reduced. 3926 */ 3927 remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice, 3928 0, __ATOMIC_RELAXED); 3929 if (remaining_last_timeslice < 0) { 3930 /* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos() 3931 * potentially use 2 atomic ops each, so they can interleave. 3932 * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage. 
3933 */ 3934 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3935 remaining_last_timeslice, __ATOMIC_RELAXED); 3936 } 3937 } 3938 3939 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3940 qos->last_timeslice += qos->timeslice_size; 3941 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3942 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 3943 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 3944 } 3945 } 3946 3947 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 3948 bdev_channel_submit_qos_io_done); 3949 3950 return SPDK_POLLER_BUSY; 3951 } 3952 3953 static void 3954 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3955 { 3956 struct spdk_bdev_shared_resource *shared_resource; 3957 struct lba_range *range; 3958 3959 bdev_free_io_stat(ch->stat); 3960 #ifdef SPDK_CONFIG_VTUNE 3961 bdev_free_io_stat(ch->prev_stat); 3962 #endif 3963 3964 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3965 range = TAILQ_FIRST(&ch->locked_ranges); 3966 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3967 free(range); 3968 } 3969 3970 spdk_put_io_channel(ch->channel); 3971 spdk_put_io_channel(ch->accel_channel); 3972 3973 shared_resource = ch->shared_resource; 3974 3975 assert(TAILQ_EMPTY(&ch->io_locked)); 3976 assert(TAILQ_EMPTY(&ch->io_submitted)); 3977 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3978 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3979 assert(ch->io_outstanding == 0); 3980 assert(shared_resource->ref > 0); 3981 shared_resource->ref--; 3982 if (shared_resource->ref == 0) { 3983 assert(shared_resource->io_outstanding == 0); 3984 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3985 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3986 spdk_poller_unregister(&shared_resource->nomem_poller); 3987 free(shared_resource); 3988 } 3989 } 3990 3991 static void 3992 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3993 { 3994 struct spdk_bdev_qos *qos = bdev->internal.qos; 3995 int i; 3996 3997 assert(spdk_spin_held(&bdev->internal.spinlock)); 3998 3999 /* Rate limiting on this bdev enabled */ 4000 if (qos) { 4001 if (qos->ch == NULL) { 4002 struct spdk_io_channel *io_ch; 4003 4004 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 4005 bdev->name, spdk_get_thread()); 4006 4007 /* No qos channel has been selected, so set one up */ 4008 4009 /* Take another reference to ch */ 4010 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4011 assert(io_ch != NULL); 4012 qos->ch = ch; 4013 4014 qos->thread = spdk_io_channel_get_thread(io_ch); 4015 4016 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4017 if (bdev_qos_is_iops_rate_limit(i) == true) { 4018 qos->rate_limits[i].min_per_timeslice = 4019 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 4020 } else { 4021 qos->rate_limits[i].min_per_timeslice = 4022 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 4023 } 4024 4025 if (qos->rate_limits[i].limit == 0) { 4026 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 4027 } 4028 } 4029 bdev_qos_update_max_quota_per_timeslice(qos); 4030 qos->timeslice_size = 4031 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 4032 qos->last_timeslice = spdk_get_ticks(); 4033 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 4034 bdev, 4035 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 4036 } 4037 4038 ch->flags |= BDEV_CH_QOS_ENABLED; 4039 } 4040 } 4041 4042 struct poll_timeout_ctx { 4043 struct spdk_bdev_desc 
*desc; 4044 uint64_t timeout_in_sec; 4045 spdk_bdev_io_timeout_cb cb_fn; 4046 void *cb_arg; 4047 }; 4048 4049 static void 4050 bdev_desc_free(struct spdk_bdev_desc *desc) 4051 { 4052 spdk_spin_destroy(&desc->spinlock); 4053 free(desc->media_events_buffer); 4054 free(desc); 4055 } 4056 4057 static void 4058 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 4059 { 4060 struct poll_timeout_ctx *ctx = _ctx; 4061 struct spdk_bdev_desc *desc = ctx->desc; 4062 4063 free(ctx); 4064 4065 spdk_spin_lock(&desc->spinlock); 4066 desc->refs--; 4067 if (desc->closed == true && desc->refs == 0) { 4068 spdk_spin_unlock(&desc->spinlock); 4069 bdev_desc_free(desc); 4070 return; 4071 } 4072 spdk_spin_unlock(&desc->spinlock); 4073 } 4074 4075 static void 4076 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4077 struct spdk_io_channel *io_ch, void *_ctx) 4078 { 4079 struct poll_timeout_ctx *ctx = _ctx; 4080 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4081 struct spdk_bdev_desc *desc = ctx->desc; 4082 struct spdk_bdev_io *bdev_io; 4083 uint64_t now; 4084 4085 spdk_spin_lock(&desc->spinlock); 4086 if (desc->closed == true) { 4087 spdk_spin_unlock(&desc->spinlock); 4088 spdk_bdev_for_each_channel_continue(i, -1); 4089 return; 4090 } 4091 spdk_spin_unlock(&desc->spinlock); 4092 4093 now = spdk_get_ticks(); 4094 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 4095 /* Exclude any I/O that are generated via splitting. */ 4096 if (bdev_io->internal.cb == bdev_io_split_done) { 4097 continue; 4098 } 4099 4100 /* Once we find an I/O that has not timed out, we can immediately 4101 * exit the loop. 4102 */ 4103 if (now < (bdev_io->internal.submit_tsc + 4104 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 4105 goto end; 4106 } 4107 4108 if (bdev_io->internal.desc == desc) { 4109 ctx->cb_fn(ctx->cb_arg, bdev_io); 4110 } 4111 } 4112 4113 end: 4114 spdk_bdev_for_each_channel_continue(i, 0); 4115 } 4116 4117 static int 4118 bdev_poll_timeout_io(void *arg) 4119 { 4120 struct spdk_bdev_desc *desc = arg; 4121 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4122 struct poll_timeout_ctx *ctx; 4123 4124 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 4125 if (!ctx) { 4126 SPDK_ERRLOG("failed to allocate memory\n"); 4127 return SPDK_POLLER_BUSY; 4128 } 4129 ctx->desc = desc; 4130 ctx->cb_arg = desc->cb_arg; 4131 ctx->cb_fn = desc->cb_fn; 4132 ctx->timeout_in_sec = desc->timeout_in_sec; 4133 4134 /* Take a ref on the descriptor in case it gets closed while we are checking 4135 * all of the channels. 
4136 */ 4137 spdk_spin_lock(&desc->spinlock); 4138 desc->refs++; 4139 spdk_spin_unlock(&desc->spinlock); 4140 4141 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 4142 bdev_channel_poll_timeout_io_done); 4143 4144 return SPDK_POLLER_BUSY; 4145 } 4146 4147 int 4148 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 4149 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 4150 { 4151 assert(desc->thread == spdk_get_thread()); 4152 4153 spdk_poller_unregister(&desc->io_timeout_poller); 4154 4155 if (timeout_in_sec) { 4156 assert(cb_fn != NULL); 4157 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4158 desc, 4159 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4160 1000); 4161 if (desc->io_timeout_poller == NULL) { 4162 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4163 return -1; 4164 } 4165 } 4166 4167 desc->cb_fn = cb_fn; 4168 desc->cb_arg = cb_arg; 4169 desc->timeout_in_sec = timeout_in_sec; 4170 4171 return 0; 4172 } 4173 4174 static int 4175 bdev_channel_create(void *io_device, void *ctx_buf) 4176 { 4177 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4178 struct spdk_bdev_channel *ch = ctx_buf; 4179 struct spdk_io_channel *mgmt_io_ch; 4180 struct spdk_bdev_mgmt_channel *mgmt_ch; 4181 struct spdk_bdev_shared_resource *shared_resource; 4182 struct lba_range *range; 4183 4184 ch->bdev = bdev; 4185 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4186 if (!ch->channel) { 4187 return -1; 4188 } 4189 4190 ch->accel_channel = spdk_accel_get_io_channel(); 4191 if (!ch->accel_channel) { 4192 spdk_put_io_channel(ch->channel); 4193 return -1; 4194 } 4195 4196 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0, 4197 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4198 4199 assert(ch->histogram == NULL); 4200 if (bdev->internal.histogram_enabled) { 4201 ch->histogram = spdk_histogram_data_alloc(); 4202 if (ch->histogram == NULL) { 4203 SPDK_ERRLOG("Could not allocate histogram\n"); 4204 } 4205 } 4206 4207 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4208 if (!mgmt_io_ch) { 4209 spdk_put_io_channel(ch->channel); 4210 spdk_put_io_channel(ch->accel_channel); 4211 return -1; 4212 } 4213 4214 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4215 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4216 if (shared_resource->shared_ch == ch->channel) { 4217 spdk_put_io_channel(mgmt_io_ch); 4218 shared_resource->ref++; 4219 break; 4220 } 4221 } 4222 4223 if (shared_resource == NULL) { 4224 shared_resource = calloc(1, sizeof(*shared_resource)); 4225 if (shared_resource == NULL) { 4226 spdk_put_io_channel(ch->channel); 4227 spdk_put_io_channel(ch->accel_channel); 4228 spdk_put_io_channel(mgmt_io_ch); 4229 return -1; 4230 } 4231 4232 shared_resource->mgmt_ch = mgmt_ch; 4233 shared_resource->io_outstanding = 0; 4234 TAILQ_INIT(&shared_resource->nomem_io); 4235 shared_resource->nomem_threshold = 0; 4236 shared_resource->shared_ch = ch->channel; 4237 shared_resource->ref = 1; 4238 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4239 } 4240 4241 ch->io_outstanding = 0; 4242 TAILQ_INIT(&ch->locked_ranges); 4243 TAILQ_INIT(&ch->qos_queued_io); 4244 ch->flags = 0; 4245 ch->trace_id = bdev->internal.trace_id; 4246 ch->shared_resource = shared_resource; 4247 4248 TAILQ_INIT(&ch->io_submitted); 4249 TAILQ_INIT(&ch->io_locked); 4250 TAILQ_INIT(&ch->io_accel_exec); 4251 TAILQ_INIT(&ch->io_memory_domain); 4252 4253 ch->stat = bdev_alloc_io_stat(false); 
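	/* The per-channel stat is allocated with io_error_stat == false, so
	 * ch->stat->io_error stays NULL (see bdev_alloc_io_stat()) and per-I/O-status
	 * error counters are not tracked in the per-channel stat.
	 */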
4254 if (ch->stat == NULL) { 4255 bdev_channel_destroy_resource(ch); 4256 return -1; 4257 } 4258 4259 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4260 4261 #ifdef SPDK_CONFIG_VTUNE 4262 { 4263 char *name; 4264 __itt_init_ittlib(NULL, 0); 4265 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4266 if (!name) { 4267 bdev_channel_destroy_resource(ch); 4268 return -1; 4269 } 4270 ch->handle = __itt_string_handle_create(name); 4271 free(name); 4272 ch->start_tsc = spdk_get_ticks(); 4273 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4274 ch->prev_stat = bdev_alloc_io_stat(false); 4275 if (ch->prev_stat == NULL) { 4276 bdev_channel_destroy_resource(ch); 4277 return -1; 4278 } 4279 } 4280 #endif 4281 4282 spdk_spin_lock(&bdev->internal.spinlock); 4283 bdev_enable_qos(bdev, ch); 4284 4285 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4286 struct lba_range *new_range; 4287 4288 new_range = calloc(1, sizeof(*new_range)); 4289 if (new_range == NULL) { 4290 spdk_spin_unlock(&bdev->internal.spinlock); 4291 bdev_channel_destroy_resource(ch); 4292 return -1; 4293 } 4294 new_range->length = range->length; 4295 new_range->offset = range->offset; 4296 new_range->locked_ctx = range->locked_ctx; 4297 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4298 } 4299 4300 spdk_spin_unlock(&bdev->internal.spinlock); 4301 4302 return 0; 4303 } 4304 4305 static int 4306 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4307 void *cb_ctx) 4308 { 4309 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4310 struct spdk_bdev_io *bdev_io; 4311 uint64_t buf_len; 4312 4313 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4314 if (bdev_io->internal.ch == bdev_ch) { 4315 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4316 spdk_iobuf_entry_abort(ch, entry, buf_len); 4317 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4318 } 4319 4320 return 0; 4321 } 4322 4323 /* 4324 * Abort I/O that are waiting on a data buffer. 4325 */ 4326 static void 4327 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4328 { 4329 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_all_buf_io_cb, ch); 4330 } 4331 4332 /* 4333 * Abort I/O that are queued waiting for submission. These types of I/O are 4334 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4335 */ 4336 static void 4337 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4338 { 4339 struct spdk_bdev_io *bdev_io, *tmp; 4340 4341 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4342 if (bdev_io->internal.ch == ch) { 4343 TAILQ_REMOVE(queue, bdev_io, internal.link); 4344 /* 4345 * spdk_bdev_io_complete() assumes that the completed I/O had 4346 * been submitted to the bdev module. Since in this case it 4347 * hadn't, bump io_outstanding to account for the decrement 4348 * that spdk_bdev_io_complete() will do. 
4349 */ 4350 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4351 bdev_io_increment_outstanding(ch, ch->shared_resource); 4352 } 4353 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4354 } 4355 } 4356 } 4357 4358 static bool 4359 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4360 { 4361 struct spdk_bdev_io *bdev_io; 4362 4363 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4364 if (bdev_io == bio_to_abort) { 4365 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4366 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4367 return true; 4368 } 4369 } 4370 4371 return false; 4372 } 4373 4374 static int 4375 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4376 { 4377 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4378 uint64_t buf_len; 4379 4380 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4381 if (bdev_io == bio_to_abort) { 4382 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4383 spdk_iobuf_entry_abort(ch, entry, buf_len); 4384 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4385 return 1; 4386 } 4387 4388 return 0; 4389 } 4390 4391 static bool 4392 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4393 { 4394 int rc; 4395 4396 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_buf_io_cb, bio_to_abort); 4397 return rc == 1; 4398 } 4399 4400 static void 4401 bdev_qos_channel_destroy(void *cb_arg) 4402 { 4403 struct spdk_bdev_qos *qos = cb_arg; 4404 4405 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4406 spdk_poller_unregister(&qos->poller); 4407 4408 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4409 4410 free(qos); 4411 } 4412 4413 static int 4414 bdev_qos_destroy(struct spdk_bdev *bdev) 4415 { 4416 int i; 4417 4418 /* 4419 * Cleanly shutting down the QoS poller is tricky, because 4420 * during the asynchronous operation the user could open 4421 * a new descriptor and create a new channel, spawning 4422 * a new QoS poller. 4423 * 4424 * The strategy is to create a new QoS structure here and swap it 4425 * in. The shutdown path then continues to refer to the old one 4426 * until it completes and then releases it. 4427 */ 4428 struct spdk_bdev_qos *new_qos, *old_qos; 4429 4430 old_qos = bdev->internal.qos; 4431 4432 new_qos = calloc(1, sizeof(*new_qos)); 4433 if (!new_qos) { 4434 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4435 return -ENOMEM; 4436 } 4437 4438 /* Copy the old QoS data into the newly allocated structure */ 4439 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4440 4441 /* Zero out the key parts of the QoS structure */ 4442 new_qos->ch = NULL; 4443 new_qos->thread = NULL; 4444 new_qos->poller = NULL; 4445 /* 4446 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4447 * It will be used later for the new QoS structure. 4448 */ 4449 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4450 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4451 new_qos->rate_limits[i].min_per_timeslice = 0; 4452 new_qos->rate_limits[i].max_per_timeslice = 0; 4453 } 4454 4455 bdev->internal.qos = new_qos; 4456 4457 if (old_qos->thread == NULL) { 4458 free(old_qos); 4459 } else { 4460 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4461 } 4462 4463 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4464 * been destroyed yet. 
The destruction path will end up waiting for the final 4465 * channel to be put before it releases resources. */ 4466 4467 return 0; 4468 } 4469 4470 void 4471 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4472 { 4473 total->bytes_read += add->bytes_read; 4474 total->num_read_ops += add->num_read_ops; 4475 total->bytes_written += add->bytes_written; 4476 total->num_write_ops += add->num_write_ops; 4477 total->bytes_unmapped += add->bytes_unmapped; 4478 total->num_unmap_ops += add->num_unmap_ops; 4479 total->bytes_copied += add->bytes_copied; 4480 total->num_copy_ops += add->num_copy_ops; 4481 total->read_latency_ticks += add->read_latency_ticks; 4482 total->write_latency_ticks += add->write_latency_ticks; 4483 total->unmap_latency_ticks += add->unmap_latency_ticks; 4484 total->copy_latency_ticks += add->copy_latency_ticks; 4485 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4486 total->max_read_latency_ticks = add->max_read_latency_ticks; 4487 } 4488 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4489 total->min_read_latency_ticks = add->min_read_latency_ticks; 4490 } 4491 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4492 total->max_write_latency_ticks = add->max_write_latency_ticks; 4493 } 4494 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4495 total->min_write_latency_ticks = add->min_write_latency_ticks; 4496 } 4497 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4498 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4499 } 4500 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4501 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4502 } 4503 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4504 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4505 } 4506 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4507 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4508 } 4509 } 4510 4511 static void 4512 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4513 { 4514 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4515 4516 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4517 memcpy(to_stat->io_error, from_stat->io_error, 4518 sizeof(struct spdk_bdev_io_error_stat)); 4519 } 4520 } 4521 4522 void 4523 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4524 { 4525 if (mode == SPDK_BDEV_RESET_STAT_NONE) { 4526 return; 4527 } 4528 4529 stat->max_read_latency_ticks = 0; 4530 stat->min_read_latency_ticks = UINT64_MAX; 4531 stat->max_write_latency_ticks = 0; 4532 stat->min_write_latency_ticks = UINT64_MAX; 4533 stat->max_unmap_latency_ticks = 0; 4534 stat->min_unmap_latency_ticks = UINT64_MAX; 4535 stat->max_copy_latency_ticks = 0; 4536 stat->min_copy_latency_ticks = UINT64_MAX; 4537 4538 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4539 return; 4540 } 4541 4542 stat->bytes_read = 0; 4543 stat->num_read_ops = 0; 4544 stat->bytes_written = 0; 4545 stat->num_write_ops = 0; 4546 stat->bytes_unmapped = 0; 4547 stat->num_unmap_ops = 0; 4548 stat->bytes_copied = 0; 4549 stat->num_copy_ops = 0; 4550 stat->read_latency_ticks = 0; 4551 stat->write_latency_ticks = 0; 4552 stat->unmap_latency_ticks = 0; 4553 stat->copy_latency_ticks = 0; 4554 4555 if (stat->io_error != NULL) { 4556 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 
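		/* The io_error block holds one counter per negative spdk_bdev_io_status
		 * value (see spdk_bdev_dump_io_stat_json() below) and exists only when the
		 * stat was allocated with io_error_stat == true, hence the NULL check above.
		 */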
4557 } 4558 } 4559 4560 struct spdk_bdev_io_stat * 4561 bdev_alloc_io_stat(bool io_error_stat) 4562 { 4563 struct spdk_bdev_io_stat *stat; 4564 4565 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4566 if (stat == NULL) { 4567 return NULL; 4568 } 4569 4570 if (io_error_stat) { 4571 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4572 if (stat->io_error == NULL) { 4573 free(stat); 4574 return NULL; 4575 } 4576 } else { 4577 stat->io_error = NULL; 4578 } 4579 4580 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4581 4582 return stat; 4583 } 4584 4585 void 4586 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4587 { 4588 if (stat != NULL) { 4589 free(stat->io_error); 4590 free(stat); 4591 } 4592 } 4593 4594 void 4595 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4596 { 4597 int i; 4598 4599 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4600 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4601 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4602 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4603 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4604 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4605 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4606 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4607 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4608 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4609 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4610 stat->min_read_latency_ticks != UINT64_MAX ? 4611 stat->min_read_latency_ticks : 0); 4612 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4613 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4614 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4615 stat->min_write_latency_ticks != UINT64_MAX ? 4616 stat->min_write_latency_ticks : 0); 4617 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4618 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4619 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4620 stat->min_unmap_latency_ticks != UINT64_MAX ? 4621 stat->min_unmap_latency_ticks : 0); 4622 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4623 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4624 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4625 stat->min_copy_latency_ticks != UINT64_MAX ? 
4626 stat->min_copy_latency_ticks : 0); 4627 4628 if (stat->io_error != NULL) { 4629 spdk_json_write_named_object_begin(w, "io_error"); 4630 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4631 if (stat->io_error->error_status[i] != 0) { 4632 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4633 stat->io_error->error_status[i]); 4634 } 4635 } 4636 spdk_json_write_object_end(w); 4637 } 4638 } 4639 4640 static void 4641 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4642 { 4643 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4644 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4645 4646 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4647 bdev_abort_all_buf_io(mgmt_ch, ch); 4648 } 4649 4650 static void 4651 bdev_channel_destroy(void *io_device, void *ctx_buf) 4652 { 4653 struct spdk_bdev_channel *ch = ctx_buf; 4654 4655 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4656 spdk_get_thread()); 4657 4658 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0, 4659 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4660 4661 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4662 spdk_spin_lock(&ch->bdev->internal.spinlock); 4663 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4664 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4665 4666 bdev_channel_abort_queued_ios(ch); 4667 4668 if (ch->histogram) { 4669 spdk_histogram_data_free(ch->histogram); 4670 } 4671 4672 bdev_channel_destroy_resource(ch); 4673 } 4674 4675 /* 4676 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4677 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
4678 */ 4679 static int 4680 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4681 { 4682 struct spdk_bdev_name *tmp; 4683 4684 bdev_name->name = strdup(name); 4685 if (bdev_name->name == NULL) { 4686 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4687 return -ENOMEM; 4688 } 4689 4690 bdev_name->bdev = bdev; 4691 4692 spdk_spin_lock(&g_bdev_mgr.spinlock); 4693 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4694 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4695 4696 if (tmp != NULL) { 4697 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4698 free(bdev_name->name); 4699 return -EEXIST; 4700 } 4701 4702 return 0; 4703 } 4704 4705 static void 4706 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4707 { 4708 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4709 free(bdev_name->name); 4710 } 4711 4712 static void 4713 bdev_name_del(struct spdk_bdev_name *bdev_name) 4714 { 4715 spdk_spin_lock(&g_bdev_mgr.spinlock); 4716 bdev_name_del_unsafe(bdev_name); 4717 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4718 } 4719 4720 int 4721 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4722 { 4723 struct spdk_bdev_alias *tmp; 4724 int ret; 4725 4726 if (alias == NULL) { 4727 SPDK_ERRLOG("Empty alias passed\n"); 4728 return -EINVAL; 4729 } 4730 4731 tmp = calloc(1, sizeof(*tmp)); 4732 if (tmp == NULL) { 4733 SPDK_ERRLOG("Unable to allocate alias\n"); 4734 return -ENOMEM; 4735 } 4736 4737 ret = bdev_name_add(&tmp->alias, bdev, alias); 4738 if (ret != 0) { 4739 free(tmp); 4740 return ret; 4741 } 4742 4743 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4744 4745 return 0; 4746 } 4747 4748 static int 4749 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4750 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4751 { 4752 struct spdk_bdev_alias *tmp; 4753 4754 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4755 if (strcmp(alias, tmp->alias.name) == 0) { 4756 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4757 alias_del_fn(&tmp->alias); 4758 free(tmp); 4759 return 0; 4760 } 4761 } 4762 4763 return -ENOENT; 4764 } 4765 4766 int 4767 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4768 { 4769 int rc; 4770 4771 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4772 if (rc == -ENOENT) { 4773 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4774 } 4775 4776 return rc; 4777 } 4778 4779 void 4780 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4781 { 4782 struct spdk_bdev_alias *p, *tmp; 4783 4784 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4785 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4786 bdev_name_del(&p->alias); 4787 free(p); 4788 } 4789 } 4790 4791 struct spdk_io_channel * 4792 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4793 { 4794 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4795 } 4796 4797 void * 4798 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4799 { 4800 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4801 void *ctx = NULL; 4802 4803 if (bdev->fn_table->get_module_ctx) { 4804 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4805 } 4806 4807 return ctx; 4808 } 4809 4810 const char * 4811 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4812 { 4813 return bdev->module->name; 4814 } 4815 4816 const char * 4817 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4818 { 4819 return bdev->name; 4820 } 4821 4822 const char * 4823 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4824 { 4825 return bdev->product_name; 4826 } 4827 4828 
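/* Illustrative usage of the alias helpers above (a sketch, not a code path in
 * this file; the alias string is a made-up example):
 *
 *	if (spdk_bdev_alias_add(bdev, "my_alias") == 0) {
 *		... "my_alias" now resolves through the global bdev name tree ...
 *		spdk_bdev_alias_del(bdev, "my_alias");
 *	}
 *
 * A duplicate name fails with -EEXIST because bdev_name_add() relies on
 * RB_INSERT() returning the already-inserted node instead of adding a second
 * one.
 */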
const struct spdk_bdev_aliases_list * 4829 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4830 { 4831 return &bdev->aliases; 4832 } 4833 4834 uint32_t 4835 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4836 { 4837 return bdev->blocklen; 4838 } 4839 4840 uint32_t 4841 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4842 { 4843 return bdev->write_unit_size; 4844 } 4845 4846 uint64_t 4847 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4848 { 4849 return bdev->blockcnt; 4850 } 4851 4852 const char * 4853 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4854 { 4855 return qos_rpc_type[type]; 4856 } 4857 4858 void 4859 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4860 { 4861 int i; 4862 4863 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4864 4865 spdk_spin_lock(&bdev->internal.spinlock); 4866 if (bdev->internal.qos) { 4867 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4868 if (bdev->internal.qos->rate_limits[i].limit != 4869 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4870 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4871 if (bdev_qos_is_iops_rate_limit(i) == false) { 4872 /* Change from Byte to Megabyte which is user visible. */ 4873 limits[i] = limits[i] / 1024 / 1024; 4874 } 4875 } 4876 } 4877 } 4878 spdk_spin_unlock(&bdev->internal.spinlock); 4879 } 4880 4881 size_t 4882 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4883 { 4884 return 1 << bdev->required_alignment; 4885 } 4886 4887 uint32_t 4888 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4889 { 4890 return bdev->optimal_io_boundary; 4891 } 4892 4893 bool 4894 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4895 { 4896 return bdev->write_cache; 4897 } 4898 4899 const struct spdk_uuid * 4900 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4901 { 4902 return &bdev->uuid; 4903 } 4904 4905 uint16_t 4906 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4907 { 4908 return bdev->acwu; 4909 } 4910 4911 uint32_t 4912 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4913 { 4914 return bdev->md_len; 4915 } 4916 4917 bool 4918 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4919 { 4920 return (bdev->md_len != 0) && bdev->md_interleave; 4921 } 4922 4923 bool 4924 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4925 { 4926 return (bdev->md_len != 0) && !bdev->md_interleave; 4927 } 4928 4929 bool 4930 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4931 { 4932 return bdev->zoned; 4933 } 4934 4935 uint32_t 4936 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4937 { 4938 if (spdk_bdev_is_md_interleaved(bdev)) { 4939 return bdev->blocklen - bdev->md_len; 4940 } else { 4941 return bdev->blocklen; 4942 } 4943 } 4944 4945 uint32_t 4946 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4947 { 4948 return bdev->phys_blocklen; 4949 } 4950 4951 static uint32_t 4952 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4953 { 4954 if (!spdk_bdev_is_md_interleaved(bdev)) { 4955 return bdev->blocklen + bdev->md_len; 4956 } else { 4957 return bdev->blocklen; 4958 } 4959 } 4960 4961 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4962 typedef enum spdk_dif_type spdk_dif_type_t; 4963 typedef enum spdk_dif_pi_format spdk_dif_pi_format_t; 4964 4965 spdk_dif_type_t 4966 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4967 { 4968 if (bdev->md_len != 0) { 4969 return bdev->dif_type; 4970 } else { 4971 return SPDK_DIF_DISABLE; 4972 } 4973 } 4974 4975 spdk_dif_pi_format_t 4976 spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev) 4977 { 4978 return bdev->dif_pi_format; 4979 } 4980 4981 bool 4982 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4983 { 4984 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4985 return bdev->dif_is_head_of_md; 4986 } else { 4987 return false; 4988 } 4989 } 4990 4991 bool 4992 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4993 enum spdk_dif_check_type check_type) 4994 { 4995 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4996 return false; 4997 } 4998 4999 switch (check_type) { 5000 case SPDK_DIF_CHECK_TYPE_REFTAG: 5001 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 5002 case SPDK_DIF_CHECK_TYPE_APPTAG: 5003 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 5004 case SPDK_DIF_CHECK_TYPE_GUARD: 5005 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 5006 default: 5007 return false; 5008 } 5009 } 5010 5011 static uint32_t 5012 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 5013 { 5014 uint64_t aligned_length, max_write_blocks; 5015 5016 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 5017 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 5018 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 5019 5020 return max_write_blocks; 5021 } 5022 5023 uint32_t 5024 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 5025 { 5026 return bdev->max_copy; 5027 } 5028 5029 uint64_t 5030 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 5031 { 5032 return bdev->internal.measured_queue_depth; 5033 } 5034 5035 uint64_t 5036 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 5037 { 5038 return bdev->internal.period; 5039 } 5040 5041 uint64_t 5042 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 5043 { 5044 return bdev->internal.weighted_io_time; 5045 } 5046 5047 uint64_t 5048 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 5049 { 5050 return bdev->internal.io_time; 5051 } 5052 5053 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev) 5054 { 5055 return bdev->ctratt; 5056 } 5057 5058 uint32_t 5059 spdk_bdev_get_nvme_nsid(struct spdk_bdev *bdev) 5060 { 5061 return bdev->nsid; 5062 } 5063 5064 static void bdev_update_qd_sampling_period(void *ctx); 5065 5066 static void 5067 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 5068 { 5069 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 5070 5071 if (bdev->internal.measured_queue_depth) { 5072 bdev->internal.io_time += bdev->internal.period; 5073 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 5074 } 5075 5076 bdev->internal.qd_poll_in_progress = false; 5077 5078 bdev_update_qd_sampling_period(bdev); 5079 } 5080 5081 static void 5082 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5083 struct spdk_io_channel *io_ch, void *_ctx) 5084 { 5085 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 5086 5087 bdev->internal.temporary_queue_depth += ch->io_outstanding; 5088 spdk_bdev_for_each_channel_continue(i, 0); 5089 
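	/* One sampling round, end to end (illustrative numbers): each channel adds its
	 * io_outstanding to temporary_queue_depth here, then _calculate_measured_qd_cpl()
	 * publishes the sum as measured_queue_depth and, when it is non-zero, advances
	 * io_time by one sampling period and weighted_io_time by period * depth. With a
	 * period of 1000 and a measured depth of 4, io_time grows by 1000 and
	 * weighted_io_time by 4000 for that sample.
	 */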
} 5090 5091 static int 5092 bdev_calculate_measured_queue_depth(void *ctx) 5093 { 5094 struct spdk_bdev *bdev = ctx; 5095 5096 bdev->internal.qd_poll_in_progress = true; 5097 bdev->internal.temporary_queue_depth = 0; 5098 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 5099 return SPDK_POLLER_BUSY; 5100 } 5101 5102 static void 5103 bdev_update_qd_sampling_period(void *ctx) 5104 { 5105 struct spdk_bdev *bdev = ctx; 5106 5107 if (bdev->internal.period == bdev->internal.new_period) { 5108 return; 5109 } 5110 5111 if (bdev->internal.qd_poll_in_progress) { 5112 return; 5113 } 5114 5115 bdev->internal.period = bdev->internal.new_period; 5116 5117 spdk_poller_unregister(&bdev->internal.qd_poller); 5118 if (bdev->internal.period != 0) { 5119 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5120 bdev, bdev->internal.period); 5121 } else { 5122 spdk_bdev_close(bdev->internal.qd_desc); 5123 bdev->internal.qd_desc = NULL; 5124 } 5125 } 5126 5127 static void 5128 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5129 { 5130 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 5131 } 5132 5133 void 5134 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 5135 { 5136 int rc; 5137 5138 if (bdev->internal.new_period == period) { 5139 return; 5140 } 5141 5142 bdev->internal.new_period = period; 5143 5144 if (bdev->internal.qd_desc != NULL) { 5145 assert(bdev->internal.period != 0); 5146 5147 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 5148 bdev_update_qd_sampling_period, bdev); 5149 return; 5150 } 5151 5152 assert(bdev->internal.period == 0); 5153 5154 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 5155 NULL, &bdev->internal.qd_desc); 5156 if (rc != 0) { 5157 return; 5158 } 5159 5160 bdev->internal.period = period; 5161 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5162 bdev, period); 5163 } 5164 5165 struct bdev_get_current_qd_ctx { 5166 uint64_t current_qd; 5167 spdk_bdev_get_current_qd_cb cb_fn; 5168 void *cb_arg; 5169 }; 5170 5171 static void 5172 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5173 { 5174 struct bdev_get_current_qd_ctx *ctx = _ctx; 5175 5176 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5177 5178 free(ctx); 5179 } 5180 5181 static void 5182 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5183 struct spdk_io_channel *io_ch, void *_ctx) 5184 { 5185 struct bdev_get_current_qd_ctx *ctx = _ctx; 5186 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5187 5188 ctx->current_qd += bdev_ch->io_outstanding; 5189 5190 spdk_bdev_for_each_channel_continue(i, 0); 5191 } 5192 5193 void 5194 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5195 void *cb_arg) 5196 { 5197 struct bdev_get_current_qd_ctx *ctx; 5198 5199 assert(cb_fn != NULL); 5200 5201 ctx = calloc(1, sizeof(*ctx)); 5202 if (ctx == NULL) { 5203 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5204 return; 5205 } 5206 5207 ctx->cb_fn = cb_fn; 5208 ctx->cb_arg = cb_arg; 5209 5210 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5211 } 5212 5213 static void 5214 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5215 { 5216 assert(desc->thread == spdk_get_thread()); 5217 5218 spdk_spin_lock(&desc->spinlock); 5219 desc->refs--; 5220 if (!desc->closed) { 5221 
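		/* The descriptor is still open: drop the reference that event_notify() took
		 * before sending this message (the decrement just above), then release the
		 * lock so the user's event callback runs without desc->spinlock held.
		 */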
spdk_spin_unlock(&desc->spinlock); 5222 desc->callback.event_fn(type, 5223 desc->bdev, 5224 desc->callback.ctx); 5225 return; 5226 } else if (desc->refs == 0) { 5227 /* This descriptor was closed after this event_notify message was sent. 5228 * spdk_bdev_close() could not free the descriptor since this message was 5229 * in flight, so we free it now using bdev_desc_free(). 5230 */ 5231 spdk_spin_unlock(&desc->spinlock); 5232 bdev_desc_free(desc); 5233 return; 5234 } 5235 spdk_spin_unlock(&desc->spinlock); 5236 } 5237 5238 static void 5239 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5240 { 5241 spdk_spin_lock(&desc->spinlock); 5242 desc->refs++; 5243 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5244 spdk_spin_unlock(&desc->spinlock); 5245 } 5246 5247 static void 5248 _resize_notify(void *ctx) 5249 { 5250 struct spdk_bdev_desc *desc = ctx; 5251 5252 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5253 } 5254 5255 int 5256 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5257 { 5258 struct spdk_bdev_desc *desc; 5259 int ret; 5260 5261 if (size == bdev->blockcnt) { 5262 return 0; 5263 } 5264 5265 spdk_spin_lock(&bdev->internal.spinlock); 5266 5267 /* bdev has open descriptors */ 5268 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5269 bdev->blockcnt > size) { 5270 ret = -EBUSY; 5271 } else { 5272 bdev->blockcnt = size; 5273 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5274 event_notify(desc, _resize_notify); 5275 } 5276 ret = 0; 5277 } 5278 5279 spdk_spin_unlock(&bdev->internal.spinlock); 5280 5281 return ret; 5282 } 5283 5284 /* 5285 * Convert I/O offset and length from bytes to blocks. 5286 * 5287 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5288 */ 5289 static uint64_t 5290 bdev_bytes_to_blocks(struct spdk_bdev_desc *desc, uint64_t offset_bytes, 5291 uint64_t *offset_blocks, uint64_t num_bytes, uint64_t *num_blocks) 5292 { 5293 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5294 uint32_t block_size = bdev->blocklen; 5295 uint8_t shift_cnt; 5296 5297 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
*/ 5298 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5299 shift_cnt = spdk_u32log2(block_size); 5300 *offset_blocks = offset_bytes >> shift_cnt; 5301 *num_blocks = num_bytes >> shift_cnt; 5302 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5303 (num_bytes - (*num_blocks << shift_cnt)); 5304 } else { 5305 *offset_blocks = offset_bytes / block_size; 5306 *num_blocks = num_bytes / block_size; 5307 return (offset_bytes % block_size) | (num_bytes % block_size); 5308 } 5309 } 5310 5311 static bool 5312 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5313 { 5314 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5315 * has been an overflow and hence the offset has been wrapped around */ 5316 if (offset_blocks + num_blocks < offset_blocks) { 5317 return false; 5318 } 5319 5320 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5321 if (offset_blocks + num_blocks > bdev->blockcnt) { 5322 return false; 5323 } 5324 5325 return true; 5326 } 5327 5328 static void 5329 bdev_seek_complete_cb(void *ctx) 5330 { 5331 struct spdk_bdev_io *bdev_io = ctx; 5332 5333 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5334 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5335 } 5336 5337 static int 5338 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5339 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5340 spdk_bdev_io_completion_cb cb, void *cb_arg) 5341 { 5342 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5343 struct spdk_bdev_io *bdev_io; 5344 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5345 5346 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5347 5348 /* Check if offset_blocks is valid looking at the validity of one block */ 5349 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5350 return -EINVAL; 5351 } 5352 5353 bdev_io = bdev_channel_get_io(channel); 5354 if (!bdev_io) { 5355 return -ENOMEM; 5356 } 5357 5358 bdev_io->internal.ch = channel; 5359 bdev_io->internal.desc = desc; 5360 bdev_io->type = io_type; 5361 bdev_io->u.bdev.offset_blocks = offset_blocks; 5362 bdev_io->u.bdev.memory_domain = NULL; 5363 bdev_io->u.bdev.memory_domain_ctx = NULL; 5364 bdev_io->u.bdev.accel_sequence = NULL; 5365 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5366 5367 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5368 /* In case bdev doesn't support seek to next data/hole offset, 5369 * it is assumed that only data and no holes are present */ 5370 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5371 bdev_io->u.bdev.seek.offset = offset_blocks; 5372 } else { 5373 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5374 } 5375 5376 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5377 return 0; 5378 } 5379 5380 bdev_io_submit(bdev_io); 5381 return 0; 5382 } 5383 5384 int 5385 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5386 uint64_t offset_blocks, 5387 spdk_bdev_io_completion_cb cb, void *cb_arg) 5388 { 5389 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5390 } 5391 5392 int 5393 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5394 uint64_t offset_blocks, 5395 spdk_bdev_io_completion_cb cb, void *cb_arg) 5396 { 5397 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5398 } 5399 5400 uint64_t 5401 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 5402 { 5403 return bdev_io->u.bdev.seek.offset; 5404 } 5405 5406 static int 5407 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5408 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5409 spdk_bdev_io_completion_cb cb, void *cb_arg) 5410 { 5411 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5412 struct spdk_bdev_io *bdev_io; 5413 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5414 5415 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5416 return -EINVAL; 5417 } 5418 5419 bdev_io = bdev_channel_get_io(channel); 5420 if (!bdev_io) { 5421 return -ENOMEM; 5422 } 5423 5424 bdev_io->internal.ch = channel; 5425 bdev_io->internal.desc = desc; 5426 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5427 bdev_io->u.bdev.iovs = &bdev_io->iov; 5428 bdev_io->u.bdev.iovs[0].iov_base = buf; 5429 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5430 bdev_io->u.bdev.iovcnt = 1; 5431 bdev_io->u.bdev.md_buf = md_buf; 5432 bdev_io->u.bdev.num_blocks = num_blocks; 5433 bdev_io->u.bdev.offset_blocks = offset_blocks; 5434 bdev_io->u.bdev.memory_domain = NULL; 5435 bdev_io->u.bdev.memory_domain_ctx = NULL; 5436 bdev_io->u.bdev.accel_sequence = NULL; 5437 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5438 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5439 5440 bdev_io_submit(bdev_io); 5441 return 0; 5442 } 5443 5444 int 5445 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5446 void *buf, uint64_t offset, uint64_t nbytes, 5447 spdk_bdev_io_completion_cb cb, void *cb_arg) 5448 { 5449 uint64_t offset_blocks, num_blocks; 5450 5451 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5452 return -EINVAL; 5453 } 5454 5455 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5456 } 5457 5458 int 5459 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5460 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5461 spdk_bdev_io_completion_cb cb, void *cb_arg) 5462 { 5463 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5464 } 5465 5466 int 5467 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5468 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5469 spdk_bdev_io_completion_cb cb, void *cb_arg) 5470 { 5471 struct iovec iov = { 5472 .iov_base = buf, 5473 }; 5474 5475 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5476 return -EINVAL; 5477 } 5478 5479 if (md_buf && !_is_buf_allocated(&iov)) { 5480 return -EINVAL; 5481 } 5482 5483 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5484 cb, cb_arg); 5485 } 5486 5487 int 5488 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5489 struct iovec *iov, int iovcnt, 5490 uint64_t offset, uint64_t nbytes, 5491 spdk_bdev_io_completion_cb cb, void *cb_arg) 5492 { 5493 uint64_t offset_blocks, num_blocks; 5494 5495 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5496 return -EINVAL; 5497 } 5498 5499 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5500 } 5501 5502 static int 5503 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5504 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5505 uint64_t num_blocks, struct spdk_memory_domain *domain, 
void *domain_ctx, 5506 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5507 spdk_bdev_io_completion_cb cb, void *cb_arg) 5508 { 5509 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5510 struct spdk_bdev_io *bdev_io; 5511 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5512 5513 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5514 return -EINVAL; 5515 } 5516 5517 bdev_io = bdev_channel_get_io(channel); 5518 if (spdk_unlikely(!bdev_io)) { 5519 return -ENOMEM; 5520 } 5521 5522 bdev_io->internal.ch = channel; 5523 bdev_io->internal.desc = desc; 5524 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5525 bdev_io->u.bdev.iovs = iov; 5526 bdev_io->u.bdev.iovcnt = iovcnt; 5527 bdev_io->u.bdev.md_buf = md_buf; 5528 bdev_io->u.bdev.num_blocks = num_blocks; 5529 bdev_io->u.bdev.offset_blocks = offset_blocks; 5530 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5531 5532 if (seq != NULL) { 5533 bdev_io->internal.f.has_accel_sequence = true; 5534 bdev_io->internal.accel_sequence = seq; 5535 } 5536 5537 if (domain != NULL) { 5538 bdev_io->internal.f.has_memory_domain = true; 5539 bdev_io->internal.memory_domain = domain; 5540 bdev_io->internal.memory_domain_ctx = domain_ctx; 5541 } 5542 5543 bdev_io->u.bdev.memory_domain = domain; 5544 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5545 bdev_io->u.bdev.accel_sequence = seq; 5546 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5547 5548 _bdev_io_submit_ext(desc, bdev_io); 5549 5550 return 0; 5551 } 5552 5553 int 5554 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5555 struct iovec *iov, int iovcnt, 5556 uint64_t offset_blocks, uint64_t num_blocks, 5557 spdk_bdev_io_completion_cb cb, void *cb_arg) 5558 { 5559 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5560 5561 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5562 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5563 } 5564 5565 int 5566 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5567 struct iovec *iov, int iovcnt, void *md_buf, 5568 uint64_t offset_blocks, uint64_t num_blocks, 5569 spdk_bdev_io_completion_cb cb, void *cb_arg) 5570 { 5571 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5572 5573 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5574 return -EINVAL; 5575 } 5576 5577 if (md_buf && !_is_buf_allocated(iov)) { 5578 return -EINVAL; 5579 } 5580 5581 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5582 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5583 } 5584 5585 static inline bool 5586 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5587 { 5588 /* 5589 * We check if opts size is at least of size when we first introduced 5590 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5591 * are not checked internal. 
5592 */ 5593 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5594 sizeof(opts->metadata) && 5595 opts->size <= sizeof(*opts) && 5596 /* When memory domain is used, the user must provide data buffers */ 5597 (!opts->memory_domain || (iov && iov[0].iov_base)); 5598 } 5599 5600 int 5601 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5602 struct iovec *iov, int iovcnt, 5603 uint64_t offset_blocks, uint64_t num_blocks, 5604 spdk_bdev_io_completion_cb cb, void *cb_arg, 5605 struct spdk_bdev_ext_io_opts *opts) 5606 { 5607 struct spdk_memory_domain *domain = NULL; 5608 struct spdk_accel_sequence *seq = NULL; 5609 void *domain_ctx = NULL, *md = NULL; 5610 uint32_t dif_check_flags = 0; 5611 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5612 5613 if (opts) { 5614 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5615 return -EINVAL; 5616 } 5617 5618 md = opts->metadata; 5619 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5620 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5621 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5622 if (md) { 5623 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5624 return -EINVAL; 5625 } 5626 5627 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5628 return -EINVAL; 5629 } 5630 5631 if (spdk_unlikely(seq != NULL)) { 5632 return -EINVAL; 5633 } 5634 } 5635 } 5636 5637 dif_check_flags = bdev->dif_check_flags & 5638 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5639 5640 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5641 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5642 } 5643 5644 static int 5645 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5646 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5647 spdk_bdev_io_completion_cb cb, void *cb_arg) 5648 { 5649 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5650 struct spdk_bdev_io *bdev_io; 5651 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5652 5653 if (!desc->write) { 5654 return -EBADF; 5655 } 5656 5657 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5658 return -EINVAL; 5659 } 5660 5661 bdev_io = bdev_channel_get_io(channel); 5662 if (!bdev_io) { 5663 return -ENOMEM; 5664 } 5665 5666 bdev_io->internal.ch = channel; 5667 bdev_io->internal.desc = desc; 5668 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5669 bdev_io->u.bdev.iovs = &bdev_io->iov; 5670 bdev_io->u.bdev.iovs[0].iov_base = buf; 5671 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5672 bdev_io->u.bdev.iovcnt = 1; 5673 bdev_io->u.bdev.md_buf = md_buf; 5674 bdev_io->u.bdev.num_blocks = num_blocks; 5675 bdev_io->u.bdev.offset_blocks = offset_blocks; 5676 bdev_io->u.bdev.memory_domain = NULL; 5677 bdev_io->u.bdev.memory_domain_ctx = NULL; 5678 bdev_io->u.bdev.accel_sequence = NULL; 5679 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5680 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5681 5682 bdev_io_submit(bdev_io); 5683 return 0; 5684 } 5685 5686 int 5687 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5688 void *buf, uint64_t offset, uint64_t nbytes, 5689 spdk_bdev_io_completion_cb cb, void *cb_arg) 5690 { 5691 uint64_t offset_blocks, num_blocks; 5692 5693 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5694 return -EINVAL; 5695 } 5696 5697 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, 
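/*
 * Illustrative sketch of the extended read path validated by
 * _bdev_io_check_opts() above. The opts structure must report its size so
 * that newer, optional fields are consumed only when the caller actually
 * provided them. Function and parameter names are hypothetical.
 */
static int
example_readv_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		  struct iovec *iov, int iovcnt,
		  uint64_t offset_blocks, uint64_t num_blocks,
		  spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev_ext_io_opts opts = {};

	opts.size = sizeof(opts);	/* must lie between the metadata offset and sizeof(*opts) */
	opts.metadata = NULL;		/* separate metadata buffer, if the bdev uses one */
	opts.memory_domain = NULL;	/* non-NULL only when the iovecs live in another memory domain */
	opts.accel_sequence = NULL;	/* optional accel sequence; rejected above when metadata is set */

	return spdk_bdev_readv_blocks_ext(desc, ch, iov, iovcnt, offset_blocks,
					  num_blocks, cb, cb_arg, &opts);
}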
cb_arg); 5698 } 5699 5700 int 5701 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5702 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5703 spdk_bdev_io_completion_cb cb, void *cb_arg) 5704 { 5705 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5706 cb, cb_arg); 5707 } 5708 5709 int 5710 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5711 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5712 spdk_bdev_io_completion_cb cb, void *cb_arg) 5713 { 5714 struct iovec iov = { 5715 .iov_base = buf, 5716 }; 5717 5718 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5719 return -EINVAL; 5720 } 5721 5722 if (md_buf && !_is_buf_allocated(&iov)) { 5723 return -EINVAL; 5724 } 5725 5726 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5727 cb, cb_arg); 5728 } 5729 5730 static int 5731 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5732 struct iovec *iov, int iovcnt, void *md_buf, 5733 uint64_t offset_blocks, uint64_t num_blocks, 5734 struct spdk_memory_domain *domain, void *domain_ctx, 5735 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5736 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 5737 spdk_bdev_io_completion_cb cb, void *cb_arg) 5738 { 5739 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5740 struct spdk_bdev_io *bdev_io; 5741 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5742 5743 if (spdk_unlikely(!desc->write)) { 5744 return -EBADF; 5745 } 5746 5747 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5748 return -EINVAL; 5749 } 5750 5751 bdev_io = bdev_channel_get_io(channel); 5752 if (spdk_unlikely(!bdev_io)) { 5753 return -ENOMEM; 5754 } 5755 5756 bdev_io->internal.ch = channel; 5757 bdev_io->internal.desc = desc; 5758 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5759 bdev_io->u.bdev.iovs = iov; 5760 bdev_io->u.bdev.iovcnt = iovcnt; 5761 bdev_io->u.bdev.md_buf = md_buf; 5762 bdev_io->u.bdev.num_blocks = num_blocks; 5763 bdev_io->u.bdev.offset_blocks = offset_blocks; 5764 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5765 if (seq != NULL) { 5766 bdev_io->internal.f.has_accel_sequence = true; 5767 bdev_io->internal.accel_sequence = seq; 5768 } 5769 5770 if (domain != NULL) { 5771 bdev_io->internal.f.has_memory_domain = true; 5772 bdev_io->internal.memory_domain = domain; 5773 bdev_io->internal.memory_domain_ctx = domain_ctx; 5774 } 5775 5776 bdev_io->u.bdev.memory_domain = domain; 5777 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5778 bdev_io->u.bdev.accel_sequence = seq; 5779 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5780 bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw; 5781 bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw; 5782 5783 _bdev_io_submit_ext(desc, bdev_io); 5784 5785 return 0; 5786 } 5787 5788 int 5789 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5790 struct iovec *iov, int iovcnt, 5791 uint64_t offset, uint64_t len, 5792 spdk_bdev_io_completion_cb cb, void *cb_arg) 5793 { 5794 uint64_t offset_blocks, num_blocks; 5795 5796 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) { 5797 return -EINVAL; 5798 } 5799 5800 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5801 } 5802 5803 int 5804 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5805 struct iovec *iov, 
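/*
 * Illustrative usage sketch for the single-buffer write path above. The
 * descriptor must have been opened for writing, otherwise -EBADF is
 * returned. Names are hypothetical.
 */
static void
example_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	if (!success) {
		SPDK_ERRLOG("write failed\n");
	}
	spdk_bdev_free_io(bdev_io);
}

static int
example_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		     void *buf, uint64_t offset_blocks, uint64_t num_blocks)
{
	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks,
				      example_write_done, NULL);
}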
int iovcnt, 5806 uint64_t offset_blocks, uint64_t num_blocks, 5807 spdk_bdev_io_completion_cb cb, void *cb_arg) 5808 { 5809 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5810 5811 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5812 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5813 cb, cb_arg); 5814 } 5815 5816 int 5817 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5818 struct iovec *iov, int iovcnt, void *md_buf, 5819 uint64_t offset_blocks, uint64_t num_blocks, 5820 spdk_bdev_io_completion_cb cb, void *cb_arg) 5821 { 5822 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5823 5824 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5825 return -EINVAL; 5826 } 5827 5828 if (md_buf && !_is_buf_allocated(iov)) { 5829 return -EINVAL; 5830 } 5831 5832 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5833 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5834 cb, cb_arg); 5835 } 5836 5837 int 5838 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5839 struct iovec *iov, int iovcnt, 5840 uint64_t offset_blocks, uint64_t num_blocks, 5841 spdk_bdev_io_completion_cb cb, void *cb_arg, 5842 struct spdk_bdev_ext_io_opts *opts) 5843 { 5844 struct spdk_memory_domain *domain = NULL; 5845 struct spdk_accel_sequence *seq = NULL; 5846 void *domain_ctx = NULL, *md = NULL; 5847 uint32_t dif_check_flags = 0; 5848 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5849 uint32_t nvme_cdw12_raw = 0; 5850 uint32_t nvme_cdw13_raw = 0; 5851 5852 if (opts) { 5853 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5854 return -EINVAL; 5855 } 5856 md = opts->metadata; 5857 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5858 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5859 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5860 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 5861 nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0); 5862 if (md) { 5863 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5864 return -EINVAL; 5865 } 5866 5867 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5868 return -EINVAL; 5869 } 5870 5871 if (spdk_unlikely(seq != NULL)) { 5872 return -EINVAL; 5873 } 5874 } 5875 } 5876 5877 dif_check_flags = bdev->dif_check_flags & 5878 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5879 5880 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5881 domain, domain_ctx, seq, dif_check_flags, 5882 nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg); 5883 } 5884 5885 static void 5886 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5887 { 5888 struct spdk_bdev_io *parent_io = cb_arg; 5889 struct spdk_bdev *bdev = parent_io->bdev; 5890 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5891 int i, rc = 0; 5892 5893 if (!success) { 5894 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5895 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5896 spdk_bdev_free_io(bdev_io); 5897 return; 5898 } 5899 5900 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5901 rc = memcmp(read_buf, 5902 parent_io->u.bdev.iovs[i].iov_base, 5903 parent_io->u.bdev.iovs[i].iov_len); 5904 if (rc) { 5905 break; 5906 } 5907 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5908 } 5909 5910 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5911 rc = 
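/*
 * Illustrative sketch of a vectored write using the APIs defined above: two
 * discontiguous buffers are gathered into a single write. The buffer lengths
 * must add up to num_blocks * block_size. Names are hypothetical.
 */
static int
example_writev_two_buffers(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			   void *buf1, size_t len1, void *buf2, size_t len2,
			   uint64_t offset_blocks, uint64_t num_blocks,
			   spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct iovec iov[2] = {
		{ .iov_base = buf1, .iov_len = len1 },
		{ .iov_base = buf2, .iov_len = len2 },
	};

	/* spdk_bdev_writev_blocks_ext() additionally accepts per-I/O NVMe
	 * CDW12/CDW13 values and a DIF flag exclusion mask through
	 * struct spdk_bdev_ext_io_opts, as handled above. */
	return spdk_bdev_writev_blocks(desc, ch, iov, 2, offset_blocks, num_blocks,
				       cb, cb_arg);
}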
memcmp(bdev_io->u.bdev.md_buf, 5912 parent_io->u.bdev.md_buf, 5913 spdk_bdev_get_md_size(bdev)); 5914 } 5915 5916 spdk_bdev_free_io(bdev_io); 5917 5918 if (rc == 0) { 5919 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5920 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5921 } else { 5922 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5923 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5924 } 5925 } 5926 5927 static void 5928 bdev_compare_do_read(void *_bdev_io) 5929 { 5930 struct spdk_bdev_io *bdev_io = _bdev_io; 5931 int rc; 5932 5933 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5934 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5935 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5936 bdev_compare_do_read_done, bdev_io); 5937 5938 if (rc == -ENOMEM) { 5939 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5940 } else if (rc != 0) { 5941 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5942 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5943 } 5944 } 5945 5946 static int 5947 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5948 struct iovec *iov, int iovcnt, void *md_buf, 5949 uint64_t offset_blocks, uint64_t num_blocks, 5950 spdk_bdev_io_completion_cb cb, void *cb_arg) 5951 { 5952 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5953 struct spdk_bdev_io *bdev_io; 5954 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5955 5956 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5957 return -EINVAL; 5958 } 5959 5960 bdev_io = bdev_channel_get_io(channel); 5961 if (!bdev_io) { 5962 return -ENOMEM; 5963 } 5964 5965 bdev_io->internal.ch = channel; 5966 bdev_io->internal.desc = desc; 5967 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5968 bdev_io->u.bdev.iovs = iov; 5969 bdev_io->u.bdev.iovcnt = iovcnt; 5970 bdev_io->u.bdev.md_buf = md_buf; 5971 bdev_io->u.bdev.num_blocks = num_blocks; 5972 bdev_io->u.bdev.offset_blocks = offset_blocks; 5973 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5974 bdev_io->u.bdev.memory_domain = NULL; 5975 bdev_io->u.bdev.memory_domain_ctx = NULL; 5976 bdev_io->u.bdev.accel_sequence = NULL; 5977 5978 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5979 bdev_io_submit(bdev_io); 5980 return 0; 5981 } 5982 5983 bdev_compare_do_read(bdev_io); 5984 5985 return 0; 5986 } 5987 5988 int 5989 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5990 struct iovec *iov, int iovcnt, 5991 uint64_t offset_blocks, uint64_t num_blocks, 5992 spdk_bdev_io_completion_cb cb, void *cb_arg) 5993 { 5994 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5995 num_blocks, cb, cb_arg); 5996 } 5997 5998 int 5999 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6000 struct iovec *iov, int iovcnt, void *md_buf, 6001 uint64_t offset_blocks, uint64_t num_blocks, 6002 spdk_bdev_io_completion_cb cb, void *cb_arg) 6003 { 6004 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6005 return -EINVAL; 6006 } 6007 6008 if (md_buf && !_is_buf_allocated(iov)) { 6009 return -EINVAL; 6010 } 6011 6012 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 6013 num_blocks, cb, cb_arg); 6014 } 6015 6016 static int 6017 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6018 void *buf, void *md_buf, uint64_t offset_blocks, 
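/*
 * Illustrative sketch for the compare path above. When the backing module
 * does not support SPDK_BDEV_IO_TYPE_COMPARE, the code above emulates it by
 * reading the LBA range and memcmp()-ing it against the caller's buffers; a
 * mismatch completes the I/O with SPDK_BDEV_IO_STATUS_MISCOMPARE. Names are
 * hypothetical.
 */
static void
example_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	bool *matched = cb_arg;

	/* success is false both on transport errors and on a miscompare. */
	*matched = success;
	spdk_bdev_free_io(bdev_io);
}

static int
example_compare(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		struct iovec *iov, int iovcnt,
		uint64_t offset_blocks, uint64_t num_blocks, bool *matched)
{
	return spdk_bdev_comparev_blocks(desc, ch, iov, iovcnt, offset_blocks,
					 num_blocks, example_compare_done, matched);
}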
uint64_t num_blocks, 6019 spdk_bdev_io_completion_cb cb, void *cb_arg) 6020 { 6021 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6022 struct spdk_bdev_io *bdev_io; 6023 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6024 6025 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6026 return -EINVAL; 6027 } 6028 6029 bdev_io = bdev_channel_get_io(channel); 6030 if (!bdev_io) { 6031 return -ENOMEM; 6032 } 6033 6034 bdev_io->internal.ch = channel; 6035 bdev_io->internal.desc = desc; 6036 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6037 bdev_io->u.bdev.iovs = &bdev_io->iov; 6038 bdev_io->u.bdev.iovs[0].iov_base = buf; 6039 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 6040 bdev_io->u.bdev.iovcnt = 1; 6041 bdev_io->u.bdev.md_buf = md_buf; 6042 bdev_io->u.bdev.num_blocks = num_blocks; 6043 bdev_io->u.bdev.offset_blocks = offset_blocks; 6044 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6045 bdev_io->u.bdev.memory_domain = NULL; 6046 bdev_io->u.bdev.memory_domain_ctx = NULL; 6047 bdev_io->u.bdev.accel_sequence = NULL; 6048 6049 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6050 bdev_io_submit(bdev_io); 6051 return 0; 6052 } 6053 6054 bdev_compare_do_read(bdev_io); 6055 6056 return 0; 6057 } 6058 6059 int 6060 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6061 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6062 spdk_bdev_io_completion_cb cb, void *cb_arg) 6063 { 6064 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6065 cb, cb_arg); 6066 } 6067 6068 int 6069 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6070 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6071 spdk_bdev_io_completion_cb cb, void *cb_arg) 6072 { 6073 struct iovec iov = { 6074 .iov_base = buf, 6075 }; 6076 6077 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6078 return -EINVAL; 6079 } 6080 6081 if (md_buf && !_is_buf_allocated(&iov)) { 6082 return -EINVAL; 6083 } 6084 6085 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6086 cb, cb_arg); 6087 } 6088 6089 static void 6090 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 6091 { 6092 struct spdk_bdev_io *bdev_io = ctx; 6093 6094 if (unlock_status) { 6095 SPDK_ERRLOG("LBA range unlock failed\n"); 6096 } 6097 6098 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 6099 false, bdev_io->internal.caller_ctx); 6100 } 6101 6102 static void 6103 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 6104 { 6105 bdev_io->internal.status = status; 6106 6107 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 6108 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6109 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 6110 } 6111 6112 static void 6113 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6114 { 6115 struct spdk_bdev_io *parent_io = cb_arg; 6116 6117 if (!success) { 6118 SPDK_ERRLOG("Compare and write operation failed\n"); 6119 } 6120 6121 spdk_bdev_free_io(bdev_io); 6122 6123 bdev_comparev_and_writev_blocks_unlock(parent_io, 6124 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 6125 } 6126 6127 static void 6128 bdev_compare_and_write_do_write(void *_bdev_io) 6129 { 6130 struct spdk_bdev_io *bdev_io = _bdev_io; 6131 int rc; 6132 6133 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 6134 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6135 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 6136 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6137 bdev_compare_and_write_do_write_done, bdev_io); 6138 6139 6140 if (rc == -ENOMEM) { 6141 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 6142 } else if (rc != 0) { 6143 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6144 } 6145 } 6146 6147 static void 6148 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6149 { 6150 struct spdk_bdev_io *parent_io = cb_arg; 6151 6152 spdk_bdev_free_io(bdev_io); 6153 6154 if (!success) { 6155 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 6156 return; 6157 } 6158 6159 bdev_compare_and_write_do_write(parent_io); 6160 } 6161 6162 static void 6163 bdev_compare_and_write_do_compare(void *_bdev_io) 6164 { 6165 struct spdk_bdev_io *bdev_io = _bdev_io; 6166 int rc; 6167 6168 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 6169 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 6170 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6171 bdev_compare_and_write_do_compare_done, bdev_io); 6172 6173 if (rc == -ENOMEM) { 6174 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 6175 } else if (rc != 0) { 6176 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 6177 } 6178 } 6179 6180 static void 6181 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 6182 { 6183 struct spdk_bdev_io *bdev_io = ctx; 6184 6185 if (status) { 6186 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 6187 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6188 return; 6189 } 6190 6191 bdev_compare_and_write_do_compare(bdev_io); 6192 } 6193 6194 int 6195 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6196 struct iovec *compare_iov, int compare_iovcnt, 6197 struct iovec *write_iov, int write_iovcnt, 6198 uint64_t offset_blocks, uint64_t num_blocks, 6199 spdk_bdev_io_completion_cb cb, void *cb_arg) 6200 { 6201 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6202 struct spdk_bdev_io *bdev_io; 6203 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6204 6205 if (!desc->write) { 6206 return -EBADF; 6207 } 6208 6209 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6210 return -EINVAL; 6211 } 6212 6213 if (num_blocks > bdev->acwu) { 6214 return -EINVAL; 6215 } 6216 6217 bdev_io = bdev_channel_get_io(channel); 6218 if (!bdev_io) { 6219 return -ENOMEM; 6220 } 6221 6222 bdev_io->internal.ch = channel; 6223 bdev_io->internal.desc = desc; 6224 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6225 bdev_io->u.bdev.iovs = compare_iov; 6226 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6227 bdev_io->u.bdev.fused_iovs = write_iov; 6228 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6229 bdev_io->u.bdev.md_buf = NULL; 6230 bdev_io->u.bdev.num_blocks = num_blocks; 6231 bdev_io->u.bdev.offset_blocks = offset_blocks; 6232 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6233 
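/*
 * Illustrative sketch for the fused compare-and-write entry point defined
 * above. When the module lacks native COMPARE_AND_WRITE support, the range is
 * locked, compared and then written as separate requests, so num_blocks must
 * not exceed the bdev's atomic compare-and-write unit (acwu). Names are
 * hypothetical; spdk_bdev_get_acwu() is assumed to be the public acwu
 * accessor.
 */
static int
example_compare_and_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			  struct iovec *cmp_iov, int cmp_iovcnt,
			  struct iovec *write_iov, int write_iovcnt,
			  uint64_t offset_blocks, uint64_t num_blocks,
			  spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);

	if (num_blocks > spdk_bdev_get_acwu(bdev)) {
		return -EINVAL;
	}

	return spdk_bdev_comparev_and_writev_blocks(desc, ch, cmp_iov, cmp_iovcnt,
						    write_iov, write_iovcnt,
						    offset_blocks, num_blocks,
						    cb, cb_arg);
}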
bdev_io->u.bdev.memory_domain = NULL; 6234 bdev_io->u.bdev.memory_domain_ctx = NULL; 6235 bdev_io->u.bdev.accel_sequence = NULL; 6236 6237 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6238 bdev_io_submit(bdev_io); 6239 return 0; 6240 } 6241 6242 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6243 bdev_comparev_and_writev_blocks_locked, bdev_io); 6244 } 6245 6246 int 6247 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6248 struct iovec *iov, int iovcnt, 6249 uint64_t offset_blocks, uint64_t num_blocks, 6250 bool populate, 6251 spdk_bdev_io_completion_cb cb, void *cb_arg) 6252 { 6253 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6254 struct spdk_bdev_io *bdev_io; 6255 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6256 6257 if (!desc->write) { 6258 return -EBADF; 6259 } 6260 6261 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6262 return -EINVAL; 6263 } 6264 6265 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6266 return -ENOTSUP; 6267 } 6268 6269 bdev_io = bdev_channel_get_io(channel); 6270 if (!bdev_io) { 6271 return -ENOMEM; 6272 } 6273 6274 bdev_io->internal.ch = channel; 6275 bdev_io->internal.desc = desc; 6276 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6277 bdev_io->u.bdev.num_blocks = num_blocks; 6278 bdev_io->u.bdev.offset_blocks = offset_blocks; 6279 bdev_io->u.bdev.iovs = iov; 6280 bdev_io->u.bdev.iovcnt = iovcnt; 6281 bdev_io->u.bdev.md_buf = NULL; 6282 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6283 bdev_io->u.bdev.zcopy.commit = 0; 6284 bdev_io->u.bdev.zcopy.start = 1; 6285 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6286 bdev_io->u.bdev.memory_domain = NULL; 6287 bdev_io->u.bdev.memory_domain_ctx = NULL; 6288 bdev_io->u.bdev.accel_sequence = NULL; 6289 6290 bdev_io_submit(bdev_io); 6291 6292 return 0; 6293 } 6294 6295 int 6296 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6297 spdk_bdev_io_completion_cb cb, void *cb_arg) 6298 { 6299 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6300 return -EINVAL; 6301 } 6302 6303 bdev_io->u.bdev.zcopy.commit = commit ? 
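/*
 * Illustrative sketch of the zero-copy flow above: start a zcopy to borrow
 * buffers owned by the bdev module, fill them, then commit with
 * spdk_bdev_zcopy_end(). Names are hypothetical, and spdk_bdev_io_get_iovec()
 * is assumed to be the public accessor for the buffers attached to the
 * bdev_io.
 */
static void
example_zcopy_end_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
}

static void
example_zcopy_start_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct iovec *iovs;
	int iovcnt;

	if (!success) {
		spdk_bdev_free_io(bdev_io);
		return;
	}

	/* The bdev module attached its own buffers to the bdev_io; fill them
	 * and then commit the data to the media. */
	spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
	memset(iovs[0].iov_base, 0xa5, iovs[0].iov_len);

	spdk_bdev_zcopy_end(bdev_io, true /* commit */, example_zcopy_end_done, NULL);
}

static int
example_zcopy_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		    uint64_t offset_blocks, uint64_t num_blocks)
{
	/* populate=false: the borrowed buffers do not need existing data. */
	return spdk_bdev_zcopy_start(desc, ch, NULL, 0, offset_blocks, num_blocks,
				     false, example_zcopy_start_done, NULL);
}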
1 : 0; 6304 bdev_io->u.bdev.zcopy.start = 0; 6305 bdev_io->internal.caller_ctx = cb_arg; 6306 bdev_io->internal.cb = cb; 6307 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 6308 6309 bdev_io_submit(bdev_io); 6310 6311 return 0; 6312 } 6313 6314 int 6315 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6316 uint64_t offset, uint64_t len, 6317 spdk_bdev_io_completion_cb cb, void *cb_arg) 6318 { 6319 uint64_t offset_blocks, num_blocks; 6320 6321 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) { 6322 return -EINVAL; 6323 } 6324 6325 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6326 } 6327 6328 int 6329 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6330 uint64_t offset_blocks, uint64_t num_blocks, 6331 spdk_bdev_io_completion_cb cb, void *cb_arg) 6332 { 6333 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6334 struct spdk_bdev_io *bdev_io; 6335 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6336 6337 if (!desc->write) { 6338 return -EBADF; 6339 } 6340 6341 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6342 return -EINVAL; 6343 } 6344 6345 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 6346 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 6347 return -ENOTSUP; 6348 } 6349 6350 bdev_io = bdev_channel_get_io(channel); 6351 6352 if (!bdev_io) { 6353 return -ENOMEM; 6354 } 6355 6356 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 6357 bdev_io->internal.ch = channel; 6358 bdev_io->internal.desc = desc; 6359 bdev_io->u.bdev.offset_blocks = offset_blocks; 6360 bdev_io->u.bdev.num_blocks = num_blocks; 6361 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6362 bdev_io->u.bdev.memory_domain = NULL; 6363 bdev_io->u.bdev.memory_domain_ctx = NULL; 6364 bdev_io->u.bdev.accel_sequence = NULL; 6365 6366 /* If the write_zeroes size is large and should be split, use the generic split 6367 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEREOS is supported or not. 6368 * 6369 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 6370 * or emulate it using regular write request otherwise. 
6371 */ 6372 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6373 bdev_io->internal.f.split) { 6374 bdev_io_submit(bdev_io); 6375 return 0; 6376 } 6377 6378 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6379 6380 return bdev_write_zero_buffer(bdev_io); 6381 } 6382 6383 int 6384 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6385 uint64_t offset, uint64_t nbytes, 6386 spdk_bdev_io_completion_cb cb, void *cb_arg) 6387 { 6388 uint64_t offset_blocks, num_blocks; 6389 6390 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 6391 return -EINVAL; 6392 } 6393 6394 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6395 } 6396 6397 static void 6398 bdev_io_complete_cb(void *ctx) 6399 { 6400 struct spdk_bdev_io *bdev_io = ctx; 6401 6402 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6403 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6404 } 6405 6406 int 6407 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6408 uint64_t offset_blocks, uint64_t num_blocks, 6409 spdk_bdev_io_completion_cb cb, void *cb_arg) 6410 { 6411 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6412 struct spdk_bdev_io *bdev_io; 6413 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6414 6415 if (!desc->write) { 6416 return -EBADF; 6417 } 6418 6419 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6420 return -EINVAL; 6421 } 6422 6423 bdev_io = bdev_channel_get_io(channel); 6424 if (!bdev_io) { 6425 return -ENOMEM; 6426 } 6427 6428 bdev_io->internal.ch = channel; 6429 bdev_io->internal.desc = desc; 6430 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6431 6432 bdev_io->u.bdev.iovs = &bdev_io->iov; 6433 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6434 bdev_io->u.bdev.iovs[0].iov_len = 0; 6435 bdev_io->u.bdev.iovcnt = 1; 6436 6437 bdev_io->u.bdev.offset_blocks = offset_blocks; 6438 bdev_io->u.bdev.num_blocks = num_blocks; 6439 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6440 bdev_io->u.bdev.memory_domain = NULL; 6441 bdev_io->u.bdev.memory_domain_ctx = NULL; 6442 bdev_io->u.bdev.accel_sequence = NULL; 6443 6444 if (num_blocks == 0) { 6445 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6446 return 0; 6447 } 6448 6449 bdev_io_submit(bdev_io); 6450 return 0; 6451 } 6452 6453 int 6454 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6455 uint64_t offset, uint64_t length, 6456 spdk_bdev_io_completion_cb cb, void *cb_arg) 6457 { 6458 uint64_t offset_blocks, num_blocks; 6459 6460 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, length, &num_blocks) != 0) { 6461 return -EINVAL; 6462 } 6463 6464 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6465 } 6466 6467 int 6468 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6469 uint64_t offset_blocks, uint64_t num_blocks, 6470 spdk_bdev_io_completion_cb cb, void *cb_arg) 6471 { 6472 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6473 struct spdk_bdev_io *bdev_io; 6474 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6475 6476 if (!desc->write) { 6477 return -EBADF; 6478 } 6479 6480 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH))) { 6481 return -ENOTSUP; 6482 } 6483 6484 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6485 return -EINVAL; 6486 } 6487 6488 bdev_io = bdev_channel_get_io(channel); 6489 if (!bdev_io) { 6490 return 
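/*
 * Illustrative sketch for the write-zeroes and unmap entry points above.
 * When WRITE_ZEROES is unsupported and the request does not need splitting,
 * it is emulated with regular writes of the internal zero buffer; an unmap of
 * zero blocks completes immediately with success. Names are hypothetical and
 * cb is invoked once per operation issued here.
 */
static int
example_discard_then_zero(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			  uint64_t offset_blocks, uint64_t num_blocks,
			  spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	int rc;

	/* Advisory deallocation of the range. */
	rc = spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
	if (rc != 0) {
		return rc;
	}

	/* Guarantee the range reads back as zeroes. */
	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks,
					     cb, cb_arg);
}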
-ENOMEM; 6491 } 6492 6493 bdev_io->internal.ch = channel; 6494 bdev_io->internal.desc = desc; 6495 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6496 bdev_io->u.bdev.iovs = NULL; 6497 bdev_io->u.bdev.iovcnt = 0; 6498 bdev_io->u.bdev.offset_blocks = offset_blocks; 6499 bdev_io->u.bdev.num_blocks = num_blocks; 6500 bdev_io->u.bdev.memory_domain = NULL; 6501 bdev_io->u.bdev.memory_domain_ctx = NULL; 6502 bdev_io->u.bdev.accel_sequence = NULL; 6503 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6504 6505 bdev_io_submit(bdev_io); 6506 return 0; 6507 } 6508 6509 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6510 6511 static void 6512 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6513 { 6514 struct spdk_bdev_io *bdev_io = _ctx; 6515 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 6516 6517 if (status == -EBUSY) { 6518 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6519 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6520 bdev_io, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6521 } else { 6522 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6523 /* If outstanding IOs are still present and reset_io_drain_timeout 6524 * seconds passed, start the reset. */ 6525 bdev_io_submit_reset(bdev_io); 6526 } else { 6527 /* We still have in progress memory domain pull/push or we're 6528 * executing accel sequence. Since we cannot abort either of those 6529 * operations, fail the reset request. */ 6530 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6531 } 6532 } 6533 } else { 6534 SPDK_DEBUGLOG(bdev, 6535 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6536 ch->bdev->name); 6537 /* Mark the completion status as a SUCCESS and complete the reset. */ 6538 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6539 } 6540 } 6541 6542 static void 6543 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6544 struct spdk_io_channel *io_ch, void *_ctx) 6545 { 6546 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6547 int status = 0; 6548 6549 if (cur_ch->io_outstanding > 0 || 6550 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6551 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6552 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 6553 * further iteration over the rest of the channels and pass non-zero status 6554 * to the callback function. 
*/ 6555 status = -EBUSY; 6556 } 6557 spdk_bdev_for_each_channel_continue(i, status); 6558 } 6559 6560 static int 6561 bdev_reset_poll_for_outstanding_io(void *ctx) 6562 { 6563 struct spdk_bdev_io *bdev_io = ctx; 6564 6565 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6566 spdk_bdev_for_each_channel(bdev_io->bdev, bdev_reset_check_outstanding_io, bdev_io, 6567 bdev_reset_check_outstanding_io_done); 6568 6569 return SPDK_POLLER_BUSY; 6570 } 6571 6572 static void 6573 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6574 { 6575 struct spdk_bdev_io *bdev_io = _ctx; 6576 6577 if (bdev->reset_io_drain_timeout == 0) { 6578 bdev_io_submit_reset(bdev_io); 6579 return; 6580 } 6581 6582 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6583 (bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6584 6585 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6586 * submit the reset to the underlying module only if outstanding I/O 6587 * remain after reset_io_drain_timeout seconds have passed. */ 6588 spdk_bdev_for_each_channel(bdev, bdev_reset_check_outstanding_io, bdev_io, 6589 bdev_reset_check_outstanding_io_done); 6590 } 6591 6592 static void 6593 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6594 struct spdk_io_channel *ch, void *_ctx) 6595 { 6596 struct spdk_bdev_channel *channel; 6597 struct spdk_bdev_mgmt_channel *mgmt_channel; 6598 struct spdk_bdev_shared_resource *shared_resource; 6599 bdev_io_tailq_t tmp_queued; 6600 6601 TAILQ_INIT(&tmp_queued); 6602 6603 channel = __io_ch_to_bdev_ch(ch); 6604 shared_resource = channel->shared_resource; 6605 mgmt_channel = shared_resource->mgmt_ch; 6606 6607 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6608 6609 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6610 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6611 } 6612 6613 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6614 bdev_abort_all_buf_io(mgmt_channel, channel); 6615 bdev_abort_all_queued_io(&tmp_queued, channel); 6616 6617 spdk_bdev_for_each_channel_continue(i, 0); 6618 } 6619 6620 static void 6621 bdev_start_reset(struct spdk_bdev_io *bdev_io) 6622 { 6623 struct spdk_bdev *bdev = bdev_io->bdev; 6624 bool freeze_channel = false; 6625 6626 bdev_ch_add_to_io_submitted(bdev_io); 6627 6628 /** 6629 * Take a channel reference for the target bdev for the life of this 6630 * reset. This guards against the channel getting destroyed before 6631 * the reset is completed. We will release the reference when this 6632 * reset is completed. 
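/*
 * Illustrative sketch for the reset path implemented in this file. A reset
 * freezes each channel, aborts I/O queued inside the bdev layer and, when
 * reset_io_drain_timeout is non-zero, polls for up to that many seconds: if
 * the outstanding I/O drains, the underlying module is never reset; if the
 * timeout expires with I/O still outstanding, the reset is submitted to the
 * module. Names are hypothetical.
 */
static void
example_reset_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	SPDK_NOTICELOG("bdev reset %s\n", success ? "succeeded" : "failed");
	spdk_bdev_free_io(bdev_io);
}

static int
example_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
{
	/* Only one reset runs at a time per bdev; additional requests wait on
	 * the bdev's queued_resets list until the active one completes. */
	return spdk_bdev_reset(desc, ch, example_reset_done, NULL);
}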
6633 */ 6634 bdev_io->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6635 6636 spdk_spin_lock(&bdev->internal.spinlock); 6637 if (bdev->internal.reset_in_progress == NULL) { 6638 bdev->internal.reset_in_progress = bdev_io; 6639 freeze_channel = true; 6640 } else { 6641 TAILQ_INSERT_TAIL(&bdev->internal.queued_resets, bdev_io, internal.link); 6642 } 6643 spdk_spin_unlock(&bdev->internal.spinlock); 6644 6645 if (freeze_channel) { 6646 spdk_bdev_for_each_channel(bdev, bdev_reset_freeze_channel, bdev_io, 6647 bdev_reset_freeze_channel_done); 6648 } 6649 } 6650 6651 int 6652 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6653 spdk_bdev_io_completion_cb cb, void *cb_arg) 6654 { 6655 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6656 struct spdk_bdev_io *bdev_io; 6657 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6658 6659 bdev_io = bdev_channel_get_io(channel); 6660 if (!bdev_io) { 6661 return -ENOMEM; 6662 } 6663 6664 bdev_io->internal.ch = channel; 6665 bdev_io->internal.desc = desc; 6666 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6667 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6668 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6669 6670 bdev_start_reset(bdev_io); 6671 return 0; 6672 } 6673 6674 void 6675 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6676 struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode reset_mode) 6677 { 6678 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6679 6680 bdev_get_io_stat(stat, channel->stat); 6681 spdk_bdev_reset_io_stat(channel->stat, reset_mode); 6682 } 6683 6684 static void 6685 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6686 { 6687 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6688 6689 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6690 bdev_iostat_ctx->cb_arg, 0); 6691 free(bdev_iostat_ctx); 6692 } 6693 6694 static void 6695 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6696 struct spdk_io_channel *ch, void *_ctx) 6697 { 6698 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6699 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6700 6701 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6702 spdk_bdev_reset_io_stat(channel->stat, bdev_iostat_ctx->reset_mode); 6703 spdk_bdev_for_each_channel_continue(i, 0); 6704 } 6705 6706 void 6707 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6708 enum spdk_bdev_reset_stat_mode reset_mode, spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6709 { 6710 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6711 6712 assert(bdev != NULL); 6713 assert(stat != NULL); 6714 assert(cb != NULL); 6715 6716 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6717 if (bdev_iostat_ctx == NULL) { 6718 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6719 cb(bdev, stat, cb_arg, -ENOMEM); 6720 return; 6721 } 6722 6723 bdev_iostat_ctx->stat = stat; 6724 bdev_iostat_ctx->cb = cb; 6725 bdev_iostat_ctx->cb_arg = cb_arg; 6726 bdev_iostat_ctx->reset_mode = reset_mode; 6727 6728 /* Start with the statistics from previously deleted channels. */ 6729 spdk_spin_lock(&bdev->internal.spinlock); 6730 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6731 spdk_bdev_reset_io_stat(bdev->internal.stat, reset_mode); 6732 spdk_spin_unlock(&bdev->internal.spinlock); 6733 6734 /* Then iterate and add the statistics from each existing channel. 
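/*
 * Illustrative sketch for the per-device stat aggregation above, which sums
 * the saved stats of deleted channels with every live channel. Names are
 * hypothetical; spdk_bdev_alloc_io_stat()/spdk_bdev_free_io_stat() are
 * assumed here to be the public allocators for struct spdk_bdev_io_stat.
 */
static void
example_device_stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
			 void *cb_arg, int rc)
{
	if (rc == 0) {
		SPDK_NOTICELOG("%s: %" PRIu64 " bytes read\n",
			       spdk_bdev_get_name(bdev), stat->bytes_read);
	}
	spdk_bdev_free_io_stat(stat);	/* assumed public counterpart of the allocator */
}

static void
example_dump_device_stat(struct spdk_bdev *bdev)
{
	struct spdk_bdev_io_stat *stat = spdk_bdev_alloc_io_stat(false);

	if (stat == NULL) {
		return;
	}

	/* SPDK_BDEV_RESET_STAT_NONE: read the counters without clearing them. */
	spdk_bdev_get_device_stat(bdev, stat, SPDK_BDEV_RESET_STAT_NONE,
				  example_device_stat_done, NULL);
}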
*/ 6735 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6736 bdev_get_device_stat_done); 6737 } 6738 6739 struct bdev_iostat_reset_ctx { 6740 enum spdk_bdev_reset_stat_mode mode; 6741 bdev_reset_device_stat_cb cb; 6742 void *cb_arg; 6743 }; 6744 6745 static void 6746 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6747 { 6748 struct bdev_iostat_reset_ctx *ctx = _ctx; 6749 6750 ctx->cb(bdev, ctx->cb_arg, 0); 6751 6752 free(ctx); 6753 } 6754 6755 static void 6756 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6757 struct spdk_io_channel *ch, void *_ctx) 6758 { 6759 struct bdev_iostat_reset_ctx *ctx = _ctx; 6760 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6761 6762 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6763 6764 spdk_bdev_for_each_channel_continue(i, 0); 6765 } 6766 6767 void 6768 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6769 bdev_reset_device_stat_cb cb, void *cb_arg) 6770 { 6771 struct bdev_iostat_reset_ctx *ctx; 6772 6773 assert(bdev != NULL); 6774 assert(cb != NULL); 6775 6776 ctx = calloc(1, sizeof(*ctx)); 6777 if (ctx == NULL) { 6778 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6779 cb(bdev, cb_arg, -ENOMEM); 6780 return; 6781 } 6782 6783 ctx->mode = mode; 6784 ctx->cb = cb; 6785 ctx->cb_arg = cb_arg; 6786 6787 spdk_spin_lock(&bdev->internal.spinlock); 6788 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6789 spdk_spin_unlock(&bdev->internal.spinlock); 6790 6791 spdk_bdev_for_each_channel(bdev, 6792 bdev_reset_each_channel_stat, 6793 ctx, 6794 bdev_reset_device_stat_done); 6795 } 6796 6797 int 6798 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6799 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6800 spdk_bdev_io_completion_cb cb, void *cb_arg) 6801 { 6802 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6803 struct spdk_bdev_io *bdev_io; 6804 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6805 6806 if (!desc->write) { 6807 return -EBADF; 6808 } 6809 6810 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6811 return -ENOTSUP; 6812 } 6813 6814 bdev_io = bdev_channel_get_io(channel); 6815 if (!bdev_io) { 6816 return -ENOMEM; 6817 } 6818 6819 bdev_io->internal.ch = channel; 6820 bdev_io->internal.desc = desc; 6821 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6822 bdev_io->u.nvme_passthru.cmd = *cmd; 6823 bdev_io->u.nvme_passthru.buf = buf; 6824 bdev_io->u.nvme_passthru.nbytes = nbytes; 6825 bdev_io->u.nvme_passthru.md_buf = NULL; 6826 bdev_io->u.nvme_passthru.md_len = 0; 6827 6828 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6829 6830 bdev_io_submit(bdev_io); 6831 return 0; 6832 } 6833 6834 int 6835 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6836 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6837 spdk_bdev_io_completion_cb cb, void *cb_arg) 6838 { 6839 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6840 struct spdk_bdev_io *bdev_io; 6841 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6842 6843 if (!desc->write) { 6844 /* 6845 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6846 * to easily determine if the command is a read or write, but for now just 6847 * do not allow io_passthru with a read-only descriptor. 
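/*
 * Illustrative sketch for the NVMe admin passthrough entry point above:
 * issue an Identify Controller command to an NVMe-backed bdev. The opcode and
 * CNS value come from spdk/nvme_spec.h (already included in this file); the
 * function name and the 4 KiB transfer size are illustrative.
 */
static int
example_identify_ctrlr(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_nvme_cmd cmd = {};
	void *buf;
	int rc;

	buf = spdk_dma_zmalloc(4096, 0x1000, NULL);
	if (buf == NULL) {
		return -ENOMEM;
	}

	cmd.opc = SPDK_NVME_OPC_IDENTIFY;
	cmd.cdw10 = SPDK_NVME_IDENTIFY_CTRLR;

	/* On success the caller owns buf and frees it from the completion
	 * callback; on failure it is released here. */
	rc = spdk_bdev_nvme_admin_passthru(desc, ch, &cmd, buf, 4096, cb, cb_arg);
	if (rc != 0) {
		spdk_dma_free(buf);
	}
	return rc;
}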
6848 */ 6849 return -EBADF; 6850 } 6851 6852 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6853 return -ENOTSUP; 6854 } 6855 6856 bdev_io = bdev_channel_get_io(channel); 6857 if (!bdev_io) { 6858 return -ENOMEM; 6859 } 6860 6861 bdev_io->internal.ch = channel; 6862 bdev_io->internal.desc = desc; 6863 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6864 bdev_io->u.nvme_passthru.cmd = *cmd; 6865 bdev_io->u.nvme_passthru.buf = buf; 6866 bdev_io->u.nvme_passthru.nbytes = nbytes; 6867 bdev_io->u.nvme_passthru.md_buf = NULL; 6868 bdev_io->u.nvme_passthru.md_len = 0; 6869 6870 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6871 6872 bdev_io_submit(bdev_io); 6873 return 0; 6874 } 6875 6876 int 6877 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6878 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6879 spdk_bdev_io_completion_cb cb, void *cb_arg) 6880 { 6881 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6882 struct spdk_bdev_io *bdev_io; 6883 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6884 6885 if (!desc->write) { 6886 /* 6887 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6888 * to easily determine if the command is a read or write, but for now just 6889 * do not allow io_passthru with a read-only descriptor. 6890 */ 6891 return -EBADF; 6892 } 6893 6894 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6895 return -ENOTSUP; 6896 } 6897 6898 bdev_io = bdev_channel_get_io(channel); 6899 if (!bdev_io) { 6900 return -ENOMEM; 6901 } 6902 6903 bdev_io->internal.ch = channel; 6904 bdev_io->internal.desc = desc; 6905 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6906 bdev_io->u.nvme_passthru.cmd = *cmd; 6907 bdev_io->u.nvme_passthru.buf = buf; 6908 bdev_io->u.nvme_passthru.nbytes = nbytes; 6909 bdev_io->u.nvme_passthru.md_buf = md_buf; 6910 bdev_io->u.nvme_passthru.md_len = md_len; 6911 6912 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6913 6914 bdev_io_submit(bdev_io); 6915 return 0; 6916 } 6917 6918 int 6919 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 6920 struct spdk_io_channel *ch, 6921 const struct spdk_nvme_cmd *cmd, 6922 struct iovec *iov, int iovcnt, size_t nbytes, 6923 void *md_buf, size_t md_len, 6924 spdk_bdev_io_completion_cb cb, void *cb_arg) 6925 { 6926 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6927 struct spdk_bdev_io *bdev_io; 6928 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6929 6930 if (!desc->write) { 6931 /* 6932 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6933 * to easily determine if the command is a read or write, but for now just 6934 * do not allow io_passthru with a read-only descriptor. 
6935 */ 6936 return -EBADF; 6937 } 6938 6939 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6940 return -ENOTSUP; 6941 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6942 return -ENOTSUP; 6943 } 6944 6945 bdev_io = bdev_channel_get_io(channel); 6946 if (!bdev_io) { 6947 return -ENOMEM; 6948 } 6949 6950 bdev_io->internal.ch = channel; 6951 bdev_io->internal.desc = desc; 6952 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 6953 bdev_io->u.nvme_passthru.cmd = *cmd; 6954 bdev_io->u.nvme_passthru.iovs = iov; 6955 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 6956 bdev_io->u.nvme_passthru.nbytes = nbytes; 6957 bdev_io->u.nvme_passthru.md_buf = md_buf; 6958 bdev_io->u.nvme_passthru.md_len = md_len; 6959 6960 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6961 6962 bdev_io_submit(bdev_io); 6963 return 0; 6964 } 6965 6966 static void bdev_abort_retry(void *ctx); 6967 static void bdev_abort(struct spdk_bdev_io *parent_io); 6968 6969 static void 6970 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6971 { 6972 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6973 struct spdk_bdev_io *parent_io = cb_arg; 6974 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6975 6976 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6977 6978 spdk_bdev_free_io(bdev_io); 6979 6980 if (!success) { 6981 /* Check if the target I/O completed in the meantime. */ 6982 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6983 if (tmp_io == bio_to_abort) { 6984 break; 6985 } 6986 } 6987 6988 /* If the target I/O still exists, set the parent to failed. */ 6989 if (tmp_io != NULL) { 6990 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6991 } 6992 } 6993 6994 assert(parent_io->internal.f.split); 6995 6996 parent_io->internal.split.outstanding--; 6997 if (parent_io->internal.split.outstanding == 0) { 6998 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6999 bdev_abort_retry(parent_io); 7000 } else { 7001 bdev_io_complete(parent_io); 7002 } 7003 } 7004 } 7005 7006 static int 7007 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 7008 struct spdk_bdev_io *bio_to_abort, 7009 spdk_bdev_io_completion_cb cb, void *cb_arg) 7010 { 7011 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7012 struct spdk_bdev_io *bdev_io; 7013 7014 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 7015 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 7016 /* TODO: Abort reset or abort request. */ 7017 return -ENOTSUP; 7018 } 7019 7020 bdev_io = bdev_channel_get_io(channel); 7021 if (bdev_io == NULL) { 7022 return -ENOMEM; 7023 } 7024 7025 bdev_io->internal.ch = channel; 7026 bdev_io->internal.desc = desc; 7027 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7028 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7029 7030 if (bio_to_abort->internal.f.split) { 7031 assert(bdev_io_should_split(bio_to_abort)); 7032 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 7033 7034 /* Parent abort request is not submitted directly, but to manage its 7035 * execution add it to the submitted list here. 7036 */ 7037 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7038 bdev_ch_add_to_io_submitted(bdev_io); 7039 7040 bdev_abort(bdev_io); 7041 7042 return 0; 7043 } 7044 7045 bdev_io->u.abort.bio_to_abort = bio_to_abort; 7046 7047 /* Submit the abort request to the underlying bdev module. 
*/ 7048 bdev_io_submit(bdev_io); 7049 7050 return 0; 7051 } 7052 7053 static bool 7054 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 7055 { 7056 struct spdk_bdev_io *iter; 7057 7058 TAILQ_FOREACH(iter, tailq, internal.link) { 7059 if (iter == bdev_io) { 7060 return true; 7061 } 7062 } 7063 7064 return false; 7065 } 7066 7067 static uint32_t 7068 _bdev_abort(struct spdk_bdev_io *parent_io) 7069 { 7070 struct spdk_bdev_desc *desc = parent_io->internal.desc; 7071 struct spdk_bdev_channel *channel = parent_io->internal.ch; 7072 void *bio_cb_arg; 7073 struct spdk_bdev_io *bio_to_abort; 7074 uint32_t matched_ios; 7075 int rc; 7076 7077 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 7078 7079 /* matched_ios is returned and will be kept by the caller. 7080 * 7081 * This function will be used for two cases, 1) the same cb_arg is used for 7082 * multiple I/Os, 2) a single large I/O is split into smaller ones. 7083 * Incrementing split_outstanding directly here may confuse readers especially 7084 * for the 1st case. 7085 * 7086 * Completion of I/O abort is processed after stack unwinding. Hence this trick 7087 * works as expected. 7088 */ 7089 matched_ios = 0; 7090 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7091 7092 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 7093 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 7094 continue; 7095 } 7096 7097 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 7098 /* Any I/O which was submitted after this abort command should be excluded. */ 7099 continue; 7100 } 7101 7102 /* We can't abort a request that's being pushed/pulled or executed by accel */ 7103 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 7104 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 7105 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7106 break; 7107 } 7108 7109 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 7110 if (rc != 0) { 7111 if (rc == -ENOMEM) { 7112 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 7113 } else { 7114 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7115 } 7116 break; 7117 } 7118 matched_ios++; 7119 } 7120 7121 return matched_ios; 7122 } 7123 7124 static void 7125 bdev_abort_retry(void *ctx) 7126 { 7127 struct spdk_bdev_io *parent_io = ctx; 7128 uint32_t matched_ios; 7129 7130 matched_ios = _bdev_abort(parent_io); 7131 7132 if (matched_ios == 0) { 7133 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7134 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7135 } else { 7136 /* For retry, the case that no target I/O was found is success 7137 * because it means target I/Os completed in the meantime. 7138 */ 7139 bdev_io_complete(parent_io); 7140 } 7141 return; 7142 } 7143 7144 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7145 parent_io->internal.f.split = true; 7146 parent_io->internal.split.outstanding = matched_ios; 7147 } 7148 7149 static void 7150 bdev_abort(struct spdk_bdev_io *parent_io) 7151 { 7152 uint32_t matched_ios; 7153 7154 matched_ios = _bdev_abort(parent_io); 7155 7156 if (matched_ios == 0) { 7157 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7158 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7159 } else { 7160 /* The case the no target I/O was found is failure. 
*/ 7161 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7162 bdev_io_complete(parent_io); 7163 } 7164 return; 7165 } 7166 7167 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7168 parent_io->internal.f.split = true; 7169 parent_io->internal.split.outstanding = matched_ios; 7170 } 7171 7172 int 7173 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7174 void *bio_cb_arg, 7175 spdk_bdev_io_completion_cb cb, void *cb_arg) 7176 { 7177 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7178 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7179 struct spdk_bdev_io *bdev_io; 7180 7181 if (bio_cb_arg == NULL) { 7182 return -EINVAL; 7183 } 7184 7185 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7186 return -ENOTSUP; 7187 } 7188 7189 bdev_io = bdev_channel_get_io(channel); 7190 if (bdev_io == NULL) { 7191 return -ENOMEM; 7192 } 7193 7194 bdev_io->internal.ch = channel; 7195 bdev_io->internal.desc = desc; 7196 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7197 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7198 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7199 7200 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7201 7202 /* Parent abort request is not submitted directly, but to manage its execution, 7203 * add it to the submitted list here. 7204 */ 7205 bdev_ch_add_to_io_submitted(bdev_io); 7206 7207 bdev_abort(bdev_io); 7208 7209 return 0; 7210 } 7211 7212 int 7213 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7214 struct spdk_bdev_io_wait_entry *entry) 7215 { 7216 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7217 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7218 7219 if (bdev != entry->bdev) { 7220 SPDK_ERRLOG("bdevs do not match\n"); 7221 return -EINVAL; 7222 } 7223 7224 if (mgmt_ch->per_thread_cache_count > 0) { 7225 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7226 return -EINVAL; 7227 } 7228 7229 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7230 return 0; 7231 } 7232 7233 static inline void 7234 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7235 { 7236 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7237 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7238 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7239 uint32_t blocklen = bdev_io->bdev->blocklen; 7240 7241 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7242 switch (bdev_io->type) { 7243 case SPDK_BDEV_IO_TYPE_READ: 7244 io_stat->bytes_read += num_blocks * blocklen; 7245 io_stat->num_read_ops++; 7246 io_stat->read_latency_ticks += tsc_diff; 7247 if (io_stat->max_read_latency_ticks < tsc_diff) { 7248 io_stat->max_read_latency_ticks = tsc_diff; 7249 } 7250 if (io_stat->min_read_latency_ticks > tsc_diff) { 7251 io_stat->min_read_latency_ticks = tsc_diff; 7252 } 7253 break; 7254 case SPDK_BDEV_IO_TYPE_WRITE: 7255 io_stat->bytes_written += num_blocks * blocklen; 7256 io_stat->num_write_ops++; 7257 io_stat->write_latency_ticks += tsc_diff; 7258 if (io_stat->max_write_latency_ticks < tsc_diff) { 7259 io_stat->max_write_latency_ticks = tsc_diff; 7260 } 7261 if (io_stat->min_write_latency_ticks > tsc_diff) { 7262 io_stat->min_write_latency_ticks = tsc_diff; 7263 } 7264 break; 7265 case SPDK_BDEV_IO_TYPE_UNMAP: 7266 io_stat->bytes_unmapped += num_blocks * blocklen; 7267 io_stat->num_unmap_ops++; 7268 io_stat->unmap_latency_ticks += tsc_diff; 7269 if 
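/*
 * Illustrative sketch for spdk_bdev_abort() and spdk_bdev_queue_io_wait()
 * above: abort every I/O submitted with a given cb_arg and, when the channel
 * has run out of bdev_io objects, retry once one is returned. Names are
 * hypothetical; the cb_fn/cb_arg members of struct spdk_bdev_io_wait_entry
 * are assumed from the public header.
 */
struct example_abort_ctx {
	struct spdk_bdev_desc *desc;
	struct spdk_io_channel *ch;
	void *bio_cb_arg;
	struct spdk_bdev_io_wait_entry wait_entry;
};

static void example_abort_retry(void *arg);

static void
example_abort_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
}

static void
example_abort(struct example_abort_ctx *ctx)
{
	int rc;

	rc = spdk_bdev_abort(ctx->desc, ctx->ch, ctx->bio_cb_arg,
			     example_abort_done, NULL);
	if (rc == -ENOMEM) {
		/* Wait for a bdev_io to be freed back to this channel, then retry. */
		ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(ctx->desc);
		ctx->wait_entry.cb_fn = example_abort_retry;
		ctx->wait_entry.cb_arg = ctx;
		spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, ctx->ch, &ctx->wait_entry);
	}
}

static void
example_abort_retry(void *arg)
{
	example_abort(arg);
}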
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7270 io_stat->max_unmap_latency_ticks = tsc_diff; 7271 } 7272 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7273 io_stat->min_unmap_latency_ticks = tsc_diff; 7274 } 7275 break; 7276 case SPDK_BDEV_IO_TYPE_ZCOPY: 7277 /* Track the data in the start phase only */ 7278 if (bdev_io->u.bdev.zcopy.start) { 7279 if (bdev_io->u.bdev.zcopy.populate) { 7280 io_stat->bytes_read += num_blocks * blocklen; 7281 io_stat->num_read_ops++; 7282 io_stat->read_latency_ticks += tsc_diff; 7283 if (io_stat->max_read_latency_ticks < tsc_diff) { 7284 io_stat->max_read_latency_ticks = tsc_diff; 7285 } 7286 if (io_stat->min_read_latency_ticks > tsc_diff) { 7287 io_stat->min_read_latency_ticks = tsc_diff; 7288 } 7289 } else { 7290 io_stat->bytes_written += num_blocks * blocklen; 7291 io_stat->num_write_ops++; 7292 io_stat->write_latency_ticks += tsc_diff; 7293 if (io_stat->max_write_latency_ticks < tsc_diff) { 7294 io_stat->max_write_latency_ticks = tsc_diff; 7295 } 7296 if (io_stat->min_write_latency_ticks > tsc_diff) { 7297 io_stat->min_write_latency_ticks = tsc_diff; 7298 } 7299 } 7300 } 7301 break; 7302 case SPDK_BDEV_IO_TYPE_COPY: 7303 io_stat->bytes_copied += num_blocks * blocklen; 7304 io_stat->num_copy_ops++; 7305 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7306 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7307 io_stat->max_copy_latency_ticks = tsc_diff; 7308 } 7309 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7310 io_stat->min_copy_latency_ticks = tsc_diff; 7311 } 7312 break; 7313 default: 7314 break; 7315 } 7316 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7317 io_stat = bdev_io->bdev->internal.stat; 7318 assert(io_stat->io_error != NULL); 7319 7320 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7321 io_stat->io_error->error_status[-io_status - 1]++; 7322 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7323 } 7324 7325 #ifdef SPDK_CONFIG_VTUNE 7326 uint64_t now_tsc = spdk_get_ticks(); 7327 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7328 uint64_t data[5]; 7329 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7330 7331 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7332 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7333 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7334 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7335 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
7336 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7337 7338 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7339 __itt_metadata_u64, 5, data); 7340 7341 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7342 bdev_io->internal.ch->start_tsc = now_tsc; 7343 } 7344 #endif 7345 } 7346 7347 static inline void 7348 _bdev_io_complete(void *ctx) 7349 { 7350 struct spdk_bdev_io *bdev_io = ctx; 7351 7352 if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) { 7353 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7354 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7355 } 7356 7357 assert(bdev_io->internal.cb != NULL); 7358 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7359 7360 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7361 bdev_io->internal.caller_ctx); 7362 } 7363 7364 static inline void 7365 bdev_io_complete(void *ctx) 7366 { 7367 struct spdk_bdev_io *bdev_io = ctx; 7368 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7369 uint64_t tsc, tsc_diff; 7370 7371 if (spdk_unlikely(bdev_io->internal.f.in_submit_request)) { 7372 /* 7373 * Defer completion to avoid potential infinite recursion if the 7374 * user's completion callback issues a new I/O. 7375 */ 7376 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7377 bdev_io_complete, bdev_io); 7378 return; 7379 } 7380 7381 tsc = spdk_get_ticks(); 7382 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7383 7384 bdev_ch_remove_from_io_submitted(bdev_io); 7385 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io, 7386 bdev_io->internal.caller_ctx, bdev_ch->queue_depth); 7387 7388 if (bdev_ch->histogram) { 7389 if (bdev_io->bdev->internal.histogram_io_type == 0 || 7390 bdev_io->bdev->internal.histogram_io_type == bdev_io->type) { 7391 /* 7392 * Tally all I/O types if the histogram_io_type is set to 0. 7393 */ 7394 spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff); 7395 } 7396 } 7397 7398 bdev_io_update_io_stat(bdev_io, tsc_diff); 7399 _bdev_io_complete(bdev_io); 7400 } 7401 7402 /* The difference between this function and bdev_io_complete() is that this should be called to 7403 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7404 * io_submitted list and don't have submit_tsc updated. 7405 */ 7406 static inline void 7407 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7408 { 7409 /* Since the IO hasn't been submitted it's bound to be failed */ 7410 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7411 7412 /* At this point we don't know if the IO is completed from submission context or not, but, 7413 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7414 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7415 _bdev_io_complete, bdev_io); 7416 } 7417 7418 static void bdev_destroy_cb(void *io_device); 7419 7420 static inline void 7421 _bdev_reset_complete(void *ctx) 7422 { 7423 struct spdk_bdev_io *bdev_io = ctx; 7424 7425 /* Put the channel reference we got in submission. 
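 * An spdk_io_channel reference has to be released on the thread that owns it,
 * which is one reason bdev_reset_complete() below dispatches each queued reset
 * back to its submitting thread via spdk_thread_send_msg() before running this.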
*/ 7426 assert(bdev_io->u.reset.ch_ref != NULL); 7427 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7428 bdev_io->u.reset.ch_ref = NULL; 7429 7430 bdev_io_complete(bdev_io); 7431 } 7432 7433 static void 7434 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7435 { 7436 struct spdk_bdev_io *bdev_io = _ctx; 7437 bdev_io_tailq_t queued_resets; 7438 struct spdk_bdev_io *queued_reset; 7439 7440 assert(bdev_io == bdev->internal.reset_in_progress); 7441 7442 TAILQ_INIT(&queued_resets); 7443 7444 spdk_spin_lock(&bdev->internal.spinlock); 7445 TAILQ_SWAP(&bdev->internal.queued_resets, &queued_resets, 7446 spdk_bdev_io, internal.link); 7447 bdev->internal.reset_in_progress = NULL; 7448 spdk_spin_unlock(&bdev->internal.spinlock); 7449 7450 while (!TAILQ_EMPTY(&queued_resets)) { 7451 queued_reset = TAILQ_FIRST(&queued_resets); 7452 TAILQ_REMOVE(&queued_resets, queued_reset, internal.link); 7453 queued_reset->internal.status = bdev_io->internal.status; 7454 spdk_thread_send_msg(spdk_bdev_io_get_thread(queued_reset), 7455 _bdev_reset_complete, queued_reset); 7456 } 7457 7458 _bdev_reset_complete(bdev_io); 7459 7460 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7461 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7462 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7463 } 7464 } 7465 7466 static void 7467 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7468 struct spdk_io_channel *_ch, void *_ctx) 7469 { 7470 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7471 7472 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7473 7474 spdk_bdev_for_each_channel_continue(i, 0); 7475 } 7476 7477 static void 7478 bdev_io_complete_sequence_cb(void *ctx, int status) 7479 { 7480 struct spdk_bdev_io *bdev_io = ctx; 7481 7482 /* u.bdev.accel_sequence should have already been cleared at this point */ 7483 assert(bdev_io->u.bdev.accel_sequence == NULL); 7484 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7485 bdev_io->internal.f.has_accel_sequence = false; 7486 7487 if (spdk_unlikely(status != 0)) { 7488 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7489 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7490 } 7491 7492 bdev_io_complete(bdev_io); 7493 } 7494 7495 void 7496 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7497 { 7498 struct spdk_bdev *bdev = bdev_io->bdev; 7499 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7500 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7501 7502 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7503 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7504 spdk_bdev_get_module_name(bdev), 7505 bdev_io_status_get_string(bdev_io->internal.status)); 7506 assert(false); 7507 } 7508 bdev_io->internal.status = status; 7509 7510 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7511 assert(bdev_io == bdev->internal.reset_in_progress); 7512 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7513 bdev_reset_complete); 7514 return; 7515 } else { 7516 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7517 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7518 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7519 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7520 return; 7521 } else if (spdk_unlikely(bdev_io->internal.f.has_bounce_buf && 7522 
!bdev_io_use_accel_sequence(bdev_io))) { 7523 _bdev_io_push_bounce_data_buffer(bdev_io, 7524 _bdev_io_complete_push_bounce_done); 7525 /* bdev IO will be completed in the callback */ 7526 return; 7527 } 7528 } 7529 7530 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7531 return; 7532 } 7533 } 7534 7535 bdev_io_complete(bdev_io); 7536 } 7537 7538 void 7539 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7540 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7541 { 7542 enum spdk_bdev_io_status status; 7543 7544 if (sc == SPDK_SCSI_STATUS_GOOD) { 7545 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7546 } else { 7547 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7548 bdev_io->internal.error.scsi.sc = sc; 7549 bdev_io->internal.error.scsi.sk = sk; 7550 bdev_io->internal.error.scsi.asc = asc; 7551 bdev_io->internal.error.scsi.ascq = ascq; 7552 } 7553 7554 spdk_bdev_io_complete(bdev_io, status); 7555 } 7556 7557 void 7558 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7559 int *sc, int *sk, int *asc, int *ascq) 7560 { 7561 assert(sc != NULL); 7562 assert(sk != NULL); 7563 assert(asc != NULL); 7564 assert(ascq != NULL); 7565 7566 switch (bdev_io->internal.status) { 7567 case SPDK_BDEV_IO_STATUS_SUCCESS: 7568 *sc = SPDK_SCSI_STATUS_GOOD; 7569 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7570 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7571 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7572 break; 7573 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7574 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7575 break; 7576 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7577 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7578 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7579 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7580 *ascq = bdev_io->internal.error.scsi.ascq; 7581 break; 7582 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7583 *sc = bdev_io->internal.error.scsi.sc; 7584 *sk = bdev_io->internal.error.scsi.sk; 7585 *asc = bdev_io->internal.error.scsi.asc; 7586 *ascq = bdev_io->internal.error.scsi.ascq; 7587 break; 7588 default: 7589 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7590 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7591 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7592 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7593 break; 7594 } 7595 } 7596 7597 void 7598 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7599 { 7600 enum spdk_bdev_io_status status; 7601 7602 if (aio_result == 0) { 7603 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7604 } else { 7605 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7606 } 7607 7608 bdev_io->internal.error.aio_result = aio_result; 7609 7610 spdk_bdev_io_complete(bdev_io, status); 7611 } 7612 7613 void 7614 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7615 { 7616 assert(aio_result != NULL); 7617 7618 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7619 *aio_result = bdev_io->internal.error.aio_result; 7620 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7621 *aio_result = 0; 7622 } else { 7623 *aio_result = -EIO; 7624 } 7625 } 7626 7627 void 7628 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7629 { 7630 enum spdk_bdev_io_status status; 7631 7632 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7633 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7634 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7635 status = SPDK_BDEV_IO_STATUS_ABORTED; 7636 
} else { 7637 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7638 } 7639 7640 bdev_io->internal.error.nvme.cdw0 = cdw0; 7641 bdev_io->internal.error.nvme.sct = sct; 7642 bdev_io->internal.error.nvme.sc = sc; 7643 7644 spdk_bdev_io_complete(bdev_io, status); 7645 } 7646 7647 void 7648 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7649 { 7650 assert(sct != NULL); 7651 assert(sc != NULL); 7652 assert(cdw0 != NULL); 7653 7654 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7655 *sct = SPDK_NVME_SCT_GENERIC; 7656 *sc = SPDK_NVME_SC_SUCCESS; 7657 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7658 *cdw0 = 0; 7659 } else { 7660 *cdw0 = 1U; 7661 } 7662 return; 7663 } 7664 7665 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7666 *sct = SPDK_NVME_SCT_GENERIC; 7667 *sc = SPDK_NVME_SC_SUCCESS; 7668 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7669 *sct = bdev_io->internal.error.nvme.sct; 7670 *sc = bdev_io->internal.error.nvme.sc; 7671 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7672 *sct = SPDK_NVME_SCT_GENERIC; 7673 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7674 } else { 7675 *sct = SPDK_NVME_SCT_GENERIC; 7676 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7677 } 7678 7679 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7680 } 7681 7682 void 7683 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7684 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7685 { 7686 assert(first_sct != NULL); 7687 assert(first_sc != NULL); 7688 assert(second_sct != NULL); 7689 assert(second_sc != NULL); 7690 assert(cdw0 != NULL); 7691 7692 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7693 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7694 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7695 *first_sct = bdev_io->internal.error.nvme.sct; 7696 *first_sc = bdev_io->internal.error.nvme.sc; 7697 *second_sct = SPDK_NVME_SCT_GENERIC; 7698 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7699 } else { 7700 *first_sct = SPDK_NVME_SCT_GENERIC; 7701 *first_sc = SPDK_NVME_SC_SUCCESS; 7702 *second_sct = bdev_io->internal.error.nvme.sct; 7703 *second_sc = bdev_io->internal.error.nvme.sc; 7704 } 7705 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7706 *first_sct = SPDK_NVME_SCT_GENERIC; 7707 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7708 *second_sct = SPDK_NVME_SCT_GENERIC; 7709 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7710 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7711 *first_sct = SPDK_NVME_SCT_GENERIC; 7712 *first_sc = SPDK_NVME_SC_SUCCESS; 7713 *second_sct = SPDK_NVME_SCT_GENERIC; 7714 *second_sc = SPDK_NVME_SC_SUCCESS; 7715 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7716 *first_sct = SPDK_NVME_SCT_GENERIC; 7717 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7718 *second_sct = SPDK_NVME_SCT_GENERIC; 7719 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7720 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7721 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7722 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7723 *second_sct = SPDK_NVME_SCT_GENERIC; 7724 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7725 } else { 7726 *first_sct = SPDK_NVME_SCT_GENERIC; 7727 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7728 *second_sct = SPDK_NVME_SCT_GENERIC; 7729 *second_sc = 
SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7730 } 7731 7732 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7733 } 7734 7735 void 7736 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 7737 const struct spdk_bdev_io *base_io) 7738 { 7739 switch (base_io->internal.status) { 7740 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7741 spdk_bdev_io_complete_nvme_status(bdev_io, 7742 base_io->internal.error.nvme.cdw0, 7743 base_io->internal.error.nvme.sct, 7744 base_io->internal.error.nvme.sc); 7745 break; 7746 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7747 spdk_bdev_io_complete_scsi_status(bdev_io, 7748 base_io->internal.error.scsi.sc, 7749 base_io->internal.error.scsi.sk, 7750 base_io->internal.error.scsi.asc, 7751 base_io->internal.error.scsi.ascq); 7752 break; 7753 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 7754 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 7755 break; 7756 default: 7757 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 7758 break; 7759 } 7760 } 7761 7762 struct spdk_thread * 7763 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7764 { 7765 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7766 } 7767 7768 struct spdk_io_channel * 7769 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7770 { 7771 return bdev_io->internal.ch->channel; 7772 } 7773 7774 static int 7775 bdev_register(struct spdk_bdev *bdev) 7776 { 7777 char *bdev_name; 7778 char uuid[SPDK_UUID_STRING_LEN]; 7779 struct spdk_iobuf_opts iobuf_opts; 7780 int ret; 7781 7782 assert(bdev->module != NULL); 7783 7784 if (!bdev->name) { 7785 SPDK_ERRLOG("Bdev name is NULL\n"); 7786 return -EINVAL; 7787 } 7788 7789 if (!strlen(bdev->name)) { 7790 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7791 return -EINVAL; 7792 } 7793 7794 /* Users often register their own I/O devices using the bdev name. In 7795 * order to avoid conflicts, prepend bdev_. */ 7796 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7797 if (!bdev_name) { 7798 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7799 return -ENOMEM; 7800 } 7801 7802 bdev->internal.stat = bdev_alloc_io_stat(true); 7803 if (!bdev->internal.stat) { 7804 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7805 free(bdev_name); 7806 return -ENOMEM; 7807 } 7808 7809 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7810 bdev->internal.measured_queue_depth = UINT64_MAX; 7811 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7812 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7813 bdev->internal.qd_poller = NULL; 7814 bdev->internal.qos = NULL; 7815 7816 TAILQ_INIT(&bdev->internal.open_descs); 7817 TAILQ_INIT(&bdev->internal.locked_ranges); 7818 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7819 TAILQ_INIT(&bdev->internal.queued_resets); 7820 TAILQ_INIT(&bdev->aliases); 7821 7822 /* UUID may be specified by the user or defined by bdev itself. 7823 * Otherwise it will be generated here, so this field will never be empty. 
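 *
 * Hedged example of the former case (the UUID string is just a placeholder):
 * a module may parse a persistent UUID into this field before calling
 * spdk_bdev_register(), e.g.
 *
 *     spdk_uuid_parse(&bdev->uuid, "f0e9d8c7-b6a5-4f3e-9d1c-0b9a87654321");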
*/ 7824 if (spdk_uuid_is_null(&bdev->uuid)) { 7825 spdk_uuid_generate(&bdev->uuid); 7826 } 7827 7828 /* Add the UUID alias only if it's different than the name */ 7829 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7830 if (strcmp(bdev->name, uuid) != 0) { 7831 ret = spdk_bdev_alias_add(bdev, uuid); 7832 if (ret != 0) { 7833 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7834 bdev_free_io_stat(bdev->internal.stat); 7835 free(bdev_name); 7836 return ret; 7837 } 7838 } 7839 7840 spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts)); 7841 if (spdk_bdev_get_buf_align(bdev) > 1) { 7842 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 7843 iobuf_opts.large_bufsize / bdev->blocklen); 7844 } 7845 7846 /* If the user didn't specify a write unit size, set it to one. */ 7847 if (bdev->write_unit_size == 0) { 7848 bdev->write_unit_size = 1; 7849 } 7850 7851 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7852 if (bdev->acwu == 0) { 7853 bdev->acwu = bdev->write_unit_size; 7854 } 7855 7856 if (bdev->phys_blocklen == 0) { 7857 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7858 } 7859 7860 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7861 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7862 } 7863 7864 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7865 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7866 } 7867 7868 bdev->internal.reset_in_progress = NULL; 7869 bdev->internal.qd_poll_in_progress = false; 7870 bdev->internal.period = 0; 7871 bdev->internal.new_period = 0; 7872 bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name); 7873 7874 /* 7875 * Initialize spinlock before registering IO device because spinlock is used in 7876 * bdev_channel_create 7877 */ 7878 spdk_spin_init(&bdev->internal.spinlock); 7879 7880 spdk_io_device_register(__bdev_to_io_dev(bdev), 7881 bdev_channel_create, bdev_channel_destroy, 7882 sizeof(struct spdk_bdev_channel), 7883 bdev_name); 7884 7885 /* 7886 * Register bdev name only after the bdev object is ready. 7887 * After bdev_name_add returns, it is possible for other threads to start using the bdev, 7888 * create IO channels... 
7889 */ 7890 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7891 if (ret != 0) { 7892 spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL); 7893 bdev_free_io_stat(bdev->internal.stat); 7894 spdk_spin_destroy(&bdev->internal.spinlock); 7895 free(bdev_name); 7896 return ret; 7897 } 7898 7899 free(bdev_name); 7900 7901 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7902 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7903 7904 return 0; 7905 } 7906 7907 static void 7908 bdev_destroy_cb(void *io_device) 7909 { 7910 int rc; 7911 struct spdk_bdev *bdev; 7912 spdk_bdev_unregister_cb cb_fn; 7913 void *cb_arg; 7914 7915 bdev = __bdev_from_io_dev(io_device); 7916 7917 if (bdev->internal.unregister_td != spdk_get_thread()) { 7918 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7919 return; 7920 } 7921 7922 cb_fn = bdev->internal.unregister_cb; 7923 cb_arg = bdev->internal.unregister_ctx; 7924 7925 spdk_spin_destroy(&bdev->internal.spinlock); 7926 free(bdev->internal.qos); 7927 bdev_free_io_stat(bdev->internal.stat); 7928 spdk_trace_unregister_owner(bdev->internal.trace_id); 7929 7930 rc = bdev->fn_table->destruct(bdev->ctxt); 7931 if (rc < 0) { 7932 SPDK_ERRLOG("destruct failed\n"); 7933 } 7934 if (rc <= 0 && cb_fn != NULL) { 7935 cb_fn(cb_arg, rc); 7936 } 7937 } 7938 7939 void 7940 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7941 { 7942 if (bdev->internal.unregister_cb != NULL) { 7943 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7944 } 7945 } 7946 7947 static void 7948 _remove_notify(void *arg) 7949 { 7950 struct spdk_bdev_desc *desc = arg; 7951 7952 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7953 } 7954 7955 /* returns: 0 - bdev removed and ready to be destructed. 7956 * -EBUSY - bdev can't be destructed yet. */ 7957 static int 7958 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7959 { 7960 struct spdk_bdev_desc *desc, *tmp; 7961 struct spdk_bdev_alias *alias; 7962 int rc = 0; 7963 char uuid[SPDK_UUID_STRING_LEN]; 7964 7965 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7966 assert(spdk_spin_held(&bdev->internal.spinlock)); 7967 7968 /* Notify each descriptor about hotremoval */ 7969 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7970 rc = -EBUSY; 7971 /* 7972 * Defer invocation of the event_cb to a separate message that will 7973 * run later on its thread. This ensures this context unwinds and 7974 * we don't recursively unregister this bdev again if the event_cb 7975 * immediately closes its descriptor. 7976 */ 7977 event_notify(desc, _remove_notify); 7978 } 7979 7980 /* If there are no descriptors, proceed removing the bdev */ 7981 if (rc == 0) { 7982 bdev_examine_allowlist_remove(bdev->name); 7983 TAILQ_FOREACH(alias, &bdev->aliases, tailq) { 7984 bdev_examine_allowlist_remove(alias->alias.name); 7985 } 7986 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7987 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7988 7989 /* Delete the name and the UUID alias */ 7990 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7991 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7992 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7993 7994 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7995 7996 if (bdev->internal.reset_in_progress != NULL) { 7997 /* If reset is in progress, let the completion callback for reset 7998 * unregister the bdev. 
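 * (bdev_reset_complete() re-checks SPDK_BDEV_STATUS_REMOVING and the
 * open_descs list and then calls spdk_io_device_unregister() itself.)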
7999 */ 8000 rc = -EBUSY; 8001 } 8002 } 8003 8004 return rc; 8005 } 8006 8007 static void 8008 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8009 struct spdk_io_channel *io_ch, void *_ctx) 8010 { 8011 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8012 8013 bdev_channel_abort_queued_ios(bdev_ch); 8014 spdk_bdev_for_each_channel_continue(i, 0); 8015 } 8016 8017 static void 8018 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 8019 { 8020 int rc; 8021 8022 spdk_spin_lock(&g_bdev_mgr.spinlock); 8023 spdk_spin_lock(&bdev->internal.spinlock); 8024 /* 8025 * Set the status to REMOVING after completing to abort channels. Otherwise, 8026 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 8027 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 8028 * may fail. 8029 */ 8030 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 8031 rc = bdev_unregister_unsafe(bdev); 8032 spdk_spin_unlock(&bdev->internal.spinlock); 8033 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8034 8035 if (rc == 0) { 8036 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8037 } 8038 } 8039 8040 void 8041 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8042 { 8043 struct spdk_thread *thread; 8044 8045 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 8046 8047 thread = spdk_get_thread(); 8048 if (!thread) { 8049 /* The user called this from a non-SPDK thread. */ 8050 if (cb_fn != NULL) { 8051 cb_fn(cb_arg, -ENOTSUP); 8052 } 8053 return; 8054 } 8055 8056 spdk_spin_lock(&g_bdev_mgr.spinlock); 8057 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8058 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8059 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8060 if (cb_fn) { 8061 cb_fn(cb_arg, -EBUSY); 8062 } 8063 return; 8064 } 8065 8066 spdk_spin_lock(&bdev->internal.spinlock); 8067 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 8068 bdev->internal.unregister_cb = cb_fn; 8069 bdev->internal.unregister_ctx = cb_arg; 8070 bdev->internal.unregister_td = thread; 8071 spdk_spin_unlock(&bdev->internal.spinlock); 8072 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8073 8074 spdk_bdev_set_qd_sampling_period(bdev, 0); 8075 8076 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 8077 bdev_unregister); 8078 } 8079 8080 int 8081 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 8082 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8083 { 8084 struct spdk_bdev_desc *desc; 8085 struct spdk_bdev *bdev; 8086 int rc; 8087 8088 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 8089 if (rc != 0) { 8090 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 8091 return rc; 8092 } 8093 8094 bdev = spdk_bdev_desc_get_bdev(desc); 8095 8096 if (bdev->module != module) { 8097 spdk_bdev_close(desc); 8098 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 8099 bdev_name); 8100 return -ENODEV; 8101 } 8102 8103 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 8104 8105 spdk_bdev_close(desc); 8106 8107 return 0; 8108 } 8109 8110 static int 8111 bdev_start_qos(struct spdk_bdev *bdev) 8112 { 8113 struct set_qos_limit_ctx *ctx; 8114 8115 /* Enable QoS */ 8116 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 8117 ctx = calloc(1, sizeof(*ctx)); 8118 if (ctx == NULL) { 8119 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 8120 return -ENOMEM; 8121 } 8122 
ctx->bdev = bdev; 8123 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 8124 } 8125 8126 return 0; 8127 } 8128 8129 static void 8130 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 8131 struct spdk_bdev *bdev) 8132 { 8133 enum spdk_bdev_claim_type type; 8134 const char *typename, *modname; 8135 extern struct spdk_log_flag SPDK_LOG_bdev; 8136 8137 assert(spdk_spin_held(&bdev->internal.spinlock)); 8138 8139 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 8140 return; 8141 } 8142 8143 type = bdev->internal.claim_type; 8144 typename = spdk_bdev_claim_get_name(type); 8145 8146 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 8147 modname = bdev->internal.claim.v1.module->name; 8148 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8149 bdev->name, detail, typename, modname); 8150 return; 8151 } 8152 8153 if (claim_type_is_v2(type)) { 8154 struct spdk_bdev_module_claim *claim; 8155 8156 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 8157 modname = claim->module->name; 8158 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8159 bdev->name, detail, typename, modname); 8160 } 8161 return; 8162 } 8163 8164 assert(false); 8165 } 8166 8167 static int 8168 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 8169 { 8170 struct spdk_thread *thread; 8171 int rc = 0; 8172 8173 thread = spdk_get_thread(); 8174 if (!thread) { 8175 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 8176 return -ENOTSUP; 8177 } 8178 8179 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8180 spdk_get_thread()); 8181 8182 desc->bdev = bdev; 8183 desc->thread = thread; 8184 desc->write = write; 8185 8186 spdk_spin_lock(&bdev->internal.spinlock); 8187 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8188 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8189 spdk_spin_unlock(&bdev->internal.spinlock); 8190 return -ENODEV; 8191 } 8192 8193 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8194 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8195 spdk_spin_unlock(&bdev->internal.spinlock); 8196 return -EPERM; 8197 } 8198 8199 rc = bdev_start_qos(bdev); 8200 if (rc != 0) { 8201 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 8202 spdk_spin_unlock(&bdev->internal.spinlock); 8203 return rc; 8204 } 8205 8206 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8207 8208 spdk_spin_unlock(&bdev->internal.spinlock); 8209 8210 return 0; 8211 } 8212 8213 static void 8214 bdev_open_opts_get_defaults(struct spdk_bdev_open_opts *opts, size_t opts_size) 8215 { 8216 if (!opts) { 8217 SPDK_ERRLOG("opts should not be NULL.\n"); 8218 return; 8219 } 8220 8221 if (!opts_size) { 8222 SPDK_ERRLOG("opts_size should not be zero.\n"); 8223 return; 8224 } 8225 8226 memset(opts, 0, opts_size); 8227 opts->size = opts_size; 8228 8229 #define FIELD_OK(field) \ 8230 offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size 8231 8232 #define SET_FIELD(field, value) \ 8233 if (FIELD_OK(field)) { \ 8234 opts->field = value; \ 8235 } \ 8236 8237 SET_FIELD(hide_metadata, false); 8238 8239 #undef FIELD_OK 8240 #undef SET_FIELD 8241 } 8242 8243 static void 8244 bdev_open_opts_copy(struct spdk_bdev_open_opts *opts, 8245 const struct spdk_bdev_open_opts *opts_src, size_t opts_size) 8246 { 8247 assert(opts); 8248 assert(opts_src); 8249 8250 #define SET_FIELD(field) \ 8251 if 
(offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size) { \ 8252 opts->field = opts_src->field; \ 8253 } \ 8254 8255 SET_FIELD(hide_metadata); 8256 8257 opts->size = opts_src->size; 8258 8259 /* We should not remove this statement, but need to update the assert statement 8260 * if we add a new field, and also add a corresponding SET_FIELD statement. 8261 */ 8262 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_opts) == 16, "Incorrect size"); 8263 8264 #undef SET_FIELD 8265 } 8266 8267 void 8268 spdk_bdev_open_opts_init(struct spdk_bdev_open_opts *opts, size_t opts_size) 8269 { 8270 struct spdk_bdev_open_opts opts_local; 8271 8272 bdev_open_opts_get_defaults(&opts_local, sizeof(opts_local)); 8273 bdev_open_opts_copy(opts, &opts_local, opts_size); 8274 } 8275 8276 static int 8277 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8278 struct spdk_bdev_open_opts *user_opts, struct spdk_bdev_desc **_desc) 8279 { 8280 struct spdk_bdev_desc *desc; 8281 struct spdk_bdev_open_opts opts; 8282 unsigned int i; 8283 8284 bdev_open_opts_get_defaults(&opts, sizeof(opts)); 8285 if (user_opts != NULL) { 8286 bdev_open_opts_copy(&opts, user_opts, user_opts->size); 8287 } 8288 8289 desc = calloc(1, sizeof(*desc)); 8290 if (desc == NULL) { 8291 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8292 return -ENOMEM; 8293 } 8294 8295 desc->opts = opts; 8296 8297 TAILQ_INIT(&desc->pending_media_events); 8298 TAILQ_INIT(&desc->free_media_events); 8299 8300 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8301 desc->callback.event_fn = event_cb; 8302 desc->callback.ctx = event_ctx; 8303 spdk_spin_init(&desc->spinlock); 8304 8305 if (desc->opts.hide_metadata) { 8306 if (spdk_bdev_is_md_separate(bdev)) { 8307 SPDK_ERRLOG("hide_metadata option is not supported with separate metadata.\n"); 8308 bdev_desc_free(desc); 8309 return -EINVAL; 8310 } 8311 } 8312 8313 if (bdev->media_events) { 8314 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8315 sizeof(*desc->media_events_buffer)); 8316 if (desc->media_events_buffer == NULL) { 8317 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8318 bdev_desc_free(desc); 8319 return -ENOMEM; 8320 } 8321 8322 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8323 TAILQ_INSERT_TAIL(&desc->free_media_events, 8324 &desc->media_events_buffer[i], tailq); 8325 } 8326 } 8327 8328 if (bdev->fn_table->accel_sequence_supported != NULL) { 8329 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8330 desc->accel_sequence_supported[i] = 8331 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8332 (enum spdk_bdev_io_type)i); 8333 } 8334 } 8335 8336 *_desc = desc; 8337 8338 return 0; 8339 } 8340 8341 static int 8342 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8343 void *event_ctx, struct spdk_bdev_open_opts *opts, 8344 struct spdk_bdev_desc **_desc) 8345 { 8346 struct spdk_bdev_desc *desc; 8347 struct spdk_bdev *bdev; 8348 int rc; 8349 8350 bdev = bdev_get_by_name(bdev_name); 8351 8352 if (bdev == NULL) { 8353 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8354 return -ENODEV; 8355 } 8356 8357 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, opts, &desc); 8358 if (rc != 0) { 8359 return rc; 8360 } 8361 8362 rc = bdev_open(bdev, write, desc); 8363 if (rc != 0) { 8364 bdev_desc_free(desc); 8365 desc = NULL; 8366 } 8367 8368 *_desc = desc; 8369 8370 return rc; 8371 } 8372 8373 int 8374 spdk_bdev_open_ext_v2(const char 
*bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8375 void *event_ctx, struct spdk_bdev_open_opts *opts, 8376 struct spdk_bdev_desc **_desc) 8377 { 8378 int rc; 8379 8380 if (event_cb == NULL) { 8381 SPDK_ERRLOG("Missing event callback function\n"); 8382 return -EINVAL; 8383 } 8384 8385 spdk_spin_lock(&g_bdev_mgr.spinlock); 8386 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, opts, _desc); 8387 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8388 8389 return rc; 8390 } 8391 8392 int 8393 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8394 void *event_ctx, struct spdk_bdev_desc **_desc) 8395 { 8396 return spdk_bdev_open_ext_v2(bdev_name, write, event_cb, event_ctx, NULL, _desc); 8397 } 8398 8399 struct spdk_bdev_open_async_ctx { 8400 char *bdev_name; 8401 spdk_bdev_event_cb_t event_cb; 8402 void *event_ctx; 8403 bool write; 8404 int rc; 8405 spdk_bdev_open_async_cb_t cb_fn; 8406 void *cb_arg; 8407 struct spdk_bdev_desc *desc; 8408 struct spdk_bdev_open_async_opts opts; 8409 uint64_t start_ticks; 8410 struct spdk_thread *orig_thread; 8411 struct spdk_poller *poller; 8412 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8413 }; 8414 8415 static void 8416 bdev_open_async_done(void *arg) 8417 { 8418 struct spdk_bdev_open_async_ctx *ctx = arg; 8419 8420 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8421 8422 free(ctx->bdev_name); 8423 free(ctx); 8424 } 8425 8426 static void 8427 bdev_open_async_cancel(void *arg) 8428 { 8429 struct spdk_bdev_open_async_ctx *ctx = arg; 8430 8431 assert(ctx->rc == -ESHUTDOWN); 8432 8433 spdk_poller_unregister(&ctx->poller); 8434 8435 bdev_open_async_done(ctx); 8436 } 8437 8438 /* This is called when the bdev library finishes at shutdown. */ 8439 static void 8440 bdev_open_async_fini(void) 8441 { 8442 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8443 8444 spdk_spin_lock(&g_bdev_mgr.spinlock); 8445 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8446 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8447 /* 8448 * We have to move to ctx->orig_thread to unregister ctx->poller. 8449 * However, there is a chance that ctx->poller is executed before 8450 * message is executed, which could result in bdev_open_async_done() 8451 * being called twice. To avoid such race condition, set ctx->rc to 8452 * -ESHUTDOWN. 8453 */ 8454 ctx->rc = -ESHUTDOWN; 8455 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8456 } 8457 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8458 } 8459 8460 static int bdev_open_async(void *arg); 8461 8462 static void 8463 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8464 { 8465 uint64_t timeout_ticks; 8466 8467 if (ctx->rc == -ESHUTDOWN) { 8468 /* This context is being canceled. Do nothing. */ 8469 return; 8470 } 8471 8472 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8473 NULL, &ctx->desc); 8474 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8475 goto exit; 8476 } 8477 8478 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8479 if (spdk_get_ticks() >= timeout_ticks) { 8480 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8481 ctx->rc = -ETIMEDOUT; 8482 goto exit; 8483 } 8484 8485 return; 8486 8487 exit: 8488 spdk_poller_unregister(&ctx->poller); 8489 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8490 8491 /* Completion callback is processed after stack unwinding. 
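 * It is also dispatched to ctx->orig_thread after g_bdev_mgr.spinlock (held
 * around _bdev_open_async()) has been released, so the user's callback never
 * runs under the manager lock.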
*/ 8492 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8493 } 8494 8495 static int 8496 bdev_open_async(void *arg) 8497 { 8498 struct spdk_bdev_open_async_ctx *ctx = arg; 8499 8500 spdk_spin_lock(&g_bdev_mgr.spinlock); 8501 8502 _bdev_open_async(ctx); 8503 8504 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8505 8506 return SPDK_POLLER_BUSY; 8507 } 8508 8509 static void 8510 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8511 struct spdk_bdev_open_async_opts *opts_src, 8512 size_t size) 8513 { 8514 assert(opts); 8515 assert(opts_src); 8516 8517 opts->size = size; 8518 8519 #define SET_FIELD(field) \ 8520 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8521 opts->field = opts_src->field; \ 8522 } \ 8523 8524 SET_FIELD(timeout_ms); 8525 8526 /* Do not remove this statement, you should always update this statement when you adding a new field, 8527 * and do not forget to add the SET_FIELD statement for your added field. */ 8528 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8529 8530 #undef SET_FIELD 8531 } 8532 8533 static void 8534 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8535 { 8536 assert(opts); 8537 8538 opts->size = size; 8539 8540 #define SET_FIELD(field, value) \ 8541 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8542 opts->field = value; \ 8543 } \ 8544 8545 SET_FIELD(timeout_ms, 0); 8546 8547 #undef SET_FIELD 8548 } 8549 8550 int 8551 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8552 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8553 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8554 { 8555 struct spdk_bdev_open_async_ctx *ctx; 8556 8557 if (event_cb == NULL) { 8558 SPDK_ERRLOG("Missing event callback function\n"); 8559 return -EINVAL; 8560 } 8561 8562 if (open_cb == NULL) { 8563 SPDK_ERRLOG("Missing open callback function\n"); 8564 return -EINVAL; 8565 } 8566 8567 if (opts != NULL && opts->size == 0) { 8568 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8569 return -EINVAL; 8570 } 8571 8572 ctx = calloc(1, sizeof(*ctx)); 8573 if (ctx == NULL) { 8574 SPDK_ERRLOG("Failed to allocate open context\n"); 8575 return -ENOMEM; 8576 } 8577 8578 ctx->bdev_name = strdup(bdev_name); 8579 if (ctx->bdev_name == NULL) { 8580 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8581 free(ctx); 8582 return -ENOMEM; 8583 } 8584 8585 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8586 if (ctx->poller == NULL) { 8587 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8588 free(ctx->bdev_name); 8589 free(ctx); 8590 return -ENOMEM; 8591 } 8592 8593 ctx->cb_fn = open_cb; 8594 ctx->cb_arg = open_cb_arg; 8595 ctx->write = write; 8596 ctx->event_cb = event_cb; 8597 ctx->event_ctx = event_ctx; 8598 ctx->orig_thread = spdk_get_thread(); 8599 ctx->start_ticks = spdk_get_ticks(); 8600 8601 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8602 if (opts != NULL) { 8603 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8604 } 8605 8606 spdk_spin_lock(&g_bdev_mgr.spinlock); 8607 8608 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8609 _bdev_open_async(ctx); 8610 8611 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8612 8613 return 0; 8614 } 8615 8616 static void 8617 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8618 { 8619 int rc; 8620 8621 
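	/* Callers invoke this with g_bdev_mgr.spinlock held (bdev_unregister_unsafe()
	 * below asserts as much); the bdev and descriptor spinlocks are then taken
	 * on top of it.
	 */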
spdk_spin_lock(&bdev->internal.spinlock); 8622 spdk_spin_lock(&desc->spinlock); 8623 8624 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8625 8626 desc->closed = true; 8627 8628 if (desc->claim != NULL) { 8629 bdev_desc_release_claims(desc); 8630 } 8631 8632 if (0 == desc->refs) { 8633 spdk_spin_unlock(&desc->spinlock); 8634 bdev_desc_free(desc); 8635 } else { 8636 spdk_spin_unlock(&desc->spinlock); 8637 } 8638 8639 /* If no more descriptors, kill QoS channel */ 8640 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8641 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8642 bdev->name, spdk_get_thread()); 8643 8644 if (bdev_qos_destroy(bdev)) { 8645 /* There isn't anything we can do to recover here. Just let the 8646 * old QoS poller keep running. The QoS handling won't change 8647 * cores when the user allocates a new channel, but it won't break. */ 8648 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8649 } 8650 } 8651 8652 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8653 rc = bdev_unregister_unsafe(bdev); 8654 spdk_spin_unlock(&bdev->internal.spinlock); 8655 8656 if (rc == 0) { 8657 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8658 } 8659 } else { 8660 spdk_spin_unlock(&bdev->internal.spinlock); 8661 } 8662 } 8663 8664 void 8665 spdk_bdev_close(struct spdk_bdev_desc *desc) 8666 { 8667 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8668 8669 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8670 spdk_get_thread()); 8671 8672 assert(desc->thread == spdk_get_thread()); 8673 8674 spdk_poller_unregister(&desc->io_timeout_poller); 8675 8676 spdk_spin_lock(&g_bdev_mgr.spinlock); 8677 8678 bdev_close(bdev, desc); 8679 8680 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8681 } 8682 8683 int32_t 8684 spdk_bdev_get_numa_id(struct spdk_bdev *bdev) 8685 { 8686 if (bdev->numa.id_valid) { 8687 return bdev->numa.id; 8688 } else { 8689 return SPDK_ENV_NUMA_ID_ANY; 8690 } 8691 } 8692 8693 static void 8694 bdev_register_finished(void *arg) 8695 { 8696 struct spdk_bdev_desc *desc = arg; 8697 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8698 8699 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8700 8701 spdk_spin_lock(&g_bdev_mgr.spinlock); 8702 8703 bdev_close(bdev, desc); 8704 8705 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8706 } 8707 8708 int 8709 spdk_bdev_register(struct spdk_bdev *bdev) 8710 { 8711 struct spdk_bdev_desc *desc; 8712 struct spdk_thread *thread = spdk_get_thread(); 8713 int rc; 8714 8715 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8716 SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread, 8717 thread ? 
spdk_thread_get_name(thread) : "null"); 8718 return -EINVAL; 8719 } 8720 8721 rc = bdev_register(bdev); 8722 if (rc != 0) { 8723 return rc; 8724 } 8725 8726 /* A descriptor is opened to prevent bdev deletion during examination */ 8727 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 8728 if (rc != 0) { 8729 spdk_bdev_unregister(bdev, NULL, NULL); 8730 return rc; 8731 } 8732 8733 rc = bdev_open(bdev, false, desc); 8734 if (rc != 0) { 8735 bdev_desc_free(desc); 8736 spdk_bdev_unregister(bdev, NULL, NULL); 8737 return rc; 8738 } 8739 8740 /* Examine configuration before initializing I/O */ 8741 bdev_examine(bdev); 8742 8743 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8744 if (rc != 0) { 8745 bdev_close(bdev, desc); 8746 spdk_bdev_unregister(bdev, NULL, NULL); 8747 } 8748 8749 return rc; 8750 } 8751 8752 int 8753 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8754 struct spdk_bdev_module *module) 8755 { 8756 spdk_spin_lock(&bdev->internal.spinlock); 8757 8758 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8759 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8760 spdk_spin_unlock(&bdev->internal.spinlock); 8761 return -EPERM; 8762 } 8763 8764 if (desc && !desc->write) { 8765 desc->write = true; 8766 } 8767 8768 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8769 bdev->internal.claim.v1.module = module; 8770 8771 spdk_spin_unlock(&bdev->internal.spinlock); 8772 return 0; 8773 } 8774 8775 void 8776 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8777 { 8778 spdk_spin_lock(&bdev->internal.spinlock); 8779 8780 assert(bdev->internal.claim.v1.module != NULL); 8781 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8782 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8783 bdev->internal.claim.v1.module = NULL; 8784 8785 spdk_spin_unlock(&bdev->internal.spinlock); 8786 } 8787 8788 /* 8789 * Start claims v2 8790 */ 8791 8792 const char * 8793 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8794 { 8795 switch (type) { 8796 case SPDK_BDEV_CLAIM_NONE: 8797 return "not_claimed"; 8798 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8799 return "exclusive_write"; 8800 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8801 return "read_many_write_one"; 8802 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8803 return "read_many_write_none"; 8804 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8805 return "read_many_write_many"; 8806 default: 8807 break; 8808 } 8809 return "invalid_claim"; 8810 } 8811 8812 static bool 8813 claim_type_is_v2(enum spdk_bdev_claim_type type) 8814 { 8815 switch (type) { 8816 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8817 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8818 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8819 return true; 8820 default: 8821 break; 8822 } 8823 return false; 8824 } 8825 8826 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
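 * These are the claim types that grant the claiming descriptor write access:
 * read_many_write_one and read_many_write_many (shared).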
*/ 8827 static bool 8828 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8829 { 8830 switch (type) { 8831 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8832 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8833 return true; 8834 default: 8835 break; 8836 } 8837 return false; 8838 } 8839 8840 void 8841 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8842 { 8843 if (opts == NULL) { 8844 SPDK_ERRLOG("opts should not be NULL\n"); 8845 assert(opts != NULL); 8846 return; 8847 } 8848 if (size == 0) { 8849 SPDK_ERRLOG("size should not be zero\n"); 8850 assert(size != 0); 8851 return; 8852 } 8853 8854 memset(opts, 0, size); 8855 opts->opts_size = size; 8856 8857 #define FIELD_OK(field) \ 8858 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8859 8860 #define SET_FIELD(field, value) \ 8861 if (FIELD_OK(field)) { \ 8862 opts->field = value; \ 8863 } \ 8864 8865 SET_FIELD(shared_claim_key, 0); 8866 8867 #undef FIELD_OK 8868 #undef SET_FIELD 8869 } 8870 8871 static int 8872 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8873 { 8874 if (src->opts_size == 0) { 8875 SPDK_ERRLOG("size should not be zero\n"); 8876 return -1; 8877 } 8878 8879 memset(dst, 0, sizeof(*dst)); 8880 dst->opts_size = src->opts_size; 8881 8882 #define FIELD_OK(field) \ 8883 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8884 8885 #define SET_FIELD(field) \ 8886 if (FIELD_OK(field)) { \ 8887 dst->field = src->field; \ 8888 } \ 8889 8890 if (FIELD_OK(name)) { 8891 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8892 } 8893 8894 SET_FIELD(shared_claim_key); 8895 8896 /* You should not remove this statement, but need to update the assert statement 8897 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8898 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8899 8900 #undef FIELD_OK 8901 #undef SET_FIELD 8902 return 0; 8903 } 8904 8905 /* Returns 0 if a read-write-once claim can be taken. */ 8906 static int 8907 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8908 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8909 { 8910 struct spdk_bdev *bdev = desc->bdev; 8911 struct spdk_bdev_desc *open_desc; 8912 8913 assert(spdk_spin_held(&bdev->internal.spinlock)); 8914 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8915 8916 if (opts->shared_claim_key != 0) { 8917 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8918 bdev->name); 8919 return -EINVAL; 8920 } 8921 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8922 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8923 return -EPERM; 8924 } 8925 if (desc->claim != NULL) { 8926 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8927 bdev->name, desc->claim->module->name); 8928 return -EPERM; 8929 } 8930 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8931 if (desc != open_desc && open_desc->write) { 8932 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8933 "another descriptor is open for writing\n", 8934 bdev->name); 8935 return -EPERM; 8936 } 8937 } 8938 8939 return 0; 8940 } 8941 8942 /* Returns 0 if a read-only-many claim can be taken. 
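 * This requires that the descriptor itself is read-only, that no
 * shared_claim_key was supplied, and, if the bdev is not yet claimed, that no
 * other descriptor is currently open for writing.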
*/ 8943 static int 8944 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8945 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8946 { 8947 struct spdk_bdev *bdev = desc->bdev; 8948 struct spdk_bdev_desc *open_desc; 8949 8950 assert(spdk_spin_held(&bdev->internal.spinlock)); 8951 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 8952 assert(desc->claim == NULL); 8953 8954 if (desc->write) { 8955 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 8956 bdev->name); 8957 return -EINVAL; 8958 } 8959 if (opts->shared_claim_key != 0) { 8960 SPDK_ERRLOG("%s: key option not supported with read-only-may claims\n", bdev->name); 8961 return -EINVAL; 8962 } 8963 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8964 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8965 if (open_desc->write) { 8966 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 8967 "another descriptor is open for writing\n", 8968 bdev->name); 8969 return -EPERM; 8970 } 8971 } 8972 } 8973 8974 return 0; 8975 } 8976 8977 /* Returns 0 if a read-write-many claim can be taken. */ 8978 static int 8979 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8980 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8981 { 8982 struct spdk_bdev *bdev = desc->bdev; 8983 struct spdk_bdev_desc *open_desc; 8984 8985 assert(spdk_spin_held(&bdev->internal.spinlock)); 8986 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 8987 assert(desc->claim == NULL); 8988 8989 if (opts->shared_claim_key == 0) { 8990 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-may claims\n", 8991 bdev->name); 8992 return -EINVAL; 8993 } 8994 switch (bdev->internal.claim_type) { 8995 case SPDK_BDEV_CLAIM_NONE: 8996 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8997 if (open_desc == desc) { 8998 continue; 8999 } 9000 if (open_desc->write) { 9001 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 9002 "another descriptor is open for writing without a " 9003 "claim\n", bdev->name); 9004 return -EPERM; 9005 } 9006 } 9007 break; 9008 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9009 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 9010 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 9011 return -EPERM; 9012 } 9013 break; 9014 default: 9015 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9016 return -EBUSY; 9017 } 9018 9019 return 0; 9020 } 9021 9022 /* Updates desc and its bdev with a v2 claim. 
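 * Called with bdev->internal.spinlock held, after the type-specific
 * claim_verify_*() check has passed.
 *
 * Hedged sketch of how a module typically reaches this path through the
 * public API (my_bdev_module is a hypothetical module):
 *
 *     struct spdk_bdev_claim_opts copts;
 *
 *     spdk_bdev_claim_opts_init(&copts, sizeof(copts));
 *     rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE,
 *                                           &copts, &my_bdev_module);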
*/ 9023 static int 9024 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9025 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9026 { 9027 struct spdk_bdev *bdev = desc->bdev; 9028 struct spdk_bdev_module_claim *claim; 9029 9030 assert(spdk_spin_held(&bdev->internal.spinlock)); 9031 assert(claim_type_is_v2(type)); 9032 assert(desc->claim == NULL); 9033 9034 claim = calloc(1, sizeof(*desc->claim)); 9035 if (claim == NULL) { 9036 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 9037 return -ENOMEM; 9038 } 9039 claim->module = module; 9040 claim->desc = desc; 9041 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 9042 memcpy(claim->name, opts->name, sizeof(claim->name)); 9043 desc->claim = claim; 9044 9045 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 9046 bdev->internal.claim_type = type; 9047 TAILQ_INIT(&bdev->internal.claim.v2.claims); 9048 bdev->internal.claim.v2.key = opts->shared_claim_key; 9049 } 9050 assert(type == bdev->internal.claim_type); 9051 9052 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 9053 9054 if (!desc->write && claim_type_promotes_to_write(type)) { 9055 desc->write = true; 9056 } 9057 9058 return 0; 9059 } 9060 9061 int 9062 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9063 struct spdk_bdev_claim_opts *_opts, 9064 struct spdk_bdev_module *module) 9065 { 9066 struct spdk_bdev *bdev; 9067 struct spdk_bdev_claim_opts opts; 9068 int rc = 0; 9069 9070 if (desc == NULL) { 9071 SPDK_ERRLOG("descriptor must not be NULL\n"); 9072 return -EINVAL; 9073 } 9074 9075 bdev = desc->bdev; 9076 9077 if (_opts == NULL) { 9078 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 9079 } else if (claim_opts_copy(_opts, &opts) != 0) { 9080 return -EINVAL; 9081 } 9082 9083 spdk_spin_lock(&bdev->internal.spinlock); 9084 9085 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 9086 bdev->internal.claim_type != type) { 9087 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9088 spdk_spin_unlock(&bdev->internal.spinlock); 9089 return -EPERM; 9090 } 9091 9092 if (claim_type_is_v2(type) && desc->claim != NULL) { 9093 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 9094 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 9095 spdk_spin_unlock(&bdev->internal.spinlock); 9096 return -EPERM; 9097 } 9098 9099 switch (type) { 9100 case SPDK_BDEV_CLAIM_EXCL_WRITE: 9101 spdk_spin_unlock(&bdev->internal.spinlock); 9102 return spdk_bdev_module_claim_bdev(bdev, desc, module); 9103 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9104 rc = claim_verify_rwo(desc, type, &opts, module); 9105 break; 9106 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9107 rc = claim_verify_rom(desc, type, &opts, module); 9108 break; 9109 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9110 rc = claim_verify_rwm(desc, type, &opts, module); 9111 break; 9112 default: 9113 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 9114 rc = -ENOTSUP; 9115 } 9116 9117 if (rc == 0) { 9118 rc = claim_bdev(desc, type, &opts, module); 9119 } 9120 9121 spdk_spin_unlock(&bdev->internal.spinlock); 9122 return rc; 9123 } 9124 9125 static void 9126 claim_reset(struct spdk_bdev *bdev) 9127 { 9128 assert(spdk_spin_held(&bdev->internal.spinlock)); 9129 assert(claim_type_is_v2(bdev->internal.claim_type)); 9130 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 9131 9132 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 9133 
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 9134 } 9135 9136 static void 9137 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 9138 { 9139 struct spdk_bdev *bdev = desc->bdev; 9140 9141 assert(spdk_spin_held(&bdev->internal.spinlock)); 9142 assert(claim_type_is_v2(bdev->internal.claim_type)); 9143 9144 if (bdev->internal.examine_in_progress == 0) { 9145 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 9146 free(desc->claim); 9147 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 9148 claim_reset(bdev); 9149 } 9150 } else { 9151 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 9152 desc->claim->module = NULL; 9153 desc->claim->desc = NULL; 9154 } 9155 desc->claim = NULL; 9156 } 9157 9158 /* 9159 * End claims v2 9160 */ 9161 9162 struct spdk_bdev * 9163 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 9164 { 9165 assert(desc != NULL); 9166 return desc->bdev; 9167 } 9168 9169 int 9170 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 9171 { 9172 struct spdk_bdev *bdev, *tmp; 9173 struct spdk_bdev_desc *desc; 9174 int rc = 0; 9175 9176 assert(fn != NULL); 9177 9178 spdk_spin_lock(&g_bdev_mgr.spinlock); 9179 bdev = spdk_bdev_first(); 9180 while (bdev != NULL) { 9181 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9182 if (rc != 0) { 9183 break; 9184 } 9185 rc = bdev_open(bdev, false, desc); 9186 if (rc != 0) { 9187 bdev_desc_free(desc); 9188 if (rc == -ENODEV) { 9189 /* Ignore the error and move to the next bdev. */ 9190 rc = 0; 9191 bdev = spdk_bdev_next(bdev); 9192 continue; 9193 } 9194 break; 9195 } 9196 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9197 9198 rc = fn(ctx, bdev); 9199 9200 spdk_spin_lock(&g_bdev_mgr.spinlock); 9201 tmp = spdk_bdev_next(bdev); 9202 bdev_close(bdev, desc); 9203 if (rc != 0) { 9204 break; 9205 } 9206 bdev = tmp; 9207 } 9208 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9209 9210 return rc; 9211 } 9212 9213 int 9214 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 9215 { 9216 struct spdk_bdev *bdev, *tmp; 9217 struct spdk_bdev_desc *desc; 9218 int rc = 0; 9219 9220 assert(fn != NULL); 9221 9222 spdk_spin_lock(&g_bdev_mgr.spinlock); 9223 bdev = spdk_bdev_first_leaf(); 9224 while (bdev != NULL) { 9225 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9226 if (rc != 0) { 9227 break; 9228 } 9229 rc = bdev_open(bdev, false, desc); 9230 if (rc != 0) { 9231 bdev_desc_free(desc); 9232 if (rc == -ENODEV) { 9233 /* Ignore the error and move to the next bdev. 
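 * A return value of -ENODEV means the bdev is being unregistered, so it is
 * skipped rather than aborting the whole iteration.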
*/ 9234 rc = 0; 9235 bdev = spdk_bdev_next_leaf(bdev); 9236 continue; 9237 } 9238 break; 9239 } 9240 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9241 9242 rc = fn(ctx, bdev); 9243 9244 spdk_spin_lock(&g_bdev_mgr.spinlock); 9245 tmp = spdk_bdev_next_leaf(bdev); 9246 bdev_close(bdev, desc); 9247 if (rc != 0) { 9248 break; 9249 } 9250 bdev = tmp; 9251 } 9252 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9253 9254 return rc; 9255 } 9256 9257 void 9258 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 9259 { 9260 struct iovec *iovs; 9261 int iovcnt; 9262 9263 if (bdev_io == NULL) { 9264 return; 9265 } 9266 9267 switch (bdev_io->type) { 9268 case SPDK_BDEV_IO_TYPE_READ: 9269 case SPDK_BDEV_IO_TYPE_WRITE: 9270 case SPDK_BDEV_IO_TYPE_ZCOPY: 9271 iovs = bdev_io->u.bdev.iovs; 9272 iovcnt = bdev_io->u.bdev.iovcnt; 9273 break; 9274 default: 9275 iovs = NULL; 9276 iovcnt = 0; 9277 break; 9278 } 9279 9280 if (iovp) { 9281 *iovp = iovs; 9282 } 9283 if (iovcntp) { 9284 *iovcntp = iovcnt; 9285 } 9286 } 9287 9288 void * 9289 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 9290 { 9291 if (bdev_io == NULL) { 9292 return NULL; 9293 } 9294 9295 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 9296 return NULL; 9297 } 9298 9299 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 9300 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 9301 return bdev_io->u.bdev.md_buf; 9302 } 9303 9304 return NULL; 9305 } 9306 9307 void * 9308 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9309 { 9310 if (bdev_io == NULL) { 9311 assert(false); 9312 return NULL; 9313 } 9314 9315 return bdev_io->internal.caller_ctx; 9316 } 9317 9318 void 9319 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9320 { 9321 9322 if (spdk_bdev_module_list_find(bdev_module->name)) { 9323 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9324 assert(false); 9325 } 9326 9327 spdk_spin_init(&bdev_module->internal.spinlock); 9328 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9329 9330 /* 9331 * Modules with examine callbacks must be initialized first, so they are 9332 * ready to handle examine callbacks from later modules that will 9333 * register physical bdevs. 
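 * For that reason, modules with examine callbacks are inserted at the head of
 * the module list below, while modules without them are appended at the tail.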
9334 */ 9335 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9336 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9337 } else { 9338 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9339 } 9340 } 9341 9342 struct spdk_bdev_module * 9343 spdk_bdev_module_list_find(const char *name) 9344 { 9345 struct spdk_bdev_module *bdev_module; 9346 9347 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9348 if (strcmp(name, bdev_module->name) == 0) { 9349 break; 9350 } 9351 } 9352 9353 return bdev_module; 9354 } 9355 9356 static int 9357 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9358 { 9359 uint64_t num_blocks; 9360 void *md_buf = NULL; 9361 9362 num_blocks = bdev_io->u.bdev.num_blocks; 9363 9364 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9365 md_buf = (char *)g_bdev_mgr.zero_buffer + 9366 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9367 } 9368 9369 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9370 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9371 g_bdev_mgr.zero_buffer, md_buf, 9372 bdev_io->u.bdev.offset_blocks, num_blocks, 9373 bdev_write_zero_buffer_done, bdev_io); 9374 } 9375 9376 static void 9377 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9378 { 9379 struct spdk_bdev_io *parent_io = cb_arg; 9380 9381 spdk_bdev_free_io(bdev_io); 9382 9383 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9384 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9385 } 9386 9387 static void 9388 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9389 { 9390 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9391 ctx->bdev->internal.qos_mod_in_progress = false; 9392 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9393 9394 if (ctx->cb_fn) { 9395 ctx->cb_fn(ctx->cb_arg, status); 9396 } 9397 free(ctx); 9398 } 9399 9400 static void 9401 bdev_disable_qos_done(void *cb_arg) 9402 { 9403 struct set_qos_limit_ctx *ctx = cb_arg; 9404 struct spdk_bdev *bdev = ctx->bdev; 9405 struct spdk_bdev_qos *qos; 9406 9407 spdk_spin_lock(&bdev->internal.spinlock); 9408 qos = bdev->internal.qos; 9409 bdev->internal.qos = NULL; 9410 spdk_spin_unlock(&bdev->internal.spinlock); 9411 9412 if (qos->thread != NULL) { 9413 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9414 spdk_poller_unregister(&qos->poller); 9415 } 9416 9417 free(qos); 9418 9419 bdev_set_qos_limit_done(ctx, 0); 9420 } 9421 9422 static void 9423 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9424 { 9425 struct set_qos_limit_ctx *ctx = _ctx; 9426 struct spdk_thread *thread; 9427 9428 spdk_spin_lock(&bdev->internal.spinlock); 9429 thread = bdev->internal.qos->thread; 9430 spdk_spin_unlock(&bdev->internal.spinlock); 9431 9432 if (thread != NULL) { 9433 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9434 } else { 9435 bdev_disable_qos_done(ctx); 9436 } 9437 } 9438 9439 static void 9440 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9441 struct spdk_io_channel *ch, void *_ctx) 9442 { 9443 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9444 struct spdk_bdev_io *bdev_io; 9445 9446 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9447 9448 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9449 /* Re-submit the queued I/O. 
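 * QoS has just been disabled on this channel, so I/O that the QoS logic had
 * queued goes back through the normal submit path.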
*/ 9450 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9451 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9452 _bdev_io_submit(bdev_io); 9453 } 9454 9455 spdk_bdev_for_each_channel_continue(i, 0); 9456 } 9457 9458 static void 9459 bdev_update_qos_rate_limit_msg(void *cb_arg) 9460 { 9461 struct set_qos_limit_ctx *ctx = cb_arg; 9462 struct spdk_bdev *bdev = ctx->bdev; 9463 9464 spdk_spin_lock(&bdev->internal.spinlock); 9465 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9466 spdk_spin_unlock(&bdev->internal.spinlock); 9467 9468 bdev_set_qos_limit_done(ctx, 0); 9469 } 9470 9471 static void 9472 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9473 struct spdk_io_channel *ch, void *_ctx) 9474 { 9475 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9476 9477 spdk_spin_lock(&bdev->internal.spinlock); 9478 bdev_enable_qos(bdev, bdev_ch); 9479 spdk_spin_unlock(&bdev->internal.spinlock); 9480 spdk_bdev_for_each_channel_continue(i, 0); 9481 } 9482 9483 static void 9484 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9485 { 9486 struct set_qos_limit_ctx *ctx = _ctx; 9487 9488 bdev_set_qos_limit_done(ctx, status); 9489 } 9490 9491 static void 9492 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9493 { 9494 int i; 9495 9496 assert(bdev->internal.qos != NULL); 9497 9498 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9499 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9500 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9501 9502 if (limits[i] == 0) { 9503 bdev->internal.qos->rate_limits[i].limit = 9504 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9505 } 9506 } 9507 } 9508 } 9509 9510 void 9511 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9512 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9513 { 9514 struct set_qos_limit_ctx *ctx; 9515 uint32_t limit_set_complement; 9516 uint64_t min_limit_per_sec; 9517 int i; 9518 bool disable_rate_limit = true; 9519 9520 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9521 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9522 continue; 9523 } 9524 9525 if (limits[i] > 0) { 9526 disable_rate_limit = false; 9527 } 9528 9529 if (bdev_qos_is_iops_rate_limit(i) == true) { 9530 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9531 } else { 9532 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9533 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9534 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9535 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9536 } 9537 /* Change from megabyte to byte rate limit */ 9538 limits[i] = limits[i] * 1024 * 1024; 9539 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9540 } 9541 9542 limit_set_complement = limits[i] % min_limit_per_sec; 9543 if (limit_set_complement) { 9544 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9545 limits[i], min_limit_per_sec); 9546 limits[i] += min_limit_per_sec - limit_set_complement; 9547 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9548 } 9549 } 9550 9551 ctx = calloc(1, sizeof(*ctx)); 9552 if (ctx == NULL) { 9553 cb_fn(cb_arg, -ENOMEM); 9554 return; 9555 } 9556 9557 ctx->cb_fn = cb_fn; 9558 ctx->cb_arg = cb_arg; 9559 ctx->bdev = bdev; 9560 9561 spdk_spin_lock(&bdev->internal.spinlock); 9562 if (bdev->internal.qos_mod_in_progress) { 9563 spdk_spin_unlock(&bdev->internal.spinlock); 9564 free(ctx); 9565 cb_fn(cb_arg, 
-EAGAIN); 9566 return; 9567 } 9568 bdev->internal.qos_mod_in_progress = true; 9569 9570 if (disable_rate_limit == true && bdev->internal.qos) { 9571 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9572 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9573 (bdev->internal.qos->rate_limits[i].limit > 0 && 9574 bdev->internal.qos->rate_limits[i].limit != 9575 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9576 disable_rate_limit = false; 9577 break; 9578 } 9579 } 9580 } 9581 9582 if (disable_rate_limit == false) { 9583 if (bdev->internal.qos == NULL) { 9584 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9585 if (!bdev->internal.qos) { 9586 spdk_spin_unlock(&bdev->internal.spinlock); 9587 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9588 bdev_set_qos_limit_done(ctx, -ENOMEM); 9589 return; 9590 } 9591 } 9592 9593 if (bdev->internal.qos->thread == NULL) { 9594 /* Enabling */ 9595 bdev_set_qos_rate_limits(bdev, limits); 9596 9597 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9598 bdev_enable_qos_done); 9599 } else { 9600 /* Updating */ 9601 bdev_set_qos_rate_limits(bdev, limits); 9602 9603 spdk_thread_send_msg(bdev->internal.qos->thread, 9604 bdev_update_qos_rate_limit_msg, ctx); 9605 } 9606 } else { 9607 if (bdev->internal.qos != NULL) { 9608 bdev_set_qos_rate_limits(bdev, limits); 9609 9610 /* Disabling */ 9611 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9612 bdev_disable_qos_msg_done); 9613 } else { 9614 spdk_spin_unlock(&bdev->internal.spinlock); 9615 bdev_set_qos_limit_done(ctx, 0); 9616 return; 9617 } 9618 } 9619 9620 spdk_spin_unlock(&bdev->internal.spinlock); 9621 } 9622 9623 struct spdk_bdev_histogram_ctx { 9624 spdk_bdev_histogram_status_cb cb_fn; 9625 void *cb_arg; 9626 struct spdk_bdev *bdev; 9627 int status; 9628 }; 9629 9630 static void 9631 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9632 { 9633 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9634 9635 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9636 ctx->bdev->internal.histogram_in_progress = false; 9637 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9638 ctx->cb_fn(ctx->cb_arg, ctx->status); 9639 free(ctx); 9640 } 9641 9642 static void 9643 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9644 struct spdk_io_channel *_ch, void *_ctx) 9645 { 9646 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9647 9648 if (ch->histogram != NULL) { 9649 spdk_histogram_data_free(ch->histogram); 9650 ch->histogram = NULL; 9651 } 9652 spdk_bdev_for_each_channel_continue(i, 0); 9653 } 9654 9655 static void 9656 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9657 { 9658 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9659 9660 if (status != 0) { 9661 ctx->status = status; 9662 ctx->bdev->internal.histogram_enabled = false; 9663 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9664 bdev_histogram_disable_channel_cb); 9665 } else { 9666 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9667 ctx->bdev->internal.histogram_in_progress = false; 9668 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9669 ctx->cb_fn(ctx->cb_arg, ctx->status); 9670 free(ctx); 9671 } 9672 } 9673 9674 static void 9675 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9676 struct spdk_io_channel *_ch, void *_ctx) 9677 { 9678 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9679 int status = 0; 9680 9681 if (ch->histogram == NULL) { 9682 
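/* Allocate histogram data for this channel. A failed allocation is reported
 * through the channel iterator and rolled back in bdev_histogram_enable_channel_cb(). */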
ch->histogram = spdk_histogram_data_alloc(); 9683 if (ch->histogram == NULL) { 9684 status = -ENOMEM; 9685 } 9686 } 9687 9688 spdk_bdev_for_each_channel_continue(i, status); 9689 } 9690 9691 void 9692 spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9693 void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts) 9694 { 9695 struct spdk_bdev_histogram_ctx *ctx; 9696 9697 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9698 if (ctx == NULL) { 9699 cb_fn(cb_arg, -ENOMEM); 9700 return; 9701 } 9702 9703 ctx->bdev = bdev; 9704 ctx->status = 0; 9705 ctx->cb_fn = cb_fn; 9706 ctx->cb_arg = cb_arg; 9707 9708 spdk_spin_lock(&bdev->internal.spinlock); 9709 if (bdev->internal.histogram_in_progress) { 9710 spdk_spin_unlock(&bdev->internal.spinlock); 9711 free(ctx); 9712 cb_fn(cb_arg, -EAGAIN); 9713 return; 9714 } 9715 9716 bdev->internal.histogram_in_progress = true; 9717 spdk_spin_unlock(&bdev->internal.spinlock); 9718 9719 bdev->internal.histogram_enabled = enable; 9720 bdev->internal.histogram_io_type = opts->io_type; 9721 9722 if (enable) { 9723 /* Allocate histogram for each channel */ 9724 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9725 bdev_histogram_enable_channel_cb); 9726 } else { 9727 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9728 bdev_histogram_disable_channel_cb); 9729 } 9730 } 9731 9732 void 9733 spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size) 9734 { 9735 if (opts == NULL) { 9736 SPDK_ERRLOG("opts should not be NULL\n"); 9737 assert(opts != NULL); 9738 return; 9739 } 9740 if (size == 0) { 9741 SPDK_ERRLOG("size should not be zero\n"); 9742 assert(size != 0); 9743 return; 9744 } 9745 9746 memset(opts, 0, size); 9747 opts->size = size; 9748 9749 #define FIELD_OK(field) \ 9750 offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size 9751 9752 #define SET_FIELD(field, value) \ 9753 if (FIELD_OK(field)) { \ 9754 opts->field = value; \ 9755 } \ 9756 9757 SET_FIELD(io_type, 0); 9758 9759 /* You should not remove this statement, but need to update the assert statement 9760 * if you add a new field, and also add a corresponding SET_FIELD statement */ 9761 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size"); 9762 9763 #undef FIELD_OK 9764 #undef SET_FIELD 9765 } 9766 9767 void 9768 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9769 void *cb_arg, bool enable) 9770 { 9771 struct spdk_bdev_enable_histogram_opts opts; 9772 9773 spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts)); 9774 spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts); 9775 } 9776 9777 struct spdk_bdev_histogram_data_ctx { 9778 spdk_bdev_histogram_data_cb cb_fn; 9779 void *cb_arg; 9780 struct spdk_bdev *bdev; 9781 /** merged histogram data from all channels */ 9782 struct spdk_histogram_data *histogram; 9783 }; 9784 9785 static void 9786 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9787 { 9788 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9789 9790 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9791 free(ctx); 9792 } 9793 9794 static void 9795 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9796 struct spdk_io_channel *_ch, void *_ctx) 9797 { 9798 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9799 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9800 int 
status = 0; 9801 9802 if (ch->histogram == NULL) { 9803 status = -EFAULT; 9804 } else { 9805 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9806 } 9807 9808 spdk_bdev_for_each_channel_continue(i, status); 9809 } 9810 9811 void 9812 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9813 spdk_bdev_histogram_data_cb cb_fn, 9814 void *cb_arg) 9815 { 9816 struct spdk_bdev_histogram_data_ctx *ctx; 9817 9818 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9819 if (ctx == NULL) { 9820 cb_fn(cb_arg, -ENOMEM, NULL); 9821 return; 9822 } 9823 9824 ctx->bdev = bdev; 9825 ctx->cb_fn = cb_fn; 9826 ctx->cb_arg = cb_arg; 9827 9828 ctx->histogram = histogram; 9829 9830 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9831 bdev_histogram_get_channel_cb); 9832 } 9833 9834 void 9835 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9836 void *cb_arg) 9837 { 9838 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9839 int status = 0; 9840 9841 assert(cb_fn != NULL); 9842 9843 if (bdev_ch->histogram == NULL) { 9844 status = -EFAULT; 9845 } 9846 cb_fn(cb_arg, status, bdev_ch->histogram); 9847 } 9848 9849 size_t 9850 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9851 size_t max_events) 9852 { 9853 struct media_event_entry *entry; 9854 size_t num_events = 0; 9855 9856 for (; num_events < max_events; ++num_events) { 9857 entry = TAILQ_FIRST(&desc->pending_media_events); 9858 if (entry == NULL) { 9859 break; 9860 } 9861 9862 events[num_events] = entry->event; 9863 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9864 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9865 } 9866 9867 return num_events; 9868 } 9869 9870 int 9871 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9872 size_t num_events) 9873 { 9874 struct spdk_bdev_desc *desc; 9875 struct media_event_entry *entry; 9876 size_t event_id; 9877 int rc = 0; 9878 9879 assert(bdev->media_events); 9880 9881 spdk_spin_lock(&bdev->internal.spinlock); 9882 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9883 if (desc->write) { 9884 break; 9885 } 9886 } 9887 9888 if (desc == NULL || desc->media_events_buffer == NULL) { 9889 rc = -ENODEV; 9890 goto out; 9891 } 9892 9893 for (event_id = 0; event_id < num_events; ++event_id) { 9894 entry = TAILQ_FIRST(&desc->free_media_events); 9895 if (entry == NULL) { 9896 break; 9897 } 9898 9899 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9900 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9901 entry->event = events[event_id]; 9902 } 9903 9904 rc = event_id; 9905 out: 9906 spdk_spin_unlock(&bdev->internal.spinlock); 9907 return rc; 9908 } 9909 9910 static void 9911 _media_management_notify(void *arg) 9912 { 9913 struct spdk_bdev_desc *desc = arg; 9914 9915 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9916 } 9917 9918 void 9919 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9920 { 9921 struct spdk_bdev_desc *desc; 9922 9923 spdk_spin_lock(&bdev->internal.spinlock); 9924 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9925 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9926 event_notify(desc, _media_management_notify); 9927 } 9928 } 9929 spdk_spin_unlock(&bdev->internal.spinlock); 9930 } 9931 9932 struct locked_lba_range_ctx { 9933 struct lba_range range; 9934 struct lba_range *current_range; 9935 struct lba_range *owner_range; 9936 
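/* Poller used by bdev_lock_lba_range_check_io() to wait until no outstanding
 * I/O overlaps the range on a channel. */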
struct spdk_poller *poller; 9937 lock_range_cb cb_fn; 9938 void *cb_arg; 9939 }; 9940 9941 static void 9942 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9943 { 9944 struct locked_lba_range_ctx *ctx = _ctx; 9945 9946 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9947 free(ctx); 9948 } 9949 9950 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9951 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9952 9953 static void 9954 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9955 { 9956 struct locked_lba_range_ctx *ctx = _ctx; 9957 9958 if (status == -ENOMEM) { 9959 /* One of the channels could not allocate a range object. 9960 * So we have to go back and clean up any ranges that were 9961 * allocated successfully before we return error status to 9962 * the caller. We can reuse the unlock function to do that 9963 * clean up. 9964 */ 9965 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9966 bdev_lock_error_cleanup_cb); 9967 return; 9968 } 9969 9970 /* All channels have locked this range and no I/O overlapping the range 9971 * are outstanding! Set the owner_ch for the range object for the 9972 * locking channel, so that this channel will know that it is allowed 9973 * to write to this range. 9974 */ 9975 if (ctx->owner_range != NULL) { 9976 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9977 } 9978 9979 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9980 9981 /* Don't free the ctx here. Its range is in the bdev's global list of 9982 * locked ranges still, and will be removed and freed when this range 9983 * is later unlocked. 9984 */ 9985 } 9986 9987 static int 9988 bdev_lock_lba_range_check_io(void *_i) 9989 { 9990 struct spdk_bdev_channel_iter *i = _i; 9991 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9992 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9993 struct locked_lba_range_ctx *ctx = i->ctx; 9994 struct lba_range *range = ctx->current_range; 9995 struct spdk_bdev_io *bdev_io; 9996 9997 spdk_poller_unregister(&ctx->poller); 9998 9999 /* The range is now in the locked_ranges, so no new IO can be submitted to this 10000 * range. But we need to wait until any outstanding IO overlapping with this range 10001 * are completed. 10002 */ 10003 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 10004 if (bdev_io_range_is_locked(bdev_io, range)) { 10005 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 10006 return SPDK_POLLER_BUSY; 10007 } 10008 } 10009 10010 spdk_bdev_for_each_channel_continue(i, 0); 10011 return SPDK_POLLER_BUSY; 10012 } 10013 10014 static void 10015 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10016 struct spdk_io_channel *_ch, void *_ctx) 10017 { 10018 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10019 struct locked_lba_range_ctx *ctx = _ctx; 10020 struct lba_range *range; 10021 10022 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10023 if (range->length == ctx->range.length && 10024 range->offset == ctx->range.offset && 10025 range->locked_ctx == ctx->range.locked_ctx) { 10026 /* This range already exists on this channel, so don't add 10027 * it again. This can happen when a new channel is created 10028 * while the for_each_channel operation is in progress. 10029 * Do not check for outstanding I/O in that case, since the 10030 * range was locked before any I/O could be submitted to the 10031 * new channel. 
10032 */ 10033 spdk_bdev_for_each_channel_continue(i, 0); 10034 return; 10035 } 10036 } 10037 10038 range = calloc(1, sizeof(*range)); 10039 if (range == NULL) { 10040 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 10041 return; 10042 } 10043 10044 range->length = ctx->range.length; 10045 range->offset = ctx->range.offset; 10046 range->locked_ctx = ctx->range.locked_ctx; 10047 range->quiesce = ctx->range.quiesce; 10048 ctx->current_range = range; 10049 if (ctx->range.owner_ch == ch) { 10050 /* This is the range object for the channel that will hold 10051 * the lock. Store it in the ctx object so that we can easily 10052 * set its owner_ch after the lock is finally acquired. 10053 */ 10054 ctx->owner_range = range; 10055 } 10056 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 10057 bdev_lock_lba_range_check_io(i); 10058 } 10059 10060 static void 10061 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 10062 { 10063 assert(spdk_get_thread() == ctx->range.owner_thread); 10064 assert(ctx->range.owner_ch == NULL || 10065 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 10066 10067 /* We will add a copy of this range to each channel now. */ 10068 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 10069 bdev_lock_lba_range_cb); 10070 } 10071 10072 static bool 10073 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 10074 { 10075 struct lba_range *r; 10076 10077 TAILQ_FOREACH(r, tailq, tailq) { 10078 if (bdev_lba_range_overlapped(range, r)) { 10079 return true; 10080 } 10081 } 10082 return false; 10083 } 10084 10085 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status); 10086 10087 static int 10088 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 10089 uint64_t offset, uint64_t length, 10090 lock_range_cb cb_fn, void *cb_arg) 10091 { 10092 struct locked_lba_range_ctx *ctx; 10093 10094 ctx = calloc(1, sizeof(*ctx)); 10095 if (ctx == NULL) { 10096 return -ENOMEM; 10097 } 10098 10099 ctx->range.offset = offset; 10100 ctx->range.length = length; 10101 ctx->range.owner_thread = spdk_get_thread(); 10102 ctx->range.owner_ch = ch; 10103 ctx->range.locked_ctx = cb_arg; 10104 ctx->range.bdev = bdev; 10105 ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked); 10106 ctx->cb_fn = cb_fn; 10107 ctx->cb_arg = cb_arg; 10108 10109 spdk_spin_lock(&bdev->internal.spinlock); 10110 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 10111 /* There is an active lock overlapping with this range. 10112 * Put it on the pending list until this range no 10113 * longer overlaps with another. 
10114 */ 10115 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 10116 } else { 10117 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 10118 bdev_lock_lba_range_ctx(bdev, ctx); 10119 } 10120 spdk_spin_unlock(&bdev->internal.spinlock); 10121 return 0; 10122 } 10123 10124 static int 10125 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10126 uint64_t offset, uint64_t length, 10127 lock_range_cb cb_fn, void *cb_arg) 10128 { 10129 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10130 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10131 10132 if (cb_arg == NULL) { 10133 SPDK_ERRLOG("cb_arg must not be NULL\n"); 10134 return -EINVAL; 10135 } 10136 10137 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 10138 } 10139 10140 static void 10141 bdev_lock_lba_range_ctx_msg(void *_ctx) 10142 { 10143 struct locked_lba_range_ctx *ctx = _ctx; 10144 10145 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 10146 } 10147 10148 static void 10149 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10150 { 10151 struct locked_lba_range_ctx *ctx = _ctx; 10152 struct locked_lba_range_ctx *pending_ctx; 10153 struct lba_range *range, *tmp; 10154 10155 spdk_spin_lock(&bdev->internal.spinlock); 10156 /* Check if there are any pending locked ranges that overlap with this range 10157 * that was just unlocked. If there are, check that it doesn't overlap with any 10158 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 10159 * the lock process. 10160 */ 10161 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 10162 if (bdev_lba_range_overlapped(range, &ctx->range) && 10163 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 10164 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 10165 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10166 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 10167 spdk_thread_send_msg(pending_ctx->range.owner_thread, 10168 bdev_lock_lba_range_ctx_msg, pending_ctx); 10169 } 10170 } 10171 spdk_spin_unlock(&bdev->internal.spinlock); 10172 10173 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10174 free(ctx); 10175 } 10176 10177 static void 10178 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10179 struct spdk_io_channel *_ch, void *_ctx) 10180 { 10181 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10182 struct locked_lba_range_ctx *ctx = _ctx; 10183 TAILQ_HEAD(, spdk_bdev_io) io_locked; 10184 struct spdk_bdev_io *bdev_io; 10185 struct lba_range *range; 10186 10187 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10188 if (ctx->range.offset == range->offset && 10189 ctx->range.length == range->length && 10190 ctx->range.locked_ctx == range->locked_ctx) { 10191 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 10192 free(range); 10193 break; 10194 } 10195 } 10196 10197 /* Note: we should almost always be able to assert that the range specified 10198 * was found. But there are some very rare corner cases where a new channel 10199 * gets created simultaneously with a range unlock, where this function 10200 * would execute on that new channel and wouldn't have the range. 10201 * We also use this to clean up range allocations when a later allocation 10202 * fails in the locking path. 10203 * So we can't actually assert() here. 
10204 */ 10205 10206 /* Swap the locked IO into a temporary list, and then try to submit them again. 10207 * We could hyper-optimize this to only resubmit locked I/O that overlap 10208 * with the range that was just unlocked, but this isn't a performance path so 10209 * we go for simplicity here. 10210 */ 10211 TAILQ_INIT(&io_locked); 10212 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 10213 while (!TAILQ_EMPTY(&io_locked)) { 10214 bdev_io = TAILQ_FIRST(&io_locked); 10215 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 10216 bdev_io_submit(bdev_io); 10217 } 10218 10219 spdk_bdev_for_each_channel_continue(i, 0); 10220 } 10221 10222 static int 10223 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 10224 lock_range_cb cb_fn, void *cb_arg) 10225 { 10226 struct locked_lba_range_ctx *ctx; 10227 struct lba_range *range; 10228 10229 spdk_spin_lock(&bdev->internal.spinlock); 10230 /* To start the unlock process, we find the range in the bdev's locked_ranges 10231 * and remove it. This ensures new channels don't inherit the locked range. 10232 * Then we will send a message to each channel to remove the range from its 10233 * per-channel list. 10234 */ 10235 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 10236 if (range->offset == offset && range->length == length && 10237 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 10238 break; 10239 } 10240 } 10241 if (range == NULL) { 10242 assert(false); 10243 spdk_spin_unlock(&bdev->internal.spinlock); 10244 return -EINVAL; 10245 } 10246 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 10247 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10248 spdk_spin_unlock(&bdev->internal.spinlock); 10249 10250 ctx->cb_fn = cb_fn; 10251 ctx->cb_arg = cb_arg; 10252 10253 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10254 bdev_unlock_lba_range_cb); 10255 return 0; 10256 } 10257 10258 static int 10259 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10260 uint64_t offset, uint64_t length, 10261 lock_range_cb cb_fn, void *cb_arg) 10262 { 10263 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10264 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10265 struct lba_range *range; 10266 bool range_found = false; 10267 10268 /* Let's make sure the specified channel actually has a lock on 10269 * the specified range. Note that the range must match exactly.
10270 */ 10271 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10272 if (range->offset == offset && range->length == length && 10273 range->owner_ch == ch && range->locked_ctx == cb_arg) { 10274 range_found = true; 10275 break; 10276 } 10277 } 10278 10279 if (!range_found) { 10280 return -EINVAL; 10281 } 10282 10283 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 10284 } 10285 10286 struct bdev_quiesce_ctx { 10287 spdk_bdev_quiesce_cb cb_fn; 10288 void *cb_arg; 10289 }; 10290 10291 static void 10292 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 10293 { 10294 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10295 10296 if (quiesce_ctx->cb_fn != NULL) { 10297 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10298 } 10299 10300 free(quiesce_ctx); 10301 } 10302 10303 static void 10304 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 10305 { 10306 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10307 struct spdk_bdev_module *module = range->bdev->module; 10308 10309 if (status != 0) { 10310 if (quiesce_ctx->cb_fn != NULL) { 10311 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10312 } 10313 free(quiesce_ctx); 10314 return; 10315 } 10316 10317 spdk_spin_lock(&module->internal.spinlock); 10318 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 10319 spdk_spin_unlock(&module->internal.spinlock); 10320 10321 if (quiesce_ctx->cb_fn != NULL) { 10322 /* copy the context in case the range is unlocked by the callback */ 10323 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 10324 10325 quiesce_ctx->cb_fn = NULL; 10326 quiesce_ctx->cb_arg = NULL; 10327 10328 tmp.cb_fn(tmp.cb_arg, status); 10329 } 10330 /* quiesce_ctx will be freed on unquiesce */ 10331 } 10332 10333 static int 10334 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10335 uint64_t offset, uint64_t length, 10336 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 10337 bool unquiesce) 10338 { 10339 struct bdev_quiesce_ctx *quiesce_ctx; 10340 int rc; 10341 10342 if (module != bdev->module) { 10343 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 10344 return -EINVAL; 10345 } 10346 10347 if (!bdev_io_valid_blocks(bdev, offset, length)) { 10348 return -EINVAL; 10349 } 10350 10351 if (unquiesce) { 10352 struct lba_range *range; 10353 10354 /* Make sure the specified range is actually quiesced in the specified module and 10355 * then remove it from the list. Note that the range must match exactly. 
10356 */ 10357 spdk_spin_lock(&module->internal.spinlock); 10358 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10359 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10360 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10361 break; 10362 } 10363 } 10364 spdk_spin_unlock(&module->internal.spinlock); 10365 10366 if (range == NULL) { 10367 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10368 return -EINVAL; 10369 } 10370 10371 quiesce_ctx = range->locked_ctx; 10372 quiesce_ctx->cb_fn = cb_fn; 10373 quiesce_ctx->cb_arg = cb_arg; 10374 10375 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10376 } else { 10377 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10378 if (quiesce_ctx == NULL) { 10379 return -ENOMEM; 10380 } 10381 10382 quiesce_ctx->cb_fn = cb_fn; 10383 quiesce_ctx->cb_arg = cb_arg; 10384 10385 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10386 if (rc != 0) { 10387 free(quiesce_ctx); 10388 } 10389 } 10390 10391 return rc; 10392 } 10393 10394 int 10395 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10396 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10397 { 10398 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10399 } 10400 10401 int 10402 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10403 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10404 { 10405 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10406 } 10407 10408 int 10409 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10410 uint64_t offset, uint64_t length, 10411 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10412 { 10413 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10414 } 10415 10416 int 10417 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10418 uint64_t offset, uint64_t length, 10419 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10420 { 10421 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10422 } 10423 10424 int 10425 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10426 int array_size) 10427 { 10428 if (!bdev) { 10429 return -EINVAL; 10430 } 10431 10432 if (bdev->fn_table->get_memory_domains) { 10433 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10434 } 10435 10436 return 0; 10437 } 10438 10439 struct spdk_bdev_for_each_io_ctx { 10440 void *ctx; 10441 spdk_bdev_io_fn fn; 10442 spdk_bdev_for_each_io_cb cb; 10443 }; 10444 10445 static void 10446 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10447 struct spdk_io_channel *io_ch, void *_ctx) 10448 { 10449 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10450 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10451 struct spdk_bdev_io *bdev_io; 10452 int rc = 0; 10453 10454 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10455 rc = ctx->fn(ctx->ctx, bdev_io); 10456 if (rc != 0) { 10457 break; 10458 } 10459 } 10460 10461 spdk_bdev_for_each_channel_continue(i, rc); 10462 } 10463 10464 static void 10465 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10466 { 10467 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10468 10469 ctx->cb(ctx->ctx, status); 10470 10471 free(ctx); 10472 } 10473 10474 void 10475 
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10476 spdk_bdev_for_each_io_cb cb) 10477 { 10478 struct spdk_bdev_for_each_io_ctx *ctx; 10479 10480 assert(fn != NULL && cb != NULL); 10481 10482 ctx = calloc(1, sizeof(*ctx)); 10483 if (ctx == NULL) { 10484 SPDK_ERRLOG("Failed to allocate context.\n"); 10485 cb(_ctx, -ENOMEM); 10486 return; 10487 } 10488 10489 ctx->ctx = _ctx; 10490 ctx->fn = fn; 10491 ctx->cb = cb; 10492 10493 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10494 bdev_for_each_io_done); 10495 } 10496 10497 void 10498 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10499 { 10500 spdk_for_each_channel_continue(iter->i, status); 10501 } 10502 10503 static struct spdk_bdev * 10504 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10505 { 10506 void *io_device = spdk_io_channel_iter_get_io_device(i); 10507 10508 return __bdev_from_io_dev(io_device); 10509 } 10510 10511 static void 10512 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10513 { 10514 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10515 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10516 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10517 10518 iter->i = i; 10519 iter->fn(iter, bdev, ch, iter->ctx); 10520 } 10521 10522 static void 10523 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10524 { 10525 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10526 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10527 10528 iter->i = i; 10529 iter->cpl(bdev, iter->ctx, status); 10530 10531 free(iter); 10532 } 10533 10534 void 10535 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10536 void *ctx, spdk_bdev_for_each_channel_done cpl) 10537 { 10538 struct spdk_bdev_channel_iter *iter; 10539 10540 assert(bdev != NULL && fn != NULL && ctx != NULL); 10541 10542 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10543 if (iter == NULL) { 10544 SPDK_ERRLOG("Unable to allocate iterator\n"); 10545 assert(false); 10546 return; 10547 } 10548 10549 iter->fn = fn; 10550 iter->cpl = cpl; 10551 iter->ctx = ctx; 10552 10553 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10554 iter, bdev_each_channel_cpl); 10555 } 10556 10557 static void 10558 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10559 { 10560 struct spdk_bdev_io *parent_io = cb_arg; 10561 10562 spdk_bdev_free_io(bdev_io); 10563 10564 /* Check return status of write */ 10565 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10566 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10567 } 10568 10569 static void 10570 bdev_copy_do_write(void *_bdev_io) 10571 { 10572 struct spdk_bdev_io *bdev_io = _bdev_io; 10573 int rc; 10574 10575 /* Write blocks */ 10576 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10577 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10578 bdev_io->u.bdev.iovs[0].iov_base, 10579 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10580 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10581 10582 if (rc == -ENOMEM) { 10583 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10584 } else if (rc != 0) { 10585 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10586 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10587 } 10588 } 10589 10590 static void 10591 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10592 { 10593 struct spdk_bdev_io *parent_io = cb_arg; 10594 10595 spdk_bdev_free_io(bdev_io); 10596 10597 /* Check return status of read */ 10598 if (!success) { 10599 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10600 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10601 return; 10602 } 10603 10604 /* Do write */ 10605 bdev_copy_do_write(parent_io); 10606 } 10607 10608 static void 10609 bdev_copy_do_read(void *_bdev_io) 10610 { 10611 struct spdk_bdev_io *bdev_io = _bdev_io; 10612 int rc; 10613 10614 /* Read blocks */ 10615 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10616 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10617 bdev_io->u.bdev.iovs[0].iov_base, 10618 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10619 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10620 10621 if (rc == -ENOMEM) { 10622 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10623 } else if (rc != 0) { 10624 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10625 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10626 } 10627 } 10628 10629 static void 10630 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10631 { 10632 if (!success) { 10633 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10634 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10635 return; 10636 } 10637 10638 bdev_copy_do_read(bdev_io); 10639 } 10640 10641 int 10642 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10643 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10644 spdk_bdev_io_completion_cb cb, void *cb_arg) 10645 { 10646 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10647 struct spdk_bdev_io *bdev_io; 10648 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10649 10650 if (!desc->write) { 10651 return -EBADF; 10652 } 10653 10654 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10655 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10656 SPDK_DEBUGLOG(bdev, 10657 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10658 dst_offset_blocks, src_offset_blocks, num_blocks); 10659 return -EINVAL; 10660 } 10661 10662 bdev_io = bdev_channel_get_io(channel); 10663 if (!bdev_io) { 10664 return -ENOMEM; 10665 } 10666 10667 bdev_io->internal.ch = channel; 10668 bdev_io->internal.desc = desc; 10669 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10670 10671 
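/* For a copy request, offset_blocks carries the destination LBA; the source
 * LBA is stored separately in u.bdev.copy.src_offset_blocks. */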
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10672 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10673 bdev_io->u.bdev.num_blocks = num_blocks; 10674 bdev_io->u.bdev.memory_domain = NULL; 10675 bdev_io->u.bdev.memory_domain_ctx = NULL; 10676 bdev_io->u.bdev.iovs = NULL; 10677 bdev_io->u.bdev.iovcnt = 0; 10678 bdev_io->u.bdev.md_buf = NULL; 10679 bdev_io->u.bdev.accel_sequence = NULL; 10680 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10681 10682 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 10683 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 10684 return 0; 10685 } 10686 10687 10688 /* If the copy size is large and should be split, use the generic split logic 10689 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10690 * 10691 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10692 * emulate it using regular read and write requests otherwise. 10693 */ 10694 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10695 bdev_io->internal.f.split) { 10696 bdev_io_submit(bdev_io); 10697 return 0; 10698 } 10699 10700 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10701 10702 return 0; 10703 } 10704 10705 SPDK_LOG_REGISTER_COMPONENT(bdev) 10706 10707 static void 10708 bdev_trace(void) 10709 { 10710 struct spdk_trace_tpoint_opts opts[] = { 10711 { 10712 "BDEV_IO_START", TRACE_BDEV_IO_START, 10713 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1, 10714 { 10715 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10716 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10717 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10718 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10719 } 10720 }, 10721 { 10722 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10723 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0, 10724 { 10725 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10726 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10727 } 10728 }, 10729 { 10730 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10731 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10732 { 10733 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10734 } 10735 }, 10736 { 10737 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10738 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10739 { 10740 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10741 } 10742 }, 10743 }; 10744 10745 10746 spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b'); 10747 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10748 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10749 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10750 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10751 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0); 10752 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0); 10753 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_START, OBJECT_BDEV_IO, 0); 10754 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_DONE, OBJECT_BDEV_IO, 0); 10755 } 10756 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10757
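/*
 * Illustrative usage sketch (not part of the library itself): issuing a copy
 * request with spdk_bdev_copy_blocks(). The descriptor 'desc', channel 'io_ch',
 * the offsets and the copy_done() callback are hypothetical caller-side names;
 * -ENOMEM handling with spdk_bdev_queue_io_wait() is only outlined.
 *
 *	static void
 *	copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		spdk_bdev_free_io(bdev_io);
 *		// act on 'success' here
 *	}
 *
 *	int rc = spdk_bdev_copy_blocks(desc, io_ch, dst_offset_blocks,
 *				       src_offset_blocks, num_blocks,
 *				       copy_done, NULL);
 *	if (rc == -ENOMEM) {
 *		// register a spdk_bdev_io_wait_entry and retry when notified
 *	}
 */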